35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
141 template<
typename MT1
143 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
144 ,
private MatMatMultExpr
145 ,
private Computation
173 template<
typename T1,
typename T2,
typename T3 >
174 struct IsEvaluationRequired {
175 enum { value = ( evaluateLeft || evaluateRight ) };
185 template<
typename T1,
typename T2,
typename T3 >
186 struct UseBlasKernel {
188 HasMutableDataAccess<T1>::value &&
189 HasConstDataAccess<T2>::value &&
190 HasConstDataAccess<T3>::value &&
191 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
192 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
193 IsBlasCompatible<typename T1::ElementType>::value &&
194 IsBlasCompatible<typename T2::ElementType>::value &&
195 IsBlasCompatible<typename T3::ElementType>::value &&
196 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
197 IsSame< typename T1::ElementType, typename T3::ElementType >::value };
207 template<
typename T1,
typename T2,
typename T3 >
208 struct UseVectorizedDefaultKernel {
210 !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
211 !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
212 !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
213 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
214 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
215 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
216 IntrinsicTrait<typename T1::ElementType>::addition &&
217 IntrinsicTrait<typename T1::ElementType>::subtraction &&
218 IntrinsicTrait<typename T1::ElementType>::multiplication };
250 MT1::vectorizable && MT2::vectorizable &&
256 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
257 !evaluateRight && MT2::smpAssignable };
300 :(
lhs_.columns() ) ) );
302 if(
lhs_.columns() == 0UL ||
312 const size_t knum( kend - kbegin );
313 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
315 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
317 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
319 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
337 inline ReturnType
at(
size_t i,
size_t j )
const {
338 if( i >=
lhs_.rows() ) {
341 if( j >=
rhs_.columns() ) {
364 return rhs_.columns();
394 template<
typename T >
396 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
406 template<
typename T >
408 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
418 return lhs_.isAligned() &&
rhs_.isAligned();
429 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
430 (
columns() > SMP_TDMATDMATMULT_THRESHOLD );
453 template<
typename MT
462 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
465 else if( rhs.lhs_.columns() == 0UL ) {
470 LT A(
serial( rhs.lhs_ ) );
471 RT B(
serial( rhs.rhs_ ) );
480 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
496 template<
typename MT3
499 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
502 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
503 selectSmallAssignKernel( C, A, B );
505 selectBlasAssignKernel( C, A, B );
524 template<
typename MT3
527 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
528 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
530 const size_t M( A.rows() );
531 const size_t N( B.columns() );
532 const size_t K( A.columns() );
534 for(
size_t i=0UL; i<M; ++i )
536 const size_t kbegin( ( IsUpper<MT4>::value )
537 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
539 const size_t kend( ( IsLower<MT4>::value )
540 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
544 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
545 for(
size_t j=0UL; j<N; ++j ) {
552 const size_t jbegin( ( IsUpper<MT5>::value )
553 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
555 const size_t jend( ( IsLower<MT5>::value )
556 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
560 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
561 for(
size_t j=0UL; j<jbegin; ++j ) {
565 else if( IsStrictlyUpper<MT5>::value ) {
566 reset( (~C)(i,0UL) );
568 for(
size_t j=jbegin; j<jend; ++j ) {
569 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
571 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
572 for(
size_t j=jend; j<N; ++j ) {
576 else if( IsStrictlyLower<MT5>::value ) {
577 reset( (~C)(i,N-1UL) );
581 for(
size_t k=kbegin+1UL; k<kend; ++k )
583 const size_t jbegin( ( IsUpper<MT5>::value )
584 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
586 const size_t jend( ( IsLower<MT5>::value )
587 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
591 for(
size_t j=jbegin; j<jend; ++j ) {
592 (~C)(i,j) += A(i,k) * B(k,j);
594 if( IsLower<MT5>::value ) {
595 (~C)(i,jend) = A(i,k) * B(k,jend);
617 template<
typename MT3
620 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
621 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
623 const size_t M( A.rows() );
624 const size_t N( B.columns() );
625 const size_t K( A.columns() );
627 for(
size_t j=0UL; j<N; ++j )
629 const size_t kbegin( ( IsLower<MT5>::value )
630 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
632 const size_t kend( ( IsUpper<MT5>::value )
633 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
637 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
638 for(
size_t i=0UL; i<M; ++i ) {
645 const size_t ibegin( ( IsLower<MT4>::value )
646 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
648 const size_t iend( ( IsUpper<MT4>::value )
649 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
653 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
654 for(
size_t i=0UL; i<ibegin; ++i ) {
658 else if( IsStrictlyLower<MT4>::value ) {
659 reset( (~C)(0UL,j) );
661 for(
size_t i=ibegin; i<iend; ++i ) {
662 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
664 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
665 for(
size_t i=iend; i<M; ++i ) {
669 else if( IsStrictlyUpper<MT4>::value ) {
670 reset( (~C)(M-1UL,j) );
674 for(
size_t k=kbegin+1UL; k<kend; ++k )
676 const size_t ibegin( ( IsLower<MT4>::value )
677 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
679 const size_t iend( ( IsUpper<MT4>::value )
680 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
684 for(
size_t i=ibegin; i<iend; ++i ) {
685 (~C)(i,j) += A(i,k) * B(k,j);
687 if( IsUpper<MT4>::value ) {
688 (~C)(iend,j) = A(iend,k) * B(k,j);
710 template<
typename MT3
713 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
714 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
716 const size_t M( A.rows() );
717 const size_t N( B.columns() );
719 const size_t block( BLOCK_SIZE );
721 for(
size_t ii=0UL; ii<M; ii+=block ) {
722 const size_t iend(
min( M, ii+block ) );
723 for(
size_t jj=0UL; jj<N; jj+=block ) {
724 const size_t jend(
min( N, jj+block ) );
725 for(
size_t i=ii; i<iend; ++i )
727 const size_t jbegin( ( IsUpper<MT4>::value )
728 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
730 const size_t jpos( ( IsLower<MT4>::value )
731 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
734 if( IsUpper<MT4>::value ) {
735 for(
size_t j=jj; j<jbegin; ++j ) {
739 for(
size_t j=jbegin; j<jpos; ++j ) {
740 (~C)(i,j) = A(i,j) * B(j,j);
742 if( IsLower<MT4>::value ) {
743 for(
size_t j=jpos; j<jend; ++j ) {
768 template<
typename MT3
771 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
772 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
774 const size_t M( A.rows() );
775 const size_t N( B.columns() );
777 for(
size_t j=0UL; j<N; ++j )
779 const size_t ibegin( ( IsLower<MT4>::value )
780 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
782 const size_t iend( ( IsUpper<MT4>::value )
783 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
787 if( IsLower<MT4>::value ) {
788 for(
size_t i=0UL; i<ibegin; ++i ) {
792 for(
size_t i=ibegin; i<iend; ++i ) {
793 (~C)(i,j) = A(i,j) * B(j,j);
795 if( IsUpper<MT4>::value ) {
796 for(
size_t i=iend; i<M; ++i ) {
819 template<
typename MT3
822 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
823 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
825 const size_t M( A.rows() );
826 const size_t N( B.columns() );
828 for(
size_t i=0UL; i<M; ++i )
830 const size_t jbegin( ( IsUpper<MT5>::value )
831 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
833 const size_t jend( ( IsLower<MT5>::value )
834 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
838 if( IsUpper<MT5>::value ) {
839 for(
size_t j=0UL; j<jbegin; ++j ) {
843 for(
size_t j=jbegin; j<jend; ++j ) {
844 (~C)(i,j) = A(i,i) * B(i,j);
846 if( IsLower<MT5>::value ) {
847 for(
size_t j=jend; j<N; ++j ) {
870 template<
typename MT3
873 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
874 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
876 const size_t M( A.rows() );
877 const size_t N( B.columns() );
879 const size_t block( BLOCK_SIZE );
881 for(
size_t jj=0UL; jj<N; jj+=block ) {
882 const size_t jend(
min( N, jj+block ) );
883 for(
size_t ii=0UL; ii<M; ii+=block ) {
884 const size_t iend(
min( M, ii+block ) );
885 for(
size_t j=jj; j<jend; ++j )
887 const size_t ibegin( ( IsLower<MT5>::value )
888 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
890 const size_t ipos( ( IsUpper<MT5>::value )
891 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
894 if( IsLower<MT5>::value ) {
895 for(
size_t i=ii; i<ibegin; ++i ) {
899 for(
size_t i=ibegin; i<ipos; ++i ) {
900 (~C)(i,j) = A(i,i) * B(i,j);
902 if( IsUpper<MT5>::value ) {
903 for(
size_t i=ipos; i<iend; ++i ) {
928 template<
typename MT3
931 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
932 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
936 for(
size_t i=0UL; i<A.rows(); ++i ) {
937 C(i,i) = A(i,i) * B(i,i);
957 template<
typename MT3
960 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
961 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
963 selectDefaultAssignKernel( ~C, A, B );
983 template<
typename MT3
986 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
987 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
989 typedef IntrinsicTrait<ElementType> IT;
991 const size_t M( A.rows() );
992 const size_t N( B.columns() );
993 const size_t K( A.columns() );
995 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
997 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
1003 for(
size_t i=0UL; i<M; ++i )
1005 const size_t kbegin( ( IsUpper<MT4>::value )
1006 ?( ( IsLower<MT5>::value )
1007 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1008 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1009 :( IsLower<MT5>::value ? j : 0UL ) );
1010 const size_t kend( ( IsLower<MT4>::value )
1011 ?( ( IsUpper<MT5>::value )
1012 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
1013 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1014 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
1016 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1018 for(
size_t k=kbegin; k<kend; ++k ) {
1019 const IntrinsicType a1(
set( A(i,k) ) );
1020 xmm1 = xmm1 + a1 * B.load(k,j );
1021 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1022 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1023 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1024 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
1025 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
1026 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
1027 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
1030 (~C).store( i, j , xmm1 );
1031 (~C).store( i, j+
IT::size , xmm2 );
1032 (~C).store( i, j+
IT::size*2UL, xmm3 );
1033 (~C).store( i, j+
IT::size*3UL, xmm4 );
1034 (~C).store( i, j+
IT::size*4UL, xmm5 );
1035 (~C).store( i, j+
IT::size*5UL, xmm6 );
1036 (~C).store( i, j+
IT::size*6UL, xmm7 );
1037 (~C).store( i, j+
IT::size*7UL, xmm8 );
1045 for( ; (i+2UL) <= M; i+=2UL )
1047 const size_t kbegin( ( IsUpper<MT4>::value )
1048 ?( ( IsLower<MT5>::value )
1049 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1050 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1051 :( IsLower<MT5>::value ? j : 0UL ) );
1052 const size_t kend( ( IsLower<MT4>::value )
1053 ?( ( IsUpper<MT5>::value )
1054 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
1055 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1056 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
1058 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1060 for(
size_t k=kbegin; k<kend; ++k ) {
1061 const IntrinsicType a1(
set( A(i ,k) ) );
1062 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1063 const IntrinsicType b1( B.load(k,j ) );
1064 const IntrinsicType b2( B.load(k,j+
IT::size ) );
1065 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
1066 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
1067 xmm1 = xmm1 + a1 * b1;
1068 xmm2 = xmm2 + a1 * b2;
1069 xmm3 = xmm3 + a1 * b3;
1070 xmm4 = xmm4 + a1 * b4;
1071 xmm5 = xmm5 + a2 * b1;
1072 xmm6 = xmm6 + a2 * b2;
1073 xmm7 = xmm7 + a2 * b3;
1074 xmm8 = xmm8 + a2 * b4;
1077 (~C).store( i , j , xmm1 );
1078 (~C).store( i , j+
IT::size , xmm2 );
1079 (~C).store( i , j+
IT::size*2UL, xmm3 );
1080 (~C).store( i , j+
IT::size*3UL, xmm4 );
1081 (~C).store( i+1UL, j , xmm5 );
1082 (~C).store( i+1UL, j+
IT::size , xmm6 );
1083 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 );
1084 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 );
1089 const size_t kbegin( ( IsUpper<MT4>::value )
1090 ?( ( IsLower<MT5>::value )
1091 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1092 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1093 :( IsLower<MT5>::value ? j : 0UL ) );
1094 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
1096 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1098 for(
size_t k=kbegin; k<kend; ++k ) {
1099 const IntrinsicType a1(
set( A(i,k) ) );
1100 xmm1 = xmm1 + a1 * B.load(k,j );
1101 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1102 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1103 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1106 (~C).store( i, j , xmm1 );
1107 (~C).store( i, j+
IT::size , xmm2 );
1108 (~C).store( i, j+
IT::size*2UL, xmm3 );
1109 (~C).store( i, j+
IT::size*3UL, xmm4 );
1117 for( ; (i+2UL) <= M; i+=2UL )
1119 const size_t kbegin( ( IsUpper<MT4>::value )
1120 ?( ( IsLower<MT5>::value )
1121 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1122 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1123 :( IsLower<MT5>::value ? j : 0UL ) );
1124 const size_t kend( ( IsLower<MT4>::value )
1125 ?( ( IsUpper<MT5>::value )
1126 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
1127 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1128 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
1130 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1132 for(
size_t k=kbegin; k<kend; ++k ) {
1133 const IntrinsicType a1(
set( A(i ,k) ) );
1134 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1135 const IntrinsicType b1( B.load(k,j ) );
1136 const IntrinsicType b2( B.load(k,j+
IT::size) );
1137 xmm1 = xmm1 + a1 * b1;
1138 xmm2 = xmm2 + a1 * b2;
1139 xmm3 = xmm3 + a2 * b1;
1140 xmm4 = xmm4 + a2 * b2;
1143 (~C).store( i , j , xmm1 );
1144 (~C).store( i , j+
IT::size, xmm2 );
1145 (~C).store( i+1UL, j , xmm3 );
1146 (~C).store( i+1UL, j+
IT::size, xmm4 );
1151 const size_t kbegin( ( IsUpper<MT4>::value )
1152 ?( ( IsLower<MT5>::value )
1153 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1154 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1155 :( IsLower<MT5>::value ? j : 0UL ) );
1156 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
1158 IntrinsicType xmm1, xmm2;
1160 for(
size_t k=kbegin; k<kend; ++k ) {
1161 const IntrinsicType a1(
set( A(i,k) ) );
1162 xmm1 = xmm1 + a1 * B.load(k,j );
1163 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
1166 (~C).store( i, j , xmm1 );
1175 for( ; (i+2UL) <= M; i+=2UL )
1177 const size_t kbegin( ( IsUpper<MT4>::value )
1178 ?( ( IsLower<MT5>::value )
1179 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1180 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1181 :( IsLower<MT5>::value ? j : 0UL ) );
1182 const size_t kend( ( IsLower<MT4>::value )
1183 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1186 IntrinsicType xmm1, xmm2;
1188 for(
size_t k=kbegin; k<kend; ++k ) {
1189 const IntrinsicType b1( B.load(k,j) );
1190 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1191 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1194 (~C).store( i , j, xmm1 );
1195 (~C).store( i+1UL, j, xmm2 );
1200 const size_t kbegin( ( IsUpper<MT4>::value )
1201 ?( ( IsLower<MT5>::value )
1202 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1203 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1204 :( IsLower<MT5>::value ? j : 0UL ) );
1208 for(
size_t k=kbegin; k<K; ++k ) {
1209 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1212 (~C).store( i, j, xmm1 );
1216 for( ; remainder && j<N; ++j )
1220 for( ; (i+2UL) <= M; i+=2UL )
1222 const size_t kbegin( ( IsUpper<MT4>::value )
1223 ?( ( IsLower<MT5>::value )
1224 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1225 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1226 :( IsLower<MT5>::value ? j : 0UL ) );
1227 const size_t kend( ( IsLower<MT4>::value )
1228 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1234 for(
size_t k=kbegin; k<kend; ++k ) {
1235 value1 += A(i ,k) * B(k,j);
1236 value2 += A(i+1UL,k) * B(k,j);
1239 (~C)(i ,j) = value1;
1240 (~C)(i+1UL,j) = value2;
1245 const size_t kbegin( ( IsUpper<MT4>::value )
1246 ?( ( IsLower<MT5>::value )
1247 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1248 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1249 :( IsLower<MT5>::value ? j : 0UL ) );
1253 for(
size_t k=kbegin; k<K; ++k ) {
1254 value += A(i,k) * B(k,j);
1279 template<
typename MT3
1282 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1283 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1285 typedef IntrinsicTrait<ElementType> IT;
1287 const size_t M( A.rows() );
1288 const size_t N( B.columns() );
1289 const size_t K( A.columns() );
1291 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1293 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
1299 for(
size_t j=0UL; j<N; ++j )
1301 const size_t kbegin( ( IsLower<MT5>::value )
1302 ?( ( IsUpper<MT4>::value )
1303 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1304 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1305 :( IsUpper<MT4>::value ? i : 0UL ) );
1306 const size_t kend( ( IsUpper<MT5>::value )
1307 ?( ( IsLower<MT4>::value )
1308 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1309 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1310 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
1312 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1314 for(
size_t k=kbegin; k<kend; ++k ) {
1315 const IntrinsicType b1(
set( B(k,j) ) );
1316 xmm1 = xmm1 + A.load(i ,k) * b1;
1317 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1318 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1319 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1320 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
1321 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
1322 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
1323 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
1326 (~C).store( i , j, xmm1 );
1327 (~C).store( i+
IT::size , j, xmm2 );
1328 (~C).store( i+
IT::size*2UL, j, xmm3 );
1329 (~C).store( i+
IT::size*3UL, j, xmm4 );
1330 (~C).store( i+
IT::size*4UL, j, xmm5 );
1331 (~C).store( i+
IT::size*5UL, j, xmm6 );
1332 (~C).store( i+
IT::size*6UL, j, xmm7 );
1333 (~C).store( i+
IT::size*7UL, j, xmm8 );
1341 for( ; (j+2UL) <= N; j+=2UL )
1343 const size_t kbegin( ( IsLower<MT5>::value )
1344 ?( ( IsUpper<MT4>::value )
1345 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1346 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1347 :( IsUpper<MT4>::value ? i : 0UL ) );
1348 const size_t kend( ( IsUpper<MT5>::value )
1349 ?( ( IsLower<MT4>::value )
1350 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1351 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1352 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
1354 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1356 for(
size_t k=kbegin; k<kend; ++k ) {
1357 const IntrinsicType a1( A.load(i ,k) );
1358 const IntrinsicType a2( A.load(i+
IT::size ,k) );
1359 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
1360 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
1361 const IntrinsicType b1(
set( B(k,j ) ) );
1362 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1363 xmm1 = xmm1 + a1 * b1;
1364 xmm2 = xmm2 + a2 * b1;
1365 xmm3 = xmm3 + a3 * b1;
1366 xmm4 = xmm4 + a4 * b1;
1367 xmm5 = xmm5 + a1 * b2;
1368 xmm6 = xmm6 + a2 * b2;
1369 xmm7 = xmm7 + a3 * b2;
1370 xmm8 = xmm8 + a4 * b2;
1373 (~C).store( i , j , xmm1 );
1374 (~C).store( i+
IT::size , j , xmm2 );
1375 (~C).store( i+
IT::size*2UL, j , xmm3 );
1376 (~C).store( i+
IT::size*3UL, j , xmm4 );
1377 (~C).store( i , j+1UL, xmm5 );
1378 (~C).store( i+
IT::size , j+1UL, xmm6 );
1379 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 );
1380 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 );
1385 const size_t kbegin( ( IsLower<MT5>::value )
1386 ?( ( IsUpper<MT4>::value )
1387 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1388 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1389 :( IsUpper<MT4>::value ? i : 0UL ) );
1390 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
1392 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1394 for(
size_t k=kbegin; k<kend; ++k ) {
1395 const IntrinsicType b1(
set( B(k,j) ) );
1396 xmm1 = xmm1 + A.load(i ,k) * b1;
1397 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1398 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1399 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1402 (~C).store( i , j, xmm1 );
1403 (~C).store( i+
IT::size , j, xmm2 );
1404 (~C).store( i+
IT::size*2UL, j, xmm3 );
1405 (~C).store( i+
IT::size*3UL, j, xmm4 );
1413 for( ; (j+2UL) <= N; j+=2UL )
1415 const size_t kbegin( ( IsLower<MT5>::value )
1416 ?( ( IsUpper<MT4>::value )
1417 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1418 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1419 :( IsUpper<MT4>::value ? i : 0UL ) );
1420 const size_t kend( ( IsUpper<MT5>::value )
1421 ?( ( IsLower<MT4>::value )
1422 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1423 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1424 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
1426 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1428 for(
size_t k=kbegin; k<kend; ++k ) {
1429 const IntrinsicType a1( A.load(i ,k) );
1430 const IntrinsicType a2( A.load(i+
IT::size,k) );
1431 const IntrinsicType b1(
set( B(k,j ) ) );
1432 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1433 xmm1 = xmm1 + a1 * b1;
1434 xmm2 = xmm2 + a2 * b1;
1435 xmm3 = xmm3 + a1 * b2;
1436 xmm4 = xmm4 + a2 * b2;
1439 (~C).store( i , j , xmm1 );
1440 (~C).store( i+
IT::size, j , xmm2 );
1441 (~C).store( i , j+1UL, xmm3 );
1442 (~C).store( i+
IT::size, j+1UL, xmm4 );
1447 const size_t kbegin( ( IsLower<MT5>::value )
1448 ?( ( IsUpper<MT4>::value )
1449 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1450 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1451 :( IsUpper<MT4>::value ? i : 0UL ) );
1452 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
1454 IntrinsicType xmm1, xmm2;
1456 for(
size_t k=kbegin; k<kend; ++k ) {
1457 const IntrinsicType b1(
set( B(k,j) ) );
1458 xmm1 = xmm1 + A.load(i ,k) * b1;
1459 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
1462 (~C).store( i , j, xmm1 );
1471 for( ; (j+2UL) <= N; j+=2UL )
1473 const size_t kbegin( ( IsLower<MT5>::value )
1474 ?( ( IsUpper<MT4>::value )
1475 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1476 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1477 :( IsUpper<MT4>::value ? i : 0UL ) );
1478 const size_t kend( ( IsUpper<MT5>::value )
1479 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1482 IntrinsicType xmm1, xmm2;
1484 for(
size_t k=kbegin; k<kend; ++k ) {
1485 const IntrinsicType a1( A.load(i,k) );
1486 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1487 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1490 (~C).store( i, j , xmm1 );
1491 (~C).store( i, j+1UL, xmm2 );
1496 const size_t kbegin( ( IsLower<MT5>::value )
1497 ?( ( IsUpper<MT4>::value )
1498 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1499 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1500 :( IsUpper<MT4>::value ? i : 0UL ) );
1504 for(
size_t k=kbegin; k<K; ++k ) {
1505 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1508 (~C).store( i, j, xmm1 );
1512 for( ; remainder && i<M; ++i )
1516 for( ; (j+2UL) <= N; j+=2UL )
1518 const size_t kbegin( ( IsLower<MT5>::value )
1519 ?( ( IsUpper<MT4>::value )
1520 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1521 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1522 :( IsUpper<MT4>::value ? i : 0UL ) );
1523 const size_t kend( ( IsUpper<MT5>::value )
1524 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1530 for(
size_t k=kbegin; k<kend; ++k ) {
1531 value1 += A(i,k) * B(k,j );
1532 value2 += A(i,k) * B(k,j+1UL);
1535 (~C)(i,j ) = value1;
1536 (~C)(i,j+1UL) = value2;
1541 const size_t kbegin( ( IsLower<MT5>::value )
1542 ?( ( IsUpper<MT4>::value )
1543 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1544 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1545 :( IsUpper<MT4>::value ? i : 0UL ) );
1549 for(
size_t k=kbegin; k<K; ++k ) {
1550 value += A(i,k) * B(k,j);
1574 template<
typename MT3
1577 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1578 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1580 selectDefaultAssignKernel( C, A, B );
1600 template<
typename MT3
1603 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1604 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1606 typedef IntrinsicTrait<ElementType> IT;
1608 const size_t M( A.rows() );
1609 const size_t N( B.columns() );
1610 const size_t K( A.columns() );
1612 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
1614 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
1616 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
1618 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1621 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
1623 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
1625 for(
size_t i=ii; i<iend; ++i ) {
1626 for(
size_t j=jj; j<jend; ++j ) {
1631 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
1633 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
1645 for( ; (i+2UL) <= iend; i+=2UL )
1647 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1648 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1649 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1650 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
1652 IntrinsicType xmm1( (~C).load(i ,j ) );
1653 IntrinsicType xmm2( (~C).load(i ,j1) );
1654 IntrinsicType xmm3( (~C).load(i ,j2) );
1655 IntrinsicType xmm4( (~C).load(i ,j3) );
1656 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1657 IntrinsicType xmm6( (~C).load(i+1UL,j1) );
1658 IntrinsicType xmm7( (~C).load(i+1UL,j2) );
1659 IntrinsicType xmm8( (~C).load(i+1UL,j3) );
1661 for(
size_t k=kbegin; k<kend; ++k ) {
1662 const IntrinsicType a1(
set( A(i ,k) ) );
1663 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1664 const IntrinsicType b1( B.load(k,j ) );
1665 const IntrinsicType b2( B.load(k,j1) );
1666 const IntrinsicType b3( B.load(k,j2) );
1667 const IntrinsicType b4( B.load(k,j3) );
1668 xmm1 = xmm1 + a1 * b1;
1669 xmm2 = xmm2 + a1 * b2;
1670 xmm3 = xmm3 + a1 * b3;
1671 xmm4 = xmm4 + a1 * b4;
1672 xmm5 = xmm5 + a2 * b1;
1673 xmm6 = xmm6 + a2 * b2;
1674 xmm7 = xmm7 + a2 * b3;
1675 xmm8 = xmm8 + a2 * b4;
1678 (~C).store( i , j , xmm1 );
1679 (~C).store( i , j1, xmm2 );
1680 (~C).store( i , j2, xmm3 );
1681 (~C).store( i , j3, xmm4 );
1682 (~C).store( i+1UL, j , xmm5 );
1683 (~C).store( i+1UL, j1, xmm6 );
1684 (~C).store( i+1UL, j2, xmm7 );
1685 (~C).store( i+1UL, j3, xmm8 );
1690 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1691 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1692 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1693 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
1695 IntrinsicType xmm1( (~C).load(i,j ) );
1696 IntrinsicType xmm2( (~C).load(i,j1) );
1697 IntrinsicType xmm3( (~C).load(i,j2) );
1698 IntrinsicType xmm4( (~C).load(i,j3) );
1700 for(
size_t k=kbegin; k<kend; ++k ) {
1701 const IntrinsicType a1(
set( A(i,k) ) );
1702 xmm1 = xmm1 + a1 * B.load(k,j );
1703 xmm2 = xmm2 + a1 * B.load(k,j1);
1704 xmm3 = xmm3 + a1 * B.load(k,j2);
1705 xmm4 = xmm4 + a1 * B.load(k,j3);
1708 (~C).store( i, j , xmm1 );
1709 (~C).store( i, j1, xmm2 );
1710 (~C).store( i, j2, xmm3 );
1711 (~C).store( i, j3, xmm4 );
1721 for( ; (i+4UL) <= iend; i+=4UL )
1723 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1724 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1725 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1726 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1728 IntrinsicType xmm1( (~C).load(i ,j ) );
1729 IntrinsicType xmm2( (~C).load(i ,j1) );
1730 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1731 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1732 IntrinsicType xmm5( (~C).load(i+2UL,j ) );
1733 IntrinsicType xmm6( (~C).load(i+2UL,j1) );
1734 IntrinsicType xmm7( (~C).load(i+3UL,j ) );
1735 IntrinsicType xmm8( (~C).load(i+3UL,j1) );
1737 for(
size_t k=kbegin; k<kend; ++k ) {
1738 const IntrinsicType a1(
set( A(i ,k) ) );
1739 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1740 const IntrinsicType a3(
set( A(i+2UL,k) ) );
1741 const IntrinsicType a4(
set( A(i+3UL,k) ) );
1742 const IntrinsicType b1( B.load(k,j ) );
1743 const IntrinsicType b2( B.load(k,j1) );
1744 xmm1 = xmm1 + a1 * b1;
1745 xmm2 = xmm2 + a1 * b2;
1746 xmm3 = xmm3 + a2 * b1;
1747 xmm4 = xmm4 + a2 * b2;
1748 xmm5 = xmm5 + a3 * b1;
1749 xmm6 = xmm6 + a3 * b2;
1750 xmm7 = xmm7 + a4 * b1;
1751 xmm8 = xmm8 + a4 * b2;
1754 (~C).store( i , j , xmm1 );
1755 (~C).store( i , j1, xmm2 );
1756 (~C).store( i+1UL, j , xmm3 );
1757 (~C).store( i+1UL, j1, xmm4 );
1758 (~C).store( i+2UL, j , xmm5 );
1759 (~C).store( i+2UL, j1, xmm6 );
1760 (~C).store( i+3UL, j , xmm7 );
1761 (~C).store( i+3UL, j1, xmm8 );
1764 for( ; (i+2UL) <= iend; i+=2UL )
1766 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1767 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1768 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1769 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1771 IntrinsicType xmm1( (~C).load(i ,j ) );
1772 IntrinsicType xmm2( (~C).load(i ,j1) );
1773 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1774 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1776 for(
size_t k=kbegin; k<kend; ++k ) {
1777 const IntrinsicType a1(
set( A(i ,k) ) );
1778 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1779 const IntrinsicType b1( B.load(k,j ) );
1780 const IntrinsicType b2( B.load(k,j1) );
1781 xmm1 = xmm1 + a1 * b1;
1782 xmm2 = xmm2 + a1 * b2;
1783 xmm3 = xmm3 + a2 * b1;
1784 xmm4 = xmm4 + a2 * b2;
1787 (~C).store( i , j , xmm1 );
1788 (~C).store( i , j1, xmm2 );
1789 (~C).store( i+1UL, j , xmm3 );
1790 (~C).store( i+1UL, j1, xmm4 );
1795 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1796 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1797 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1798 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1800 IntrinsicType xmm1( (~C).load(i,j ) );
1801 IntrinsicType xmm2( (~C).load(i,j1) );
1803 for(
size_t k=kbegin; k<kend; ++k ) {
1804 const IntrinsicType a1(
set( A(i,k) ) );
1805 xmm1 = xmm1 + a1 * B.load(k,j );
1806 xmm2 = xmm2 + a1 * B.load(k,j1);
1809 (~C).store( i, j , xmm1 );
1810 (~C).store( i, j1, xmm2 );
1816 for(
size_t i=ii; i<iend; ++i )
1818 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1819 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1820 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1821 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
1823 IntrinsicType xmm1( (~C).load(i,j) );
1825 for(
size_t k=kbegin; k<kend; ++k ) {
1826 const IntrinsicType a1(
set( A(i,k) ) );
1827 xmm1 = xmm1 + a1 * B.load(k,j);
1830 (~C).store( i, j, xmm1 );
1834 for( ; remainder && j<jend; ++j )
1836 for(
size_t i=ii; i<iend; ++i )
1838 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1839 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1840 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1841 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
1843 ElementType value( (~C)(i,j) );
1845 for(
size_t k=kbegin; k<kend; ++k ) {
1846 value += A(i,k) * B(k,j);
// Large-matrix assignment kernel for a column-major target (DenseMatrix<MT3,true>).
// Only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The work is tiled over (ii, jj, kk) blocks; inside a block C is updated with
// IntrinsicType accumulators: values of A are read with A.load(), single
// elements of B are wrapped with set(), and results go back via (~C).store().
//   C - target matrix (loaded, accumulated into and stored)
//   A - left-hand side operand
//   B - right-hand side operand
1874 template<
typename MT3
1877 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1878 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// IT::size is the row stride unit used in the kend bounds of the tiles below.
1880 typedef IntrinsicTrait<ElementType> IT;
// Extents: C is M x N, K is the shared inner dimension.
1882 const size_t M( A.rows() );
1883 const size_t N( B.columns() );
1884 const size_t K( A.columns() );
// A scalar clean-up pass over rows is needed unless both C and A are padded
// (IsPadded semantics are defined outside this block).
1886 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Outer tiling over rows of C.
1888 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
1890 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// iend masked with size_t(-IT::size) when a remainder pass exists; this rounds
// down to a multiple of IT::size provided IT::size is a power of two.
1892 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Tiling over columns of C.
1895 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
1897 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Sweep over every (i,j) of the current C block before accumulating.
// NOTE(review): the loop body is not visible here; presumably it resets
// C(i,j) so that the accumulation below implements plain assignment - confirm.
1899 for(
size_t j=jj; j<jend; ++j ) {
1900 for(
size_t i=ii; i<iend; ++i ) {
// Tiling over the inner dimension; ktmp is the exclusive end of the k block.
1905 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
1907 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Tile: row offsets i, i1, i2, i3 times columns j and j+1 (8 accumulators).
// NOTE(review): i1/i2/i3 are defined outside the visible lines; presumably
// i + IT::size, i + 2*IT::size, i + 3*IT::size - confirm.
1919 for( ; (j+2UL) <= jend; j+=2UL )
// k range: clipped to [kk, ktmp) and narrowed further when A is flagged
// upper/lower or B is flagged lower/upper.
1921 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1922 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1923 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1924 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Start from the current contents of C.
1926 IntrinsicType xmm1( (~C).load(i ,j ) );
1927 IntrinsicType xmm2( (~C).load(i1,j ) );
1928 IntrinsicType xmm3( (~C).load(i2,j ) );
1929 IntrinsicType xmm4( (~C).load(i3,j ) );
1930 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1931 IntrinsicType xmm6( (~C).load(i1,j+1UL) );
1932 IntrinsicType xmm7( (~C).load(i2,j+1UL) );
1933 IntrinsicType xmm8( (~C).load(i3,j+1UL) );
// Accumulate the products of four loads of A with two set() values of B per k.
1935 for(
size_t k=kbegin; k<kend; ++k ) {
1936 const IntrinsicType a1( A.load(i ,k) );
1937 const IntrinsicType a2( A.load(i1,k) );
1938 const IntrinsicType a3( A.load(i2,k) );
1939 const IntrinsicType a4( A.load(i3,k) );
1940 const IntrinsicType b1(
set( B(k,j ) ) );
1941 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1942 xmm1 = xmm1 + a1 * b1;
1943 xmm2 = xmm2 + a2 * b1;
1944 xmm3 = xmm3 + a3 * b1;
1945 xmm4 = xmm4 + a4 * b1;
1946 xmm5 = xmm5 + a1 * b2;
1947 xmm6 = xmm6 + a2 * b2;
1948 xmm7 = xmm7 + a3 * b2;
1949 xmm8 = xmm8 + a4 * b2;
// Write the eight accumulators back.
1952 (~C).store( i , j , xmm1 );
1953 (~C).store( i1, j , xmm2 );
1954 (~C).store( i2, j , xmm3 );
1955 (~C).store( i3, j , xmm4 );
1956 (~C).store( i , j+1UL, xmm5 );
1957 (~C).store( i1, j+1UL, xmm6 );
1958 (~C).store( i2, j+1UL, xmm7 );
1959 (~C).store( i3, j+1UL, xmm8 );
// Tile: row offsets i..i3 times the single column j (4 accumulators).
1964 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1965 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1966 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1967 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1969 IntrinsicType xmm1( (~C).load(i ,j) );
1970 IntrinsicType xmm2( (~C).load(i1,j) );
1971 IntrinsicType xmm3( (~C).load(i2,j) );
1972 IntrinsicType xmm4( (~C).load(i3,j) );
1974 for(
size_t k=kbegin; k<kend; ++k ) {
1975 const IntrinsicType b1(
set( B(k,j) ) );
1976 xmm1 = xmm1 + A.load(i ,k) * b1;
1977 xmm2 = xmm2 + A.load(i1,k) * b1;
1978 xmm3 = xmm3 + A.load(i2,k) * b1;
1979 xmm4 = xmm4 + A.load(i3,k) * b1;
1982 (~C).store( i , j, xmm1 );
1983 (~C).store( i1, j, xmm2 );
1984 (~C).store( i2, j, xmm3 );
1985 (~C).store( i3, j, xmm4 );
// Tile: row offsets i, i1 times four columns j..j+3 (8 accumulators).
1995 for( ; (j+4UL) <= jend; j+=4UL )
1997 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1998 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1999 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2000 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
2002 IntrinsicType xmm1( (~C).load(i ,j ) );
2003 IntrinsicType xmm2( (~C).load(i1,j ) );
2004 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2005 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2006 IntrinsicType xmm5( (~C).load(i ,j+2UL) );
2007 IntrinsicType xmm6( (~C).load(i1,j+2UL) );
2008 IntrinsicType xmm7( (~C).load(i ,j+3UL) );
2009 IntrinsicType xmm8( (~C).load(i1,j+3UL) );
2011 for(
size_t k=kbegin; k<kend; ++k ) {
2012 const IntrinsicType a1( A.load(i ,k) );
2013 const IntrinsicType a2( A.load(i1,k) );
2014 const IntrinsicType b1(
set( B(k,j ) ) );
2015 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2016 const IntrinsicType b3(
set( B(k,j+2UL) ) );
2017 const IntrinsicType b4(
set( B(k,j+3UL) ) );
2018 xmm1 = xmm1 + a1 * b1;
2019 xmm2 = xmm2 + a2 * b1;
2020 xmm3 = xmm3 + a1 * b2;
2021 xmm4 = xmm4 + a2 * b2;
2022 xmm5 = xmm5 + a1 * b3;
2023 xmm6 = xmm6 + a2 * b3;
2024 xmm7 = xmm7 + a1 * b4;
2025 xmm8 = xmm8 + a2 * b4;
2028 (~C).store( i , j , xmm1 );
2029 (~C).store( i1, j , xmm2 );
2030 (~C).store( i , j+1UL, xmm3 );
2031 (~C).store( i1, j+1UL, xmm4 );
2032 (~C).store( i , j+2UL, xmm5 );
2033 (~C).store( i1, j+2UL, xmm6 );
2034 (~C).store( i , j+3UL, xmm7 );
2035 (~C).store( i1, j+3UL, xmm8 );
// Tile: row offsets i, i1 times two columns j, j+1 (4 accumulators).
2038 for( ; (j+2UL) <= jend; j+=2UL )
2040 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2041 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2042 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2043 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2045 IntrinsicType xmm1( (~C).load(i ,j ) );
2046 IntrinsicType xmm2( (~C).load(i1,j ) );
2047 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2048 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2050 for(
size_t k=kbegin; k<kend; ++k ) {
2051 const IntrinsicType a1( A.load(i ,k) );
2052 const IntrinsicType a2( A.load(i1,k) );
2053 const IntrinsicType b1(
set( B(k,j ) ) );
2054 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2055 xmm1 = xmm1 + a1 * b1;
2056 xmm2 = xmm2 + a2 * b1;
2057 xmm3 = xmm3 + a1 * b2;
2058 xmm4 = xmm4 + a2 * b2;
2061 (~C).store( i , j , xmm1 );
2062 (~C).store( i1, j , xmm2 );
2063 (~C).store( i , j+1UL, xmm3 );
2064 (~C).store( i1, j+1UL, xmm4 );
// Tile: row offsets i, i1 times the single column j (2 accumulators).
2069 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2070 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2071 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2072 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2074 IntrinsicType xmm1( (~C).load(i ,j) );
2075 IntrinsicType xmm2( (~C).load(i1,j) );
2077 for(
size_t k=kbegin; k<kend; ++k ) {
2078 const IntrinsicType b1(
set( B(k,j) ) );
2079 xmm1 = xmm1 + A.load(i ,k) * b1;
2080 xmm2 = xmm2 + A.load(i1,k) * b1;
2083 (~C).store( i , j, xmm1 );
2084 (~C).store( i1, j, xmm2 );
// Tile: a single load at row offset i for every column of the block.
2090 for(
size_t j=jj; j<jend; ++j )
2092 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2093 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2094 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
2095 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2097 IntrinsicType xmm1( (~C).load(i,j) );
2099 for(
size_t k=kbegin; k<kend; ++k ) {
2100 const IntrinsicType b1(
set( B(k,j) ) );
2101 xmm1 = xmm1 + A.load(i,k) * b1;
2104 (~C).store( i, j, xmm1 );
// Scalar clean-up for the remaining rows; runs only when 'remainder' is set.
2108 for( ; remainder && i<iend; ++i )
2110 for(
size_t j=jj; j<jend; ++j )
2112 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2113 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2114 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
2115 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
// Element-wise dot product accumulated on top of the current C(i,j).
2117 ElementType value( (~C)(i,j) );
2119 for(
size_t k=kbegin; k<kend; ++k ) {
2120 value += A(i,k) * B(k,j);
// Fallback overload of selectBlasAssignKernel: chosen when
// UseBlasKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
// It simply forwards C, A and B to selectLargeAssignKernel.
2147 template<
typename MT3
2150 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2151 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2153 selectLargeAssignKernel( C, A, B );
// BLAS-backed overload of selectBlasAssignKernel: chosen when
// UseBlasKernel<MT3,MT4,MT5> holds (EnableIf).
// Dispatches on the compile-time structure of the operands:
//   - A triangular: trmm with A applied from the left (CblasLeft)
//   - B triangular: trmm with B applied from the right (CblasRight)
//   - otherwise:    gemm on C, A and B
// NOTE(review): the statements that prepare C before each trmm call and the
// final 'else' are not visible in these lines - confirm against the full file.
2173 template<
typename MT3
2176 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2177 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Left operand is triangular; IsLower<MT4> picks CblasLower, else CblasUpper.
2181 if( IsTriangular<MT4>::value ) {
2183 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
// Right operand is triangular; IsLower<MT5> picks CblasLower, else CblasUpper.
2185 else if( IsTriangular<MT5>::value ) {
2187 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
// General case; the two scalars ET(1) and ET(0) are presumably the usual
// gemm alpha/beta factors (C = 1*A*B + 0*C) - confirm against the gemm wrapper.
2190 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix.
//   lhs - sparse target matrix (storage order SO)
//   rhs - the TDMatDMatMultExpr to evaluate
// The expression is first evaluated into a temporary and then assigned.
2210 template<
typename MT
2212 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Temporary type selected by the storage order flag SO of the target:
// SelectType picks between ResultType and OppositeType.
2216 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Evaluate the expression through serial() into the temporary ...
2228 const TmpType tmp(
serial( rhs ) );
// ... and hand the temporary to the assign overload for the target type.
2229 assign( ~lhs, tmp );
// Addition assignment of the multiplication expression to a dense matrix
// (lhs += rhs.lhs_ * rhs.rhs_).
//   lhs - dense target matrix (storage order SO)
//   rhs - the TDMatDMatMultExpr to add
2247 template<
typename MT
2249 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Degenerate case: empty target or zero inner dimension.
// NOTE(review): the body of this branch is not visible; presumably an early
// return since there is nothing to add - confirm.
2256 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Materialise both operands through serial() as LT / RT.
2260 LT A(
serial( rhs.lhs_ ) );
2261 RT B(
serial( rhs.rhs_ ) );
// Dispatch to the kernel selection for the evaluated operands.
2270 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for the addition assignment C += A * B.
// The "small" kernel is used when both operands are diagonal or when the
// number of elements of C is below TDMATDMATMULT_THRESHOLD; the BLAS
// selection path is called for everything else.
// NOTE(review): the 'else' between the two calls is not visible in these
// lines - confirm against the full file.
2286 template<
typename MT3
2289 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2291 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
2292 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2293 selectSmallAddAssignKernel( C, A, B );
2295 selectBlasAddAssignKernel( C, A, B );
// Default (non-vectorized) addition assignment C += A * B for a row-major
// target (DenseMatrix<MT3,false>) when neither A nor B is diagonal.
// Loop order is i, k, j; the innermost j loop is unrolled by two.
2314 template<
typename MT3
2317 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2318 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2320 const size_t M( A.rows() );
2321 const size_t N( B.columns() );
2322 const size_t K( A.columns() );
2324 for(
size_t i=0UL; i<M; ++i )
// k range of row i, narrowed when A is flagged upper/lower; the strictly
// triangular variants exclude the diagonal index.
// NOTE(review): the alternative branch of each conditional is not visible here.
2326 const size_t kbegin( ( IsUpper<MT4>::value )
2327 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2329 const size_t kend( ( IsLower<MT4>::value )
2330 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2334 for(
size_t k=kbegin; k<kend; ++k )
// j range of row k of B, narrowed the same way using B's flags.
2336 const size_t jbegin( ( IsUpper<MT5>::value )
2337 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2339 const size_t jend( ( IsLower<MT5>::value )
2340 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos = jbegin + (jnum rounded down to an even count): end of the paired part.
2344 const size_t jnum( jend - jbegin );
2345 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
2347 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2348 (~C)(i,j ) += A(i,k) * B(k,j );
2349 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Leftover column at jpos.
// NOTE(review): this is only valid when jnum is odd (jpos < jend); the
// guarding condition is not visible in these lines - confirm.
2352 (~C)(i,jpos) += A(i,k) * B(k,jpos);
// Default (non-vectorized) addition assignment C += A * B for a column-major
// target (DenseMatrix<MT3,true>) when neither A nor B is diagonal.
// Loop order is j, k, i; the innermost i loop is unrolled by two.
2374 template<
typename MT3
2377 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2378 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2380 const size_t M( A.rows() );
2381 const size_t N( B.columns() );
2382 const size_t K( A.columns() );
2384 for(
size_t j=0UL; j<N; ++j )
// k range of column j, narrowed when B is flagged lower/upper; the strictly
// triangular variants exclude the diagonal index.
// NOTE(review): the alternative branch of each conditional is not visible here.
2386 const size_t kbegin( ( IsLower<MT5>::value )
2387 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2389 const size_t kend( ( IsUpper<MT5>::value )
2390 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2394 for(
size_t k=kbegin; k<kend; ++k )
// i range of column k of A, narrowed the same way using A's flags.
2396 const size_t ibegin( ( IsLower<MT4>::value )
2397 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2399 const size_t iend( ( IsUpper<MT4>::value )
2400 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos = ibegin + (inum rounded down to an even count): end of the paired part.
2404 const size_t inum( iend - ibegin );
2405 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2407 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2408 (~C)(i ,j) += A(i ,k) * B(k,j);
2409 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Leftover row at ipos.
// NOTE(review): this is only valid when inum is odd (ipos < iend); the
// guarding condition is not visible in these lines - confirm.
2412 (~C)(ipos,j) += A(ipos,k) * B(k,j);
// Default addition assignment C += A * B for a row-major target
// (DenseMatrix<MT3,false>) when A is not diagonal and B is diagonal.
// With a diagonal B only B(j,j) contributes, so C(i,j) += A(i,j) * B(j,j).
// The (i,j) index space is traversed in square tiles of BLOCK_SIZE.
2434 template<
typename MT3
2437 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2438 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2440 const size_t M( A.rows() );
2441 const size_t N( B.columns() );
// Tile edge length.
2443 const size_t block( BLOCK_SIZE );
2445 for(
size_t ii=0UL; ii<M; ii+=block ) {
2446 const size_t iend(
min( M, ii+block ) );
2447 for(
size_t jj=0UL; jj<N; jj+=block ) {
2448 const size_t jend(
min( N, jj+block ) );
2449 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the tile, narrowed when A is flagged
// upper/lower (strict variants exclude the diagonal).
// NOTE(review): the alternative branch of each conditional is not visible here.
2451 const size_t jbegin( ( IsUpper<MT4>::value )
2452 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2454 const size_t jpos( ( IsLower<MT4>::value )
2455 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2458 for(
size_t j=jbegin; j<jpos; ++j ) {
2459 (~C)(i,j) += A(i,j) * B(j,j);
// Default addition assignment C += A * B for a column-major target
// (DenseMatrix<MT3,true>) when A is not diagonal and B is diagonal.
// With a diagonal B only B(j,j) contributes, so C(i,j) += A(i,j) * B(j,j).
// Columns are processed one at a time; the row loop is unrolled by two.
2482 template<
typename MT3
2485 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2486 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2488 const size_t M( A.rows() );
2489 const size_t N( B.columns() );
2491 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed when A is flagged lower/upper
// (strict variants exclude the diagonal).
// NOTE(review): the alternative branch of each conditional is not visible here.
2493 const size_t ibegin( ( IsLower<MT4>::value )
2494 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2496 const size_t iend( ( IsUpper<MT4>::value )
2497 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin + (inum rounded down to an even count): end of the paired part.
2501 const size_t inum( iend - ibegin );
2502 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2504 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2505 (~C)(i ,j) += A(i ,j) * B(j,j);
2506 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Leftover row at ipos.
// NOTE(review): this is only valid when inum is odd (ipos < iend); the
// guarding condition is not visible in these lines - confirm.
2509 (~C)(ipos,j) += A(ipos,j) * B(j,j);
// Default addition assignment C += A * B for a row-major target
// (DenseMatrix<MT3,false>) when A is diagonal and B is not diagonal.
// With a diagonal A only A(i,i) contributes, so C(i,j) += A(i,i) * B(i,j).
// Rows are processed one at a time; the column loop is unrolled by two.
2530 template<
typename MT3
2533 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2534 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2536 const size_t M( A.rows() );
2537 const size_t N( B.columns() );
2539 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when B is flagged upper/lower
// (strict variants exclude the diagonal).
// NOTE(review): the alternative branch of each conditional is not visible here.
2541 const size_t jbegin( ( IsUpper<MT5>::value )
2542 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2544 const size_t jend( ( IsLower<MT5>::value )
2545 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin + (jnum rounded down to an even count): end of the paired part.
2549 const size_t jnum( jend - jbegin );
2550 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
2552 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2553 (~C)(i,j ) += A(i,i) * B(i,j );
2554 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Leftover column at jpos.
// NOTE(review): this is only valid when jnum is odd (jpos < jend); the
// guarding condition is not visible in these lines - confirm.
2557 (~C)(i,jpos) += A(i,i) * B(i,jpos);
// Default addition assignment C += A * B for a column-major target
// (DenseMatrix<MT3,true>) when A is diagonal and B is not diagonal.
// With a diagonal A only A(i,i) contributes, so C(i,j) += A(i,i) * B(i,j).
// The (i,j) index space is traversed in square tiles of BLOCK_SIZE.
2578 template<
typename MT3
2581 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2582 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2584 const size_t M( A.rows() );
2585 const size_t N( B.columns() );
// Tile edge length.
2587 const size_t block( BLOCK_SIZE );
2589 for(
size_t jj=0UL; jj<N; jj+=block ) {
2590 const size_t jend(
min( N, jj+block ) );
2591 for(
size_t ii=0UL; ii<M; ii+=block ) {
2592 const size_t iend(
min( M, ii+block ) );
2593 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the tile, narrowed when B is flagged
// lower/upper (strict variants exclude the diagonal).
// NOTE(review): the alternative branch of each conditional is not visible here.
2595 const size_t ibegin( ( IsLower<MT5>::value )
2596 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2598 const size_t ipos( ( IsUpper<MT5>::value )
2599 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2602 for(
size_t i=ibegin; i<ipos; ++i ) {
2603 (~C)(i,j) += A(i,i) * B(i,j);
// Default addition assignment C += A * B when both A and B are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) for every
// row index i of A. Takes C as a plain MT3&, independent of storage order.
2626 template<
typename MT3
2629 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2630 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2632 for(
size_t i=0UL; i<A.rows(); ++i ) {
2633 C(i,i) += A(i,i) * B(i,i);
// Fallback overload of selectSmallAddAssignKernel: chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
// It simply forwards C, A and B to selectDefaultAddAssignKernel.
2653 template<
typename MT3
2656 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2657 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2659 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition assignment C += A * B for a row-major
// target (DenseMatrix<MT3,false>); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// C is processed in tiles of IntrinsicType accumulators along the column
// index j (steps of IT::size): single elements of A are wrapped with set(),
// rows of B are read with B.load(), results are written with (~C).store().
// Tile shapes visible below (rows x vectors): 1x8, 2x4, 1x4, 2x2, 1x2, 2x1,
// 1x1, followed by a scalar pass over the remaining columns.
// NOTE(review): the enclosing loops over j and the row-index declarations
// between the tiles are not visible in these lines.
2679 template<
typename MT3
2682 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2683 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2685 typedef IntrinsicTrait<ElementType> IT;
// Extents: C is M x N, K is the shared inner dimension.
2687 const size_t M( A.rows() );
2688 const size_t N( B.columns() );
2689 const size_t K( A.columns() );
// A scalar clean-up pass over columns is needed unless both C and B are padded
// (IsPadded semantics are defined outside this block).
2691 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// N masked with size_t(-IT::size) when a remainder pass exists; this rounds
// down to a multiple of IT::size provided IT::size is a power of two.
2693 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// Tile 1x8: one row, eight vectors starting at column j.
2699 for(
size_t i=0UL; i<M; ++i )
// k range narrowed by the upper/lower flags of A (row i) and B (column span
// j .. j+IT::size*8); strict variants exclude the diagonal index.
2701 const size_t kbegin( ( IsUpper<MT4>::value )
2702 ?( ( IsLower<MT5>::value )
2703 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2704 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2705 :( IsLower<MT5>::value ? j : 0UL ) );
2706 const size_t kend( ( IsLower<MT4>::value )
2707 ?( ( IsUpper<MT5>::value )
2708 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
2709 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2710 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// Start from the current contents of C (this is an += kernel).
2712 IntrinsicType xmm1( (~C).load(i,j ) );
2713 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
2714 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
2715 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
2716 IntrinsicType xmm5( (~C).load(i,j+
IT::size*4UL) );
2717 IntrinsicType xmm6( (~C).load(i,j+
IT::size*5UL) );
2718 IntrinsicType xmm7( (~C).load(i,j+
IT::size*6UL) );
2719 IntrinsicType xmm8( (~C).load(i,j+
IT::size*7UL) );
// One set() value of A times eight loads of row k of B per iteration.
2721 for(
size_t k=kbegin; k<kend; ++k ) {
2722 const IntrinsicType a1(
set( A(i,k) ) );
2723 xmm1 = xmm1 + a1 * B.load(k,j );
2724 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
2725 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
2726 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
2727 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
2728 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
2729 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
2730 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Write the eight accumulators back.
2733 (~C).store( i, j , xmm1 );
2734 (~C).store( i, j+
IT::size , xmm2 );
2735 (~C).store( i, j+
IT::size*2UL, xmm3 );
2736 (~C).store( i, j+
IT::size*3UL, xmm4 );
2737 (~C).store( i, j+
IT::size*4UL, xmm5 );
2738 (~C).store( i, j+
IT::size*5UL, xmm6 );
2739 (~C).store( i, j+
IT::size*6UL, xmm7 );
2740 (~C).store( i, j+
IT::size*7UL, xmm8 );
// Tile 2x4: rows i and i+1, four vectors starting at column j.
2748 for( ; (i+2UL) <= M; i+=2UL )
2750 const size_t kbegin( ( IsUpper<MT4>::value )
2751 ?( ( IsLower<MT5>::value )
2752 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2753 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2754 :( IsLower<MT5>::value ? j : 0UL ) );
2755 const size_t kend( ( IsLower<MT4>::value )
2756 ?( ( IsUpper<MT5>::value )
2757 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
2758 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2759 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
2761 IntrinsicType xmm1( (~C).load(i ,j ) );
2762 IntrinsicType xmm2( (~C).load(i ,j+
IT::size ) );
2763 IntrinsicType xmm3( (~C).load(i ,j+
IT::size*2UL) );
2764 IntrinsicType xmm4( (~C).load(i ,j+
IT::size*3UL) );
2765 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
2766 IntrinsicType xmm6( (~C).load(i+1UL,j+
IT::size ) );
2767 IntrinsicType xmm7( (~C).load(i+1UL,j+
IT::size*2UL) );
2768 IntrinsicType xmm8( (~C).load(i+1UL,j+
IT::size*3UL) );
2770 for(
size_t k=kbegin; k<kend; ++k ) {
2771 const IntrinsicType a1(
set( A(i ,k) ) );
2772 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2773 const IntrinsicType b1( B.load(k,j ) );
2774 const IntrinsicType b2( B.load(k,j+
IT::size ) );
2775 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
2776 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
2777 xmm1 = xmm1 + a1 * b1;
2778 xmm2 = xmm2 + a1 * b2;
2779 xmm3 = xmm3 + a1 * b3;
2780 xmm4 = xmm4 + a1 * b4;
2781 xmm5 = xmm5 + a2 * b1;
2782 xmm6 = xmm6 + a2 * b2;
2783 xmm7 = xmm7 + a2 * b3;
2784 xmm8 = xmm8 + a2 * b4;
2787 (~C).store( i , j , xmm1 );
2788 (~C).store( i , j+
IT::size , xmm2 );
2789 (~C).store( i , j+
IT::size*2UL, xmm3 );
2790 (~C).store( i , j+
IT::size*3UL, xmm4 );
2791 (~C).store( i+1UL, j , xmm5 );
2792 (~C).store( i+1UL, j+
IT::size , xmm6 );
2793 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 );
2794 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 );
// Tile 1x4: single row i, four vectors starting at column j.
2799 const size_t kbegin( ( IsUpper<MT4>::value )
2800 ?( ( IsLower<MT5>::value )
2801 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2802 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2803 :( IsLower<MT5>::value ? j : 0UL ) );
2804 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
2806 IntrinsicType xmm1( (~C).load(i,j ) );
2807 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
2808 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
2809 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
2811 for(
size_t k=kbegin; k<kend; ++k ) {
2812 const IntrinsicType a1(
set( A(i,k) ) );
2813 xmm1 = xmm1 + a1 * B.load(k,j );
2814 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
2815 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
2816 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
2819 (~C).store( i, j , xmm1 );
2820 (~C).store( i, j+
IT::size , xmm2 );
2821 (~C).store( i, j+
IT::size*2UL, xmm3 );
2822 (~C).store( i, j+
IT::size*3UL, xmm4 );
// Tile 2x2: rows i and i+1, two vectors starting at column j.
2830 for( ; (i+2UL) <= M; i+=2UL )
2832 const size_t kbegin( ( IsUpper<MT4>::value )
2833 ?( ( IsLower<MT5>::value )
2834 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2835 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2836 :( IsLower<MT5>::value ? j : 0UL ) );
2837 const size_t kend( ( IsLower<MT4>::value )
2838 ?( ( IsUpper<MT5>::value )
2839 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
2840 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2841 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
2843 IntrinsicType xmm1( (~C).load(i ,j ) );
2844 IntrinsicType xmm2( (~C).load(i ,j+
IT::size) );
2845 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2846 IntrinsicType xmm4( (~C).load(i+1UL,j+
IT::size) );
2848 for(
size_t k=kbegin; k<kend; ++k ) {
2849 const IntrinsicType a1(
set( A(i ,k) ) );
2850 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2851 const IntrinsicType b1( B.load(k,j ) );
2852 const IntrinsicType b2( B.load(k,j+
IT::size) );
2853 xmm1 = xmm1 + a1 * b1;
2854 xmm2 = xmm2 + a1 * b2;
2855 xmm3 = xmm3 + a2 * b1;
2856 xmm4 = xmm4 + a2 * b2;
2859 (~C).store( i , j , xmm1 );
2860 (~C).store( i , j+
IT::size, xmm2 );
2861 (~C).store( i+1UL, j , xmm3 );
2862 (~C).store( i+1UL, j+
IT::size, xmm4 );
// Tile 1x2: single row i, two vectors starting at column j.
2867 const size_t kbegin( ( IsUpper<MT4>::value )
2868 ?( ( IsLower<MT5>::value )
2869 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2870 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2871 :( IsLower<MT5>::value ? j : 0UL ) );
2872 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
2874 IntrinsicType xmm1( (~C).load(i,j ) );
2875 IntrinsicType xmm2( (~C).load(i,j+
IT::size) );
2877 for(
size_t k=kbegin; k<kend; ++k ) {
2878 const IntrinsicType a1(
set( A(i,k) ) );
2879 xmm1 = xmm1 + a1 * B.load(k,j );
2880 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
// NOTE(review): xmm2 is loaded and accumulated above, but only the store of
// xmm1 is visible here. If no store of xmm2 to (i, j+IT::size) follows in the
// full file, that part of the result is lost - confirm.
2883 (~C).store( i, j , xmm1 );
// Tile 2x1: rows i and i+1, one vector at column j.
2892 for( ; (i+2UL) <= M; i+=2UL )
2894 const size_t kbegin( ( IsUpper<MT4>::value )
2895 ?( ( IsLower<MT5>::value )
2896 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2897 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2898 :( IsLower<MT5>::value ? j : 0UL ) );
// NOTE(review): the alternative branch of this conditional is not visible here.
2899 const size_t kend( ( IsLower<MT4>::value )
2900 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2903 IntrinsicType xmm1( (~C).load(i ,j) );
2904 IntrinsicType xmm2( (~C).load(i+1UL,j) );
2906 for(
size_t k=kbegin; k<kend; ++k ) {
2907 const IntrinsicType b1( B.load(k,j) );
2908 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2909 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2912 (~C).store( i , j, xmm1 );
2913 (~C).store( i+1UL, j, xmm2 );
// Tile 1x1: single row i, one vector at column j; k runs up to K.
2918 const size_t kbegin( ( IsUpper<MT4>::value )
2919 ?( ( IsLower<MT5>::value )
2920 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2921 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2922 :( IsLower<MT5>::value ? j : 0UL ) );
2924 IntrinsicType xmm1( (~C).load(i,j) );
2926 for(
size_t k=kbegin; k<K; ++k ) {
2927 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2930 (~C).store( i, j, xmm1 );
// Scalar clean-up for the remaining columns; runs only when 'remainder' is set.
2934 for( ; remainder && j<N; ++j )
// Two rows per iteration.
2938 for( ; (i+2UL) <= M; i+=2UL )
2940 const size_t kbegin( ( IsUpper<MT4>::value )
2941 ?( ( IsLower<MT5>::value )
2942 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2943 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2944 :( IsLower<MT5>::value ? j : 0UL ) );
// NOTE(review): the alternative branch of this conditional is not visible here.
2945 const size_t kend( ( IsLower<MT4>::value )
2946 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2949 ElementType value1( (~C)(i ,j) );
// NOTE(review): the trailing ';;' leaves a redundant empty statement (harmless).
2950 ElementType value2( (~C)(i+1UL,j) );;
2952 for(
size_t k=kbegin; k<kend; ++k ) {
2953 value1 += A(i ,k) * B(k,j);
2954 value2 += A(i+1UL,k) * B(k,j);
2957 (~C)(i ,j) = value1;
2958 (~C)(i+1UL,j) = value2;
// Single leftover row of the scalar pass; k runs up to K.
2963 const size_t kbegin( ( IsUpper<MT4>::value )
2964 ?( ( IsLower<MT5>::value )
2965 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2966 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2967 :( IsLower<MT5>::value ? j : 0UL ) );
2969 ElementType value( (~C)(i,j) );
2971 for(
size_t k=kbegin; k<K; ++k ) {
2972 value += A(i,k) * B(k,j);
// Vectorized small-matrix addition assignment C += A * B for a column-major
// target (DenseMatrix<MT3,true>); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// C is processed in tiles of IntrinsicType accumulators along the row index i
// (steps of IT::size): columns of A are read with A.load(), single elements
// of B are wrapped with set(), results are written with (~C).store().
// Tile shapes visible below (vectors x columns): 8x1, 4x2, 4x1, 2x2, 2x1, 1x2,
// 1x1, followed by a scalar pass over the remaining rows.
// NOTE(review): the enclosing loops over i and the column-index declarations
// between the tiles are not visible in these lines.
2997 template<
typename MT3
3000 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3001 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3003 typedef IntrinsicTrait<ElementType> IT;
// Extents: C is M x N, K is the shared inner dimension.
3005 const size_t M( A.rows() );
3006 const size_t N( B.columns() );
3007 const size_t K( A.columns() );
// A scalar clean-up pass over rows is needed unless both C and A are padded
// (IsPadded semantics are defined outside this block).
3009 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// M masked with size_t(-IT::size) when a remainder pass exists; this rounds
// down to a multiple of IT::size provided IT::size is a power of two.
3011 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// Tile 8x1: eight vectors starting at row i, one column.
3017 for(
size_t j=0UL; j<N; ++j )
// k range narrowed by the lower/upper flags of B (column j) and A (row span
// i .. i+IT::size*8); strict variants exclude the diagonal index.
3019 const size_t kbegin( ( IsLower<MT5>::value )
3020 ?( ( IsUpper<MT4>::value )
3021 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3022 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3023 :( IsUpper<MT4>::value ? i : 0UL ) );
3024 const size_t kend( ( IsUpper<MT5>::value )
3025 ?( ( IsLower<MT4>::value )
3026 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3027 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3028 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Start from the current contents of C (this is an += kernel).
3030 IntrinsicType xmm1( (~C).load(i ,j) );
3031 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
3032 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
3033 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
3034 IntrinsicType xmm5( (~C).load(i+
IT::size*4UL,j) );
3035 IntrinsicType xmm6( (~C).load(i+
IT::size*5UL,j) );
3036 IntrinsicType xmm7( (~C).load(i+
IT::size*6UL,j) );
3037 IntrinsicType xmm8( (~C).load(i+
IT::size*7UL,j) );
// Eight loads of column k of A times one set() value of B per iteration.
3039 for(
size_t k=kbegin; k<kend; ++k ) {
3040 const IntrinsicType b1(
set( B(k,j) ) );
3041 xmm1 = xmm1 + A.load(i ,k) * b1;
3042 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
3043 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
3044 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
3045 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
3046 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
3047 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
3048 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Write the eight accumulators back.
3051 (~C).store( i , j, xmm1 );
3052 (~C).store( i+
IT::size , j, xmm2 );
3053 (~C).store( i+
IT::size*2UL, j, xmm3 );
3054 (~C).store( i+
IT::size*3UL, j, xmm4 );
3055 (~C).store( i+
IT::size*4UL, j, xmm5 );
3056 (~C).store( i+
IT::size*5UL, j, xmm6 );
3057 (~C).store( i+
IT::size*6UL, j, xmm7 );
3058 (~C).store( i+
IT::size*7UL, j, xmm8 );
// Tile 4x2: four vectors starting at row i, columns j and j+1.
3066 for( ; (j+2UL) <= N; j+=2UL )
3068 const size_t kbegin( ( IsLower<MT5>::value )
3069 ?( ( IsUpper<MT4>::value )
3070 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3071 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3072 :( IsUpper<MT4>::value ? i : 0UL ) );
3073 const size_t kend( ( IsUpper<MT5>::value )
3074 ?( ( IsLower<MT4>::value )
3075 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3076 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3077 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
3079 IntrinsicType xmm1( (~C).load(i ,j ) );
3080 IntrinsicType xmm2( (~C).load(i+
IT::size ,j ) );
3081 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j ) );
3082 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j ) );
3083 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3084 IntrinsicType xmm6( (~C).load(i+
IT::size ,j+1UL) );
3085 IntrinsicType xmm7( (~C).load(i+
IT::size*2UL,j+1UL) );
3086 IntrinsicType xmm8( (~C).load(i+
IT::size*3UL,j+1UL) );
3088 for(
size_t k=kbegin; k<kend; ++k ) {
3089 const IntrinsicType a1( A.load(i ,k) );
3090 const IntrinsicType a2( A.load(i+
IT::size ,k) );
3091 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
3092 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
3093 const IntrinsicType b1(
set( B(k,j ) ) );
3094 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3095 xmm1 = xmm1 + a1 * b1;
3096 xmm2 = xmm2 + a2 * b1;
3097 xmm3 = xmm3 + a3 * b1;
3098 xmm4 = xmm4 + a4 * b1;
3099 xmm5 = xmm5 + a1 * b2;
3100 xmm6 = xmm6 + a2 * b2;
3101 xmm7 = xmm7 + a3 * b2;
3102 xmm8 = xmm8 + a4 * b2;
3105 (~C).store( i , j , xmm1 );
3106 (~C).store( i+
IT::size , j , xmm2 );
3107 (~C).store( i+
IT::size*2UL, j , xmm3 );
3108 (~C).store( i+
IT::size*3UL, j , xmm4 );
3109 (~C).store( i , j+1UL, xmm5 );
3110 (~C).store( i+
IT::size , j+1UL, xmm6 );
3111 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 );
3112 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 );
// Tile 4x1: four vectors starting at row i, single column j.
3117 const size_t kbegin( ( IsLower<MT5>::value )
3118 ?( ( IsUpper<MT4>::value )
3119 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3120 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3121 :( IsUpper<MT4>::value ? i : 0UL ) );
3122 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
3124 IntrinsicType xmm1( (~C).load(i ,j) );
3125 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
3126 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
3127 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
3129 for(
size_t k=kbegin; k<kend; ++k ) {
3130 const IntrinsicType b1(
set( B(k,j) ) );
3131 xmm1 = xmm1 + A.load(i ,k) * b1;
3132 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
3133 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
3134 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
3137 (~C).store( i , j, xmm1 );
3138 (~C).store( i+
IT::size , j, xmm2 );
3139 (~C).store( i+
IT::size*2UL, j, xmm3 );
3140 (~C).store( i+
IT::size*3UL, j, xmm4 );
// Tile 2x2: two vectors starting at row i, columns j and j+1.
3148 for( ; (j+2UL) <= N; j+=2UL )
3150 const size_t kbegin( ( IsLower<MT5>::value )
3151 ?( ( IsUpper<MT4>::value )
3152 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3153 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3154 :( IsUpper<MT4>::value ? i : 0UL ) );
3155 const size_t kend( ( IsUpper<MT5>::value )
3156 ?( ( IsLower<MT4>::value )
3157 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3158 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3159 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
3161 IntrinsicType xmm1( (~C).load(i ,j ) );
3162 IntrinsicType xmm2( (~C).load(i+
IT::size,j ) );
3163 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3164 IntrinsicType xmm4( (~C).load(i+
IT::size,j+1UL) );
3166 for(
size_t k=kbegin; k<kend; ++k ) {
3167 const IntrinsicType a1( A.load(i ,k) );
3168 const IntrinsicType a2( A.load(i+
IT::size,k) );
3169 const IntrinsicType b1(
set( B(k,j ) ) );
3170 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3171 xmm1 = xmm1 + a1 * b1;
3172 xmm2 = xmm2 + a2 * b1;
3173 xmm3 = xmm3 + a1 * b2;
3174 xmm4 = xmm4 + a2 * b2;
3177 (~C).store( i , j , xmm1 );
3178 (~C).store( i+
IT::size, j , xmm2 );
3179 (~C).store( i , j+1UL, xmm3 );
3180 (~C).store( i+
IT::size, j+1UL, xmm4 );
// Tile 2x1: two vectors starting at row i, single column j.
3185 const size_t kbegin( ( IsLower<MT5>::value )
3186 ?( ( IsUpper<MT4>::value )
3187 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3188 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3189 :( IsUpper<MT4>::value ? i : 0UL ) );
3190 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
3192 IntrinsicType xmm1( (~C).load(i ,j) );
3193 IntrinsicType xmm2( (~C).load(i+
IT::size,j) );
3195 for(
size_t k=kbegin; k<kend; ++k ) {
3196 const IntrinsicType b1(
set( B(k,j) ) );
3197 xmm1 = xmm1 + A.load(i ,k) * b1;
3198 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
// NOTE(review): xmm2 is loaded and accumulated above, but only the store of
// xmm1 is visible here. If no store of xmm2 to (i+IT::size, j) follows in the
// full file, that part of the result is lost - confirm.
3201 (~C).store( i , j, xmm1 );
// Tile 1x2: one vector at row i, columns j and j+1.
3210 for( ; (j+2UL) <= N; j+=2UL )
3212 const size_t kbegin( ( IsLower<MT5>::value )
3213 ?( ( IsUpper<MT4>::value )
3214 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3215 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3216 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the alternative branch of this conditional is not visible here.
3217 const size_t kend( ( IsUpper<MT5>::value )
3218 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3221 IntrinsicType xmm1( (~C).load(i,j ) );
3222 IntrinsicType xmm2( (~C).load(i,j+1UL) );
3224 for(
size_t k=kbegin; k<kend; ++k ) {
3225 const IntrinsicType a1( A.load(i,k) );
3226 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3227 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3230 (~C).store( i, j , xmm1 );
3231 (~C).store( i, j+1UL, xmm2 );
// Tile 1x1: one vector at row i, single column j; k runs up to K.
3236 const size_t kbegin( ( IsLower<MT5>::value )
3237 ?( ( IsUpper<MT4>::value )
3238 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3239 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3240 :( IsUpper<MT4>::value ? i : 0UL ) );
3242 IntrinsicType xmm1( (~C).load(i,j) );
3244 for(
size_t k=kbegin; k<K; ++k ) {
3245 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3248 (~C).store( i, j, xmm1 );
// Scalar clean-up for the remaining rows; runs only when 'remainder' is set.
3252 for( ; remainder && i<M; ++i )
// Two columns per iteration.
3256 for( ; (j+2UL) <= N; j+=2UL )
3258 const size_t kbegin( ( IsLower<MT5>::value )
3259 ?( ( IsUpper<MT4>::value )
3260 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3261 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3262 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the alternative branch of this conditional is not visible here.
3263 const size_t kend( ( IsUpper<MT5>::value )
3264 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3267 ElementType value1( (~C)(i,j ) );
3268 ElementType value2( (~C)(i,j+1UL) );
3270 for(
size_t k=kbegin; k<kend; ++k ) {
3271 value1 += A(i,k) * B(k,j );
3272 value2 += A(i,k) * B(k,j+1UL);
3275 (~C)(i,j ) = value1;
3276 (~C)(i,j+1UL) = value2;
// Single leftover column of the scalar pass; k runs up to K.
3281 const size_t kbegin( ( IsLower<MT5>::value )
3282 ?( ( IsUpper<MT4>::value )
3283 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3284 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3285 :( IsUpper<MT4>::value ? i : 0UL ) );
3287 ElementType value( (~C)(i,j) );
3289 for(
size_t k=kbegin; k<K; ++k ) {
3290 value += A(i,k) * B(k,j);
// Fallback overload of selectLargeAddAssignKernel: chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf).
// It simply forwards C, A and B to selectDefaultAddAssignKernel.
3314 template<
typename MT3
3317 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3318 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3320 selectDefaultAddAssignKernel( C, A, B );
// Vectorized large-matrix addition-assignment kernel (C += A * B) for a target of type
// DenseMatrix<MT3,false>, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The iteration space is tiled into jj/ii/kk blocks; inside a tile the code accumulates
// into SIMD registers loaded from C, multiplies a broadcast element of A (set( A(i,k) ))
// with vectors loaded from rows of B, and stores the result back into C.
// NOTE(review): several loop headers, the definitions of j1/j2/j3 and all braces are on
// lines not visible in this excerpt; the per-section notes below are inferred from the
// visible loads/stores and should be confirmed against the full file.
3340 template<
typename MT3
3343 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3344 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3346 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product dimensions.
3348 const size_t M( A.rows() );
3349 const size_t N( B.columns() );
3350 const size_t K( A.columns() );
// Scalar remainder handling is needed unless both C and B are padded.
3352 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Column blocks of width DMATDMATMULT_DEFAULT_JBLOCK_SIZE.
3354 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
3356 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// End of the vectorized column range: jend masked with -IT::size when a remainder is
// possible (rounds down to a multiple of IT::size, assuming IT::size is a power of two).
3358 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Row blocks of height DMATDMATMULT_DEFAULT_IBLOCK_SIZE.
3361 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
3363 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Blocks along the inner dimension; ktmp is the exclusive end of the current k block.
3365 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
3367 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Two rows (i, i+1) times four vector columns (j, j1, j2, j3) of C per iteration.
// j1..j3 are presumably j+IT::size, j+IT::size*2, j+IT::size*3 - TODO confirm.
3379 for( ; (i+2UL) <= iend; i+=2UL )
// The k range is clipped to the current k block and narrowed further when A is upper/lower
// or B is lower/upper (compile-time IsUpper/IsLower traits).
3381 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3382 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3383 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3384 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// Start from the current contents of C so that the product is added to it.
3386 IntrinsicType xmm1( (~C).load(i ,j ) );
3387 IntrinsicType xmm2( (~C).load(i ,j1) );
3388 IntrinsicType xmm3( (~C).load(i ,j2) );
3389 IntrinsicType xmm4( (~C).load(i ,j3) );
3390 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3391 IntrinsicType xmm6( (~C).load(i+1UL,j1) );
3392 IntrinsicType xmm7( (~C).load(i+1UL,j2) );
3393 IntrinsicType xmm8( (~C).load(i+1UL,j3) );
3395 for(
size_t k=kbegin; k<kend; ++k ) {
3396 const IntrinsicType a1(
set( A(i ,k) ) );
3397 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3398 const IntrinsicType b1( B.load(k,j ) );
3399 const IntrinsicType b2( B.load(k,j1) );
3400 const IntrinsicType b3( B.load(k,j2) );
3401 const IntrinsicType b4( B.load(k,j3) );
3402 xmm1 = xmm1 + a1 * b1;
3403 xmm2 = xmm2 + a1 * b2;
3404 xmm3 = xmm3 + a1 * b3;
3405 xmm4 = xmm4 + a1 * b4;
3406 xmm5 = xmm5 + a2 * b1;
3407 xmm6 = xmm6 + a2 * b2;
3408 xmm7 = xmm7 + a2 * b3;
3409 xmm8 = xmm8 + a2 * b4;
3412 (~C).store( i , j , xmm1 );
3413 (~C).store( i , j1, xmm2 );
3414 (~C).store( i , j2, xmm3 );
3415 (~C).store( i , j3, xmm4 );
3416 (~C).store( i+1UL, j , xmm5 );
3417 (~C).store( i+1UL, j1, xmm6 );
3418 (~C).store( i+1UL, j2, xmm7 );
3419 (~C).store( i+1UL, j3, xmm8 );
// Single row i times four vector columns (presumably the leftover row of the block; the
// guarding condition is not visible here).
3424 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3425 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3426 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3427 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
3429 IntrinsicType xmm1( (~C).load(i,j ) );
3430 IntrinsicType xmm2( (~C).load(i,j1) );
3431 IntrinsicType xmm3( (~C).load(i,j2) );
3432 IntrinsicType xmm4( (~C).load(i,j3) );
3434 for(
size_t k=kbegin; k<kend; ++k ) {
3435 const IntrinsicType a1(
set( A(i,k) ) );
3436 xmm1 = xmm1 + a1 * B.load(k,j );
3437 xmm2 = xmm2 + a1 * B.load(k,j1);
3438 xmm3 = xmm3 + a1 * B.load(k,j2);
3439 xmm4 = xmm4 + a1 * B.load(k,j3);
3442 (~C).store( i, j , xmm1 );
3443 (~C).store( i, j1, xmm2 );
3444 (~C).store( i, j2, xmm3 );
3445 (~C).store( i, j3, xmm4 );
// Four rows (i..i+3) times two vector columns (j, j1) of C per iteration.
3455 for( ; (i+4UL) <= iend; i+=4UL )
3457 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3458 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3459 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3460 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3462 IntrinsicType xmm1( (~C).load(i ,j ) );
3463 IntrinsicType xmm2( (~C).load(i ,j1) );
3464 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3465 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3466 IntrinsicType xmm5( (~C).load(i+2UL,j ) );
3467 IntrinsicType xmm6( (~C).load(i+2UL,j1) );
3468 IntrinsicType xmm7( (~C).load(i+3UL,j ) );
3469 IntrinsicType xmm8( (~C).load(i+3UL,j1) );
3471 for(
size_t k=kbegin; k<kend; ++k ) {
3472 const IntrinsicType a1(
set( A(i ,k) ) );
3473 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3474 const IntrinsicType a3(
set( A(i+2UL,k) ) );
3475 const IntrinsicType a4(
set( A(i+3UL,k) ) );
3476 const IntrinsicType b1( B.load(k,j ) );
3477 const IntrinsicType b2( B.load(k,j1) );
3478 xmm1 = xmm1 + a1 * b1;
3479 xmm2 = xmm2 + a1 * b2;
3480 xmm3 = xmm3 + a2 * b1;
3481 xmm4 = xmm4 + a2 * b2;
3482 xmm5 = xmm5 + a3 * b1;
3483 xmm6 = xmm6 + a3 * b2;
3484 xmm7 = xmm7 + a4 * b1;
3485 xmm8 = xmm8 + a4 * b2;
3488 (~C).store( i , j , xmm1 );
3489 (~C).store( i , j1, xmm2 );
3490 (~C).store( i+1UL, j , xmm3 );
3491 (~C).store( i+1UL, j1, xmm4 );
3492 (~C).store( i+2UL, j , xmm5 );
3493 (~C).store( i+2UL, j1, xmm6 );
3494 (~C).store( i+3UL, j , xmm7 );
3495 (~C).store( i+3UL, j1, xmm8 );
// Two rows times two vector columns.
3498 for( ; (i+2UL) <= iend; i+=2UL )
3500 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3501 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3502 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3503 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3505 IntrinsicType xmm1( (~C).load(i ,j ) );
3506 IntrinsicType xmm2( (~C).load(i ,j1) );
3507 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3508 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3510 for(
size_t k=kbegin; k<kend; ++k ) {
3511 const IntrinsicType a1(
set( A(i ,k) ) );
3512 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3513 const IntrinsicType b1( B.load(k,j ) );
3514 const IntrinsicType b2( B.load(k,j1) );
3515 xmm1 = xmm1 + a1 * b1;
3516 xmm2 = xmm2 + a1 * b2;
3517 xmm3 = xmm3 + a2 * b1;
3518 xmm4 = xmm4 + a2 * b2;
3521 (~C).store( i , j , xmm1 );
3522 (~C).store( i , j1, xmm2 );
3523 (~C).store( i+1UL, j , xmm3 );
3524 (~C).store( i+1UL, j1, xmm4 );
// Single row times two vector columns (guarding condition not visible here).
3529 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3530 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3531 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3532 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3534 IntrinsicType xmm1( (~C).load(i,j ) );
3535 IntrinsicType xmm2( (~C).load(i,j1) );
3537 for(
size_t k=kbegin; k<kend; ++k ) {
3538 const IntrinsicType a1(
set( A(i,k) ) );
3539 xmm1 = xmm1 + a1 * B.load(k,j );
3540 xmm2 = xmm2 + a1 * B.load(k,j1);
3543 (~C).store( i, j , xmm1 );
3544 (~C).store( i, j1, xmm2 );
// One vector column at a time, visiting every row of the current row block.
3550 for(
size_t i=ii; i<iend; ++i )
3552 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3553 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3554 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3555 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
3557 IntrinsicType xmm1( (~C).load(i,j) );
3559 for(
size_t k=kbegin; k<kend; ++k ) {
3560 const IntrinsicType a1(
set( A(i,k) ) );
3561 xmm1 = xmm1 + a1 * B.load(k,j);
3564 (~C).store( i, j, xmm1 );
// Scalar tail: columns from j up to jend are processed element by element, only when
// 'remainder' is set.
3568 for( ; remainder && j<jend; ++j )
3570 for(
size_t i=ii; i<iend; ++i )
3572 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3573 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3574 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3575 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
3577 ElementType value( (~C)(i,j) );
3579 for(
size_t k=kbegin; k<kend; ++k ) {
3580 value += A(i,k) * B(k,j);
// Vectorized large-matrix addition-assignment kernel (C += A * B) for a target of type
// DenseMatrix<MT3,true>, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Mirror image of the DenseMatrix<MT3,false> overload: SIMD vectors run along the row
// index i (loaded from A and C), while elements of B are broadcast via set( B(k,j) ).
// The iteration space is tiled into ii/jj/kk blocks using the TDMATTDMATMULT_DEFAULT_*
// block sizes.
// NOTE(review): several loop headers, the definitions of i1/i2/i3 and all braces are on
// lines not visible in this excerpt; the per-section notes below are inferred from the
// visible loads/stores and should be confirmed against the full file.
3608 template<
typename MT3
3611 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3612 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3614 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product dimensions.
3616 const size_t M( A.rows() );
3617 const size_t N( B.columns() );
3618 const size_t K( A.columns() );
// Scalar remainder handling is needed unless both C and A are padded.
3620 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Row blocks of height TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE.
3622 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
3624 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// End of the vectorized row range: iend masked with -IT::size when a remainder is possible
// (rounds down to a multiple of IT::size, assuming IT::size is a power of two).
3626 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Column blocks of width TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE.
3629 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
3631 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Blocks along the inner dimension; ktmp is the exclusive end of the current k block.
3633 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
3635 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Four vector rows (i, i1, i2, i3) times two columns (j, j+1) of C per iteration.
// i1..i3 are presumably i+IT::size, i+IT::size*2, i+IT::size*3 - TODO confirm.
3647 for( ; (j+2UL) <= jend; j+=2UL )
// The k range is clipped to the current k block and narrowed further when A is upper/lower
// or B is lower/upper (compile-time IsUpper/IsLower traits).
3649 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3650 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3651 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3652 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Start from the current contents of C so that the product is added to it.
3654 IntrinsicType xmm1( (~C).load(i ,j ) );
3655 IntrinsicType xmm2( (~C).load(i1,j ) );
3656 IntrinsicType xmm3( (~C).load(i2,j ) );
3657 IntrinsicType xmm4( (~C).load(i3,j ) );
3658 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3659 IntrinsicType xmm6( (~C).load(i1,j+1UL) );
3660 IntrinsicType xmm7( (~C).load(i2,j+1UL) );
3661 IntrinsicType xmm8( (~C).load(i3,j+1UL) );
3663 for(
size_t k=kbegin; k<kend; ++k ) {
3664 const IntrinsicType a1( A.load(i ,k) );
3665 const IntrinsicType a2( A.load(i1,k) );
3666 const IntrinsicType a3( A.load(i2,k) );
3667 const IntrinsicType a4( A.load(i3,k) );
3668 const IntrinsicType b1(
set( B(k,j ) ) );
3669 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3670 xmm1 = xmm1 + a1 * b1;
3671 xmm2 = xmm2 + a2 * b1;
3672 xmm3 = xmm3 + a3 * b1;
3673 xmm4 = xmm4 + a4 * b1;
3674 xmm5 = xmm5 + a1 * b2;
3675 xmm6 = xmm6 + a2 * b2;
3676 xmm7 = xmm7 + a3 * b2;
3677 xmm8 = xmm8 + a4 * b2;
3680 (~C).store( i , j , xmm1 );
3681 (~C).store( i1, j , xmm2 );
3682 (~C).store( i2, j , xmm3 );
3683 (~C).store( i3, j , xmm4 );
3684 (~C).store( i , j+1UL, xmm5 );
3685 (~C).store( i1, j+1UL, xmm6 );
3686 (~C).store( i2, j+1UL, xmm7 );
3687 (~C).store( i3, j+1UL, xmm8 );
// Four vector rows times a single column j (presumably the leftover column of the block;
// the guarding condition is not visible here).
3692 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3693 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3694 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3695 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3697 IntrinsicType xmm1( (~C).load(i ,j) );
3698 IntrinsicType xmm2( (~C).load(i1,j) );
3699 IntrinsicType xmm3( (~C).load(i2,j) );
3700 IntrinsicType xmm4( (~C).load(i3,j) );
3702 for(
size_t k=kbegin; k<kend; ++k ) {
3703 const IntrinsicType b1(
set( B(k,j) ) );
3704 xmm1 = xmm1 + A.load(i ,k) * b1;
3705 xmm2 = xmm2 + A.load(i1,k) * b1;
3706 xmm3 = xmm3 + A.load(i2,k) * b1;
3707 xmm4 = xmm4 + A.load(i3,k) * b1;
3710 (~C).store( i , j, xmm1 );
3711 (~C).store( i1, j, xmm2 );
3712 (~C).store( i2, j, xmm3 );
3713 (~C).store( i3, j, xmm4 );
// Two vector rows (i, i1) times four columns (j..j+3) of C per iteration.
3723 for( ; (j+4UL) <= jend; j+=4UL )
3725 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3726 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3727 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3728 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3730 IntrinsicType xmm1( (~C).load(i ,j ) );
3731 IntrinsicType xmm2( (~C).load(i1,j ) );
3732 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3733 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3734 IntrinsicType xmm5( (~C).load(i ,j+2UL) );
3735 IntrinsicType xmm6( (~C).load(i1,j+2UL) );
3736 IntrinsicType xmm7( (~C).load(i ,j+3UL) );
3737 IntrinsicType xmm8( (~C).load(i1,j+3UL) );
3739 for(
size_t k=kbegin; k<kend; ++k ) {
3740 const IntrinsicType a1( A.load(i ,k) );
3741 const IntrinsicType a2( A.load(i1,k) );
3742 const IntrinsicType b1(
set( B(k,j ) ) );
3743 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3744 const IntrinsicType b3(
set( B(k,j+2UL) ) );
3745 const IntrinsicType b4(
set( B(k,j+3UL) ) );
3746 xmm1 = xmm1 + a1 * b1;
3747 xmm2 = xmm2 + a2 * b1;
3748 xmm3 = xmm3 + a1 * b2;
3749 xmm4 = xmm4 + a2 * b2;
3750 xmm5 = xmm5 + a1 * b3;
3751 xmm6 = xmm6 + a2 * b3;
3752 xmm7 = xmm7 + a1 * b4;
3753 xmm8 = xmm8 + a2 * b4;
3756 (~C).store( i , j , xmm1 );
3757 (~C).store( i1, j , xmm2 );
3758 (~C).store( i , j+1UL, xmm3 );
3759 (~C).store( i1, j+1UL, xmm4 );
3760 (~C).store( i , j+2UL, xmm5 );
3761 (~C).store( i1, j+2UL, xmm6 );
3762 (~C).store( i , j+3UL, xmm7 );
3763 (~C).store( i1, j+3UL, xmm8 );
// Two vector rows times two columns.
3766 for( ; (j+2UL) <= jend; j+=2UL )
3768 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3769 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3770 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3771 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3773 IntrinsicType xmm1( (~C).load(i ,j ) );
3774 IntrinsicType xmm2( (~C).load(i1,j ) );
3775 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3776 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3778 for(
size_t k=kbegin; k<kend; ++k ) {
3779 const IntrinsicType a1( A.load(i ,k) );
3780 const IntrinsicType a2( A.load(i1,k) );
3781 const IntrinsicType b1(
set( B(k,j ) ) );
3782 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3783 xmm1 = xmm1 + a1 * b1;
3784 xmm2 = xmm2 + a2 * b1;
3785 xmm3 = xmm3 + a1 * b2;
3786 xmm4 = xmm4 + a2 * b2;
3789 (~C).store( i , j , xmm1 );
3790 (~C).store( i1, j , xmm2 );
3791 (~C).store( i , j+1UL, xmm3 );
3792 (~C).store( i1, j+1UL, xmm4 );
// Two vector rows times a single column (guarding condition not visible here).
3797 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3798 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3799 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3800 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3802 IntrinsicType xmm1( (~C).load(i ,j) );
3803 IntrinsicType xmm2( (~C).load(i1,j) );
3805 for(
size_t k=kbegin; k<kend; ++k ) {
3806 const IntrinsicType b1(
set( B(k,j) ) );
3807 xmm1 = xmm1 + A.load(i ,k) * b1;
3808 xmm2 = xmm2 + A.load(i1,k) * b1;
3811 (~C).store( i , j, xmm1 );
3812 (~C).store( i1, j, xmm2 );
// One vector row at a time, visiting every column of the current column block.
3818 for(
size_t j=jj; j<jend; ++j )
3820 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3821 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3822 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
3823 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3825 IntrinsicType xmm1( (~C).load(i,j) );
3827 for(
size_t k=kbegin; k<kend; ++k ) {
3828 const IntrinsicType b1(
set( B(k,j) ) );
3829 xmm1 = xmm1 + A.load(i,k) * b1;
3832 (~C).store( i, j, xmm1 );
// Scalar tail: rows from i up to iend are processed element by element, only when
// 'remainder' is set.
3836 for( ; remainder && i<iend; ++i )
3838 for(
size_t j=jj; j<jend; ++j )
3840 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3841 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3842 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
3843 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3845 ElementType value( (~C)(i,j) );
3847 for(
size_t k=kbegin; k<kend; ++k ) {
3848 value += A(i,k) * B(k,j);
// Fallback overload of the BLAS-based addition-assignment kernel.
// It is selected through DisableIf when UseBlasKernel<MT3,MT4,MT5> does not hold, and
// forwards C, A and B to selectLargeAddAssignKernel().
// NOTE(review): the remaining template parameters and the braces are on lines that are not
// visible in this excerpt.
3875 template<
typename MT3
3878 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3879 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3881 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel (C += A * B), enabled when
// UseBlasKernel<MT3,MT4,MT5> holds.
// Three paths are visible:
//  - A triangular: trmm() is applied to 'tmp' with A on the left, then tmp is added to C.
//  - B triangular: trmm() is applied to 'tmp' with B on the right, then tmp is added to C.
//  - otherwise:    gemm( C, A, B, ET(1), ET(1) ) (presumably alpha = 1 and beta = 1).
// NOTE(review): the declarations of 'tmp' and 'ET', the final 'else' and the braces are on
// lines not visible in this excerpt; tmp is presumably a copy of the non-triangular
// operand - confirm against the full file.
3901 template<
typename MT3
3904 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3905 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The lower/upper flag passed to trmm is derived from the triangular operand's IsLower trait.
3909 if( IsTriangular<MT4>::value ) {
3911 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3912 addAssign( C, tmp );
3914 else if( IsTriangular<MT5>::value ) {
3916 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3917 addAssign( C, tmp );
3920 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of a TDMatDMatMultExpr to a dense matrix (lhs -= rhs.lhs_ * rhs.rhs_).
// Visible steps: a check for an empty target or an empty inner dimension, evaluation of
// both operands through serial(), and dispatch to selectSubAssignKernel().
// NOTE(review): the body of the emptiness check (presumably an early return), any
// assertions and the braces are on lines not visible in this excerpt.
3944 template<
typename MT
3946 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Nothing to subtract if the target has no rows/columns or the inner dimension is zero.
3953 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// LT/RT are operand types declared elsewhere in the class.
3957 LT A(
serial( rhs.lhs_ ) );
3958 RT B(
serial( rhs.rhs_ ) );
3967 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the subtraction-assignment kernel for C -= A * B.
// The small kernel is used when both operands are diagonal or when the number of elements
// of C (rows * columns) is below TDMATDMATMULT_THRESHOLD; the BLAS kernel is called
// otherwise.
// NOTE(review): the 'else' separating the two calls is on a line not visible in this
// excerpt - confirm against the full file.
3983 template<
typename MT3
3986 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3988 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
3989 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3990 selectSmallSubAssignKernel( C, A, B );
3992 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel C -= A * B for a DenseMatrix<MT3,false>
// target, enabled when neither A nor B is diagonal.
// Loop order is i, k, j; the k range is narrowed by the triangular structure of A and the
// j range by the triangular structure of B. The innermost loop is unrolled by two.
// NOTE(review): the fallback branches of the range ternaries, the guard in front of the
// final single-element update and the braces are on lines not visible in this excerpt.
4011 template<
typename MT3
4014 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4015 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4017 const size_t M( A.rows() );
4018 const size_t N( B.columns() );
4019 const size_t K( A.columns() );
4021 for(
size_t i=0UL; i<M; ++i )
// Range of k for which A(i,k) can be non-zero given A's upper/lower structure.
4023 const size_t kbegin( ( IsUpper<MT4>::value )
4024 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4026 const size_t kend( ( IsLower<MT4>::value )
4027 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4031 for(
size_t k=kbegin; k<kend; ++k )
// Range of j for which B(k,j) can be non-zero given B's upper/lower structure.
4033 const size_t jbegin( ( IsUpper<MT5>::value )
4034 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4036 const size_t jend( ( IsLower<MT5>::value )
4037 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos = jbegin plus the largest even count not exceeding jnum (mask with -2).
4041 const size_t jnum( jend - jbegin );
4042 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4044 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4045 (~C)(i,j ) -= A(i,k) * B(k,j );
4046 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover element when jnum is odd (presumably guarded by a jpos < jend check - TODO confirm).
4049 (~C)(i,jpos) -= A(i,k) * B(k,jpos);
// Default (scalar) subtraction-assignment kernel C -= A * B for a DenseMatrix<MT3,true>
// target, enabled when neither A nor B is diagonal.
// Loop order is j, k, i; the k range is narrowed by the triangular structure of B and the
// i range by the triangular structure of A. The innermost loop is unrolled by two.
// NOTE(review): the fallback branches of the range ternaries, the guard in front of the
// final single-element update and the braces are on lines not visible in this excerpt.
4071 template<
typename MT3
4074 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4075 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4077 const size_t M( A.rows() );
4078 const size_t N( B.columns() );
4079 const size_t K( A.columns() );
4081 for(
size_t j=0UL; j<N; ++j )
// Range of k for which B(k,j) can be non-zero given B's lower/upper structure.
4083 const size_t kbegin( ( IsLower<MT5>::value )
4084 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4086 const size_t kend( ( IsUpper<MT5>::value )
4087 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4091 for(
size_t k=kbegin; k<kend; ++k )
// Range of i for which A(i,k) can be non-zero given A's lower/upper structure.
4093 const size_t ibegin( ( IsLower<MT4>::value )
4094 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4096 const size_t iend( ( IsUpper<MT4>::value )
4097 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos = ibegin plus the largest even count not exceeding inum (mask with -2).
4101 const size_t inum( iend - ibegin );
4102 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4104 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4105 (~C)(i ,j) -= A(i ,k) * B(k,j);
4106 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover element when inum is odd (presumably guarded by an ipos < iend check - TODO confirm).
4109 (~C)(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel C -= A * B for a DenseMatrix<MT3,false> target,
// enabled when A is not diagonal and B is diagonal.
// Only the diagonal element B(j,j) is read, so each update is C(i,j) -= A(i,j) * B(j,j).
// The i/j space is traversed in square tiles of BLOCK_SIZE; within a tile the j range is
// narrowed further by the triangular structure of A.
// NOTE(review): the fallback branches of the jbegin/jpos ternaries (presumably jj and
// jend) and the braces are on lines not visible in this excerpt.
4131 template<
typename MT3
4134 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4135 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4137 const size_t M( A.rows() );
4138 const size_t N( B.columns() );
4140 const size_t block( BLOCK_SIZE );
4142 for(
size_t ii=0UL; ii<M; ii+=block ) {
4143 const size_t iend(
min( M, ii+block ) );
4144 for(
size_t jj=0UL; jj<N; jj+=block ) {
4145 const size_t jend(
min( N, jj+block ) );
4146 for(
size_t i=ii; i<iend; ++i )
// Column range inside the current tile where A(i,j) can be non-zero.
4148 const size_t jbegin( ( IsUpper<MT4>::value )
4149 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4151 const size_t jpos( ( IsLower<MT4>::value )
4152 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4155 for(
size_t j=jbegin; j<jpos; ++j ) {
4156 (~C)(i,j) -= A(i,j) * B(j,j);
// Default subtraction-assignment kernel C -= A * B for a DenseMatrix<MT3,true> target,
// enabled when A is not diagonal and B is diagonal.
// Only the diagonal element B(j,j) is read, so each update is C(i,j) -= A(i,j) * B(j,j).
// Columns are visited one by one; the i range is narrowed by the triangular structure of A
// and the inner loop is unrolled by two.
// NOTE(review): the fallback branches of the ibegin/iend ternaries, the guard in front of
// the final single-element update and the braces are on lines not visible in this excerpt.
4179 template<
typename MT3
4182 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4183 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4185 const size_t M( A.rows() );
4186 const size_t N( B.columns() );
4188 for(
size_t j=0UL; j<N; ++j )
// Row range where A(i,j) can be non-zero given A's lower/upper structure.
4190 const size_t ibegin( ( IsLower<MT4>::value )
4191 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4193 const size_t iend( ( IsUpper<MT4>::value )
4194 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the largest even count not exceeding inum (mask with -2).
4198 const size_t inum( iend - ibegin );
4199 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4201 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4202 (~C)(i ,j) -= A(i ,j) * B(j,j);
4203 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Leftover element when inum is odd (presumably guarded by an ipos < iend check - TODO confirm).
4206 (~C)(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel C -= A * B for a DenseMatrix<MT3,false> target,
// enabled when A is diagonal and B is not diagonal.
// Only the diagonal element A(i,i) is read, so each update is C(i,j) -= A(i,i) * B(i,j).
// Rows are visited one by one; the j range is narrowed by the triangular structure of B
// and the inner loop is unrolled by two.
// NOTE(review): the fallback branches of the jbegin/jend ternaries, the guard in front of
// the final single-element update and the braces are on lines not visible in this excerpt.
4227 template<
typename MT3
4230 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4231 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4233 const size_t M( A.rows() );
4234 const size_t N( B.columns() );
4236 for(
size_t i=0UL; i<M; ++i )
// Column range where B(i,j) can be non-zero given B's upper/lower structure.
4238 const size_t jbegin( ( IsUpper<MT5>::value )
4239 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4241 const size_t jend( ( IsLower<MT5>::value )
4242 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin plus the largest even count not exceeding jnum (mask with -2).
4246 const size_t jnum( jend - jbegin );
4247 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4249 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4250 (~C)(i,j ) -= A(i,i) * B(i,j );
4251 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Leftover element when jnum is odd (presumably guarded by a jpos < jend check - TODO confirm).
4254 (~C)(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel C -= A * B for a DenseMatrix<MT3,true> target,
// enabled when A is diagonal and B is not diagonal.
// Only the diagonal element A(i,i) is read, so each update is C(i,j) -= A(i,i) * B(i,j).
// The j/i space is traversed in square tiles of BLOCK_SIZE; within a tile the i range is
// narrowed further by the triangular structure of B.
// NOTE(review): the fallback branches of the ibegin/ipos ternaries (presumably ii and
// iend) and the braces are on lines not visible in this excerpt.
4275 template<
typename MT3
4278 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4279 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4281 const size_t M( A.rows() );
4282 const size_t N( B.columns() );
4284 const size_t block( BLOCK_SIZE );
4286 for(
size_t jj=0UL; jj<N; jj+=block ) {
4287 const size_t jend(
min( N, jj+block ) );
4288 for(
size_t ii=0UL; ii<M; ii+=block ) {
4289 const size_t iend(
min( M, ii+block ) );
4290 for(
size_t j=jj; j<jend; ++j )
// Row range inside the current tile where B(i,j) can be non-zero.
4292 const size_t ibegin( ( IsLower<MT5>::value )
4293 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4295 const size_t ipos( ( IsUpper<MT5>::value )
4296 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4299 for(
size_t i=ibegin; i<ipos; ++i ) {
4300 (~C)(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel C -= A * B, enabled when both A and B are diagonal.
// Only diagonal elements are touched: C(i,i) -= A(i,i) * B(i,i) for every row index i of A.
// NOTE(review): the remaining template parameters and the closing braces are on lines not
// visible in this excerpt.
4323 template<
typename MT3
4326 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4327 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4329 for(
size_t i=0UL; i<A.rows(); ++i ) {
4330 C(i,i) -= A(i,i) * B(i,i);
// Fallback overload of the small-matrix subtraction-assignment kernel.
// It is selected through DisableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not
// hold, and forwards C, A and B to selectDefaultSubAssignKernel().
// NOTE(review): the remaining template parameters and the braces are on lines that are not
// visible in this excerpt.
4350 template<
typename MT3
4353 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4354 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4356 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel (C -= A * B) for a target of type
// DenseMatrix<MT3,false>, enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// Columns of C are processed in strips of decreasing width (8, 4, 2, 1 SIMD vectors, then
// scalar columns); within a strip the rows are handled two at a time where possible.
// Each step loads the current values of C, subtracts set( A(i,k) ) * B.load(k,...) over
// the relevant k range, and stores the result back.
// NOTE(review): the j-loop headers, the declarations of i/j, the fallback branches of some
// kend ternaries and all braces are on lines not visible in this excerpt; the per-section
// notes below are inferred from the visible loads/stores.
4376 template<
typename MT3
4379 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4380 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4382 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product dimensions.
4384 const size_t M( A.rows() );
4385 const size_t N( B.columns() );
4386 const size_t K( A.columns() );
// Scalar remainder handling is needed unless both C and B are padded.
4388 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the vectorized column range: N masked with -IT::size when a remainder is possible
// (rounds down to a multiple of IT::size, assuming IT::size is a power of two).
4390 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// Strip of eight SIMD vectors (columns j .. j+IT::size*8), one row per iteration.
4396 for(
size_t i=0UL; i<M; ++i )
// The k range is narrowed by the triangular structure of A (upper/lower) and of B
// (lower/upper); for an upper B it is also capped at the end of the column strip.
4398 const size_t kbegin( ( IsUpper<MT4>::value )
4399 ?( ( IsLower<MT5>::value )
4400 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4401 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4402 :( IsLower<MT5>::value ? j : 0UL ) );
4403 const size_t kend( ( IsLower<MT4>::value )
4404 ?( ( IsUpper<MT5>::value )
4405 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
4406 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4407 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
4409 IntrinsicType xmm1( (~C).load(i,j ) );
4410 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
4411 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
4412 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
4413 IntrinsicType xmm5( (~C).load(i,j+
IT::size*4UL) );
4414 IntrinsicType xmm6( (~C).load(i,j+
IT::size*5UL) );
4415 IntrinsicType xmm7( (~C).load(i,j+
IT::size*6UL) );
4416 IntrinsicType xmm8( (~C).load(i,j+
IT::size*7UL) );
4418 for(
size_t k=kbegin; k<kend; ++k ) {
4419 const IntrinsicType a1(
set( A(i,k) ) );
4420 xmm1 = xmm1 - a1 * B.load(k,j );
4421 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
4422 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
4423 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
4424 xmm5 = xmm5 - a1 * B.load(k,j+
IT::size*4UL);
4425 xmm6 = xmm6 - a1 * B.load(k,j+
IT::size*5UL);
4426 xmm7 = xmm7 - a1 * B.load(k,j+
IT::size*6UL);
4427 xmm8 = xmm8 - a1 * B.load(k,j+
IT::size*7UL);
4430 (~C).store( i, j , xmm1 );
4431 (~C).store( i, j+
IT::size , xmm2 );
4432 (~C).store( i, j+
IT::size*2UL, xmm3 );
4433 (~C).store( i, j+
IT::size*3UL, xmm4 );
4434 (~C).store( i, j+
IT::size*4UL, xmm5 );
4435 (~C).store( i, j+
IT::size*5UL, xmm6 );
4436 (~C).store( i, j+
IT::size*6UL, xmm7 );
4437 (~C).store( i, j+
IT::size*7UL, xmm8 );
// Strip of four SIMD vectors, two rows (i, i+1) per iteration.
4445 for( ; (i+2UL) <= M; i+=2UL )
4447 const size_t kbegin( ( IsUpper<MT4>::value )
4448 ?( ( IsLower<MT5>::value )
4449 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4450 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4451 :( IsLower<MT5>::value ? j : 0UL ) );
4452 const size_t kend( ( IsLower<MT4>::value )
4453 ?( ( IsUpper<MT5>::value )
4454 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
4455 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4456 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
4458 IntrinsicType xmm1( (~C).load(i ,j ) );
4459 IntrinsicType xmm2( (~C).load(i ,j+
IT::size ) );
4460 IntrinsicType xmm3( (~C).load(i ,j+
IT::size*2UL) );
4461 IntrinsicType xmm4( (~C).load(i ,j+
IT::size*3UL) );
4462 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
4463 IntrinsicType xmm6( (~C).load(i+1UL,j+
IT::size ) );
4464 IntrinsicType xmm7( (~C).load(i+1UL,j+
IT::size*2UL) );
4465 IntrinsicType xmm8( (~C).load(i+1UL,j+
IT::size*3UL) );
4467 for(
size_t k=kbegin; k<kend; ++k ) {
4468 const IntrinsicType a1(
set( A(i ,k) ) );
4469 const IntrinsicType a2(
set( A(i+1UL,k) ) );
4470 const IntrinsicType b1( B.load(k,j ) );
4471 const IntrinsicType b2( B.load(k,j+
IT::size ) );
4472 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
4473 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
4474 xmm1 = xmm1 - a1 * b1;
4475 xmm2 = xmm2 - a1 * b2;
4476 xmm3 = xmm3 - a1 * b3;
4477 xmm4 = xmm4 - a1 * b4;
4478 xmm5 = xmm5 - a2 * b1;
4479 xmm6 = xmm6 - a2 * b2;
4480 xmm7 = xmm7 - a2 * b3;
4481 xmm8 = xmm8 - a2 * b4;
4484 (~C).store( i , j , xmm1 );
4485 (~C).store( i , j+
IT::size , xmm2 );
4486 (~C).store( i , j+
IT::size*2UL, xmm3 );
4487 (~C).store( i , j+
IT::size*3UL, xmm4 );
4488 (~C).store( i+1UL, j , xmm5 );
4489 (~C).store( i+1UL, j+
IT::size , xmm6 );
4490 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 );
4491 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 );
// Leftover single row of the four-vector strip (guarding condition not visible here).
// Here kend depends only on B's structure, not on A's.
4496 const size_t kbegin( ( IsUpper<MT4>::value )
4497 ?( ( IsLower<MT5>::value )
4498 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4499 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4500 :( IsLower<MT5>::value ? j : 0UL ) );
4501 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
4503 IntrinsicType xmm1( (~C).load(i,j ) );
4504 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
4505 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
4506 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
4508 for(
size_t k=kbegin; k<kend; ++k ) {
4509 const IntrinsicType a1(
set( A(i,k) ) );
4510 xmm1 = xmm1 - a1 * B.load(k,j );
4511 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
4512 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
4513 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
4516 (~C).store( i, j , xmm1 );
4517 (~C).store( i, j+
IT::size , xmm2 );
4518 (~C).store( i, j+
IT::size*2UL, xmm3 );
4519 (~C).store( i, j+
IT::size*3UL, xmm4 );
// Strip of two SIMD vectors, two rows per iteration.
4527 for( ; (i+2UL) <= M; i+=2UL )
4529 const size_t kbegin( ( IsUpper<MT4>::value )
4530 ?( ( IsLower<MT5>::value )
4531 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4532 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4533 :( IsLower<MT5>::value ? j : 0UL ) );
4534 const size_t kend( ( IsLower<MT4>::value )
4535 ?( ( IsUpper<MT5>::value )
4536 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
4537 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4538 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
4540 IntrinsicType xmm1( (~C).load(i ,j ) );
4541 IntrinsicType xmm2( (~C).load(i ,j+
IT::size) );
4542 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
4543 IntrinsicType xmm4( (~C).load(i+1UL,j+
IT::size) );
4545 for(
size_t k=kbegin; k<kend; ++k ) {
4546 const IntrinsicType a1(
set( A(i ,k) ) );
4547 const IntrinsicType a2(
set( A(i+1UL,k) ) );
4548 const IntrinsicType b1( B.load(k,j ) );
4549 const IntrinsicType b2( B.load(k,j+
IT::size) );
4550 xmm1 = xmm1 - a1 * b1;
4551 xmm2 = xmm2 - a1 * b2;
4552 xmm3 = xmm3 - a2 * b1;
4553 xmm4 = xmm4 - a2 * b2;
4556 (~C).store( i , j , xmm1 );
4557 (~C).store( i , j+
IT::size, xmm2 );
4558 (~C).store( i+1UL, j , xmm3 );
4559 (~C).store( i+1UL, j+
IT::size, xmm4 );
// Leftover single row of the two-vector strip (guarding condition not visible here).
4564 const size_t kbegin( ( IsUpper<MT4>::value )
4565 ?( ( IsLower<MT5>::value )
4566 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4567 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4568 :( IsLower<MT5>::value ? j : 0UL ) );
4569 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
4571 IntrinsicType xmm1( (~C).load(i,j ) );
4572 IntrinsicType xmm2( (~C).load(i,j+
IT::size) );
4574 for(
size_t k=kbegin; k<kend; ++k ) {
4575 const IntrinsicType a1(
set( A(i,k) ) );
4576 xmm1 = xmm1 - a1 * B.load(k,j );
4577 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size);
// NOTE(review): only the store of xmm1 is visible here; the matching store of xmm2 to
// column j+IT::size is not in this excerpt - confirm it is present in the full file.
4580 (~C).store( i, j , xmm1 );
// Single SIMD vector column, two rows per iteration.
4589 for( ; (i+2UL) <= M; i+=2UL )
4591 const size_t kbegin( ( IsUpper<MT4>::value )
4592 ?( ( IsLower<MT5>::value )
4593 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4594 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4595 :( IsLower<MT5>::value ? j : 0UL ) );
4596 const size_t kend( ( IsLower<MT4>::value )
4597 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4600 IntrinsicType xmm1( (~C).load(i ,j) );
4601 IntrinsicType xmm2( (~C).load(i+1UL,j) );
4603 for(
size_t k=kbegin; k<kend; ++k ) {
4604 const IntrinsicType b1( B.load(k,j) );
4605 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
4606 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
4609 (~C).store( i , j, xmm1 );
4610 (~C).store( i+1UL, j, xmm2 );
// Leftover single row of the one-vector column; k runs up to K.
4615 const size_t kbegin( ( IsUpper<MT4>::value )
4616 ?( ( IsLower<MT5>::value )
4617 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4618 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4619 :( IsLower<MT5>::value ? j : 0UL ) );
4621 IntrinsicType xmm1( (~C).load(i,j) );
4623 for(
size_t k=kbegin; k<K; ++k ) {
4624 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
4627 (~C).store( i, j, xmm1 );
// Scalar tail: remaining columns up to N are processed element by element, only when
// 'remainder' is set; rows are still taken two at a time.
4631 for( ; remainder && j<N; ++j )
4635 for( ; (i+2UL) <= M; i+=2UL )
4637 const size_t kbegin( ( IsUpper<MT4>::value )
4638 ?( ( IsLower<MT5>::value )
4639 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4640 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4641 :( IsLower<MT5>::value ? j : 0UL ) );
4642 const size_t kend( ( IsLower<MT4>::value )
4643 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4646 ElementType value1( (~C)(i ,j) );
4647 ElementType value2( (~C)(i+1UL,j) );
4649 for(
size_t k=kbegin; k<kend; ++k ) {
4650 value1 -= A(i ,k) * B(k,j);
4651 value2 -= A(i+1UL,k) * B(k,j);
4654 (~C)(i ,j) = value1;
4655 (~C)(i+1UL,j) = value2;
// Leftover single row of the scalar tail; k runs up to K.
4660 const size_t kbegin( ( IsUpper<MT4>::value )
4661 ?( ( IsLower<MT5>::value )
4662 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4663 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4664 :( IsLower<MT5>::value ? j : 0UL ) );
4666 ElementType value( (~C)(i,j) );
4668 for(
size_t k=kbegin; k<K; ++k ) {
4669 value -= A(i,k) * B(k,j);
// Vectorized "small" subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>). It is enabled only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds and updates C with C - A*B using
// IntrinsicType registers: C and A are accessed with vector load/store, B is
// read element-wise and broadcast with set().
// NOTE(review): this listing omits several lines of the definition (remaining
// template parameters, braces, some loop headers); the comments below describe
// only the statements that are visible.
4694 template<
typename MT3
4697 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4698 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4700 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the contraction length is K.
4702 const size_t M( A.rows() );
4703 const size_t N( B.columns() );
4704 const size_t K( A.columns() );
// A scalar remainder pass is needed unless both vector-accessed operands (C and A) are padded.
4706 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Row limit for the vectorized passes: M masked with size_t(-IT::size) when a
// remainder pass exists (rounds down to a multiple of IT::size if IT::size is a
// power of two), otherwise M itself.
4708 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// Tile of eight IntrinsicType vectors (rows i .. i+IT::size*7) for every column j.
4714 for(
size_t j=0UL; j<N; ++j )
// Restrict the k range according to the triangular structure of A (MT4) and B (MT5).
4716 const size_t kbegin( ( IsLower<MT5>::value )
4717 ?( ( IsUpper<MT4>::value )
4718 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4719 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4720 :( IsUpper<MT4>::value ? i : 0UL ) );
4721 const size_t kend( ( IsUpper<MT5>::value )
4722 ?( ( IsLower<MT4>::value )
4723 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4724 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4725 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Load the current values of C so that the products can be subtracted in registers.
4727 IntrinsicType xmm1( (~C).load(i ,j) );
4728 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
4729 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
4730 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
4731 IntrinsicType xmm5( (~C).load(i+
IT::size*4UL,j) );
4732 IntrinsicType xmm6( (~C).load(i+
IT::size*5UL,j) );
4733 IntrinsicType xmm7( (~C).load(i+
IT::size*6UL,j) );
4734 IntrinsicType xmm8( (~C).load(i+
IT::size*7UL,j) );
// For each k: broadcast B(k,j) once and subtract its product with eight vectors of column k of A.
4736 for(
size_t k=kbegin; k<kend; ++k ) {
4737 const IntrinsicType b1(
set( B(k,j) ) );
4738 xmm1 = xmm1 - A.load(i ,k) * b1;
4739 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
4740 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
4741 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
4742 xmm5 = xmm5 - A.load(i+
IT::size*4UL,k) * b1;
4743 xmm6 = xmm6 - A.load(i+
IT::size*5UL,k) * b1;
4744 xmm7 = xmm7 - A.load(i+
IT::size*6UL,k) * b1;
4745 xmm8 = xmm8 - A.load(i+
IT::size*7UL,k) * b1;
// Write all eight accumulators back to C.
4748 (~C).store( i , j, xmm1 );
4749 (~C).store( i+
IT::size , j, xmm2 );
4750 (~C).store( i+
IT::size*2UL, j, xmm3 );
4751 (~C).store( i+
IT::size*3UL, j, xmm4 );
4752 (~C).store( i+
IT::size*4UL, j, xmm5 );
4753 (~C).store( i+
IT::size*5UL, j, xmm6 );
4754 (~C).store( i+
IT::size*6UL, j, xmm7 );
4755 (~C).store( i+
IT::size*7UL, j, xmm8 );
// Tile of four IntrinsicType vectors (rows i .. i+IT::size*3) by two columns (j, j+1).
// NOTE(review): the enclosing row loop is on a line not shown in this listing.
4763 for( ; (j+2UL) <= N; j+=2UL )
// k range limited by the triangular structure of A (MT4) and B (MT5); the upper
// bound accounts for the two columns handled at once (j+1UL / j+2UL).
4765 const size_t kbegin( ( IsLower<MT5>::value )
4766 ?( ( IsUpper<MT4>::value )
4767 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4768 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4769 :( IsUpper<MT4>::value ? i : 0UL ) );
4770 const size_t kend( ( IsUpper<MT5>::value )
4771 ?( ( IsLower<MT4>::value )
4772 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4773 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4774 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 hold column j, xmm5..xmm8 hold column j+1 of the C tile.
4776 IntrinsicType xmm1( (~C).load(i ,j ) );
4777 IntrinsicType xmm2( (~C).load(i+
IT::size ,j ) );
4778 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j ) );
4779 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j ) );
4780 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
4781 IntrinsicType xmm6( (~C).load(i+
IT::size ,j+1UL) );
4782 IntrinsicType xmm7( (~C).load(i+
IT::size*2UL,j+1UL) );
4783 IntrinsicType xmm8( (~C).load(i+
IT::size*3UL,j+1UL) );
// Each A vector is loaded once per k and reused for both broadcast B values.
4785 for(
size_t k=kbegin; k<kend; ++k ) {
4786 const IntrinsicType a1( A.load(i ,k) );
4787 const IntrinsicType a2( A.load(i+
IT::size ,k) );
4788 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
4789 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
4790 const IntrinsicType b1(
set( B(k,j ) ) );
4791 const IntrinsicType b2(
set( B(k,j+1UL) ) );
4792 xmm1 = xmm1 - a1 * b1;
4793 xmm2 = xmm2 - a2 * b1;
4794 xmm3 = xmm3 - a3 * b1;
4795 xmm4 = xmm4 - a4 * b1;
4796 xmm5 = xmm5 - a1 * b2;
4797 xmm6 = xmm6 - a2 * b2;
4798 xmm7 = xmm7 - a3 * b2;
4799 xmm8 = xmm8 - a4 * b2;
// Write the 4x2 tile of accumulators back to C.
4802 (~C).store( i , j , xmm1 );
4803 (~C).store( i+
IT::size , j , xmm2 );
4804 (~C).store( i+
IT::size*2UL, j , xmm3 );
4805 (~C).store( i+
IT::size*3UL, j , xmm4 );
4806 (~C).store( i , j+1UL, xmm5 );
4807 (~C).store( i+
IT::size , j+1UL, xmm6 );
4808 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 );
4809 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 );
// Single remaining column j for the four-vector row tile.
// NOTE(review): the guard that selects this case is not visible in this listing.
4814 const size_t kbegin( ( IsLower<MT5>::value )
4815 ?( ( IsUpper<MT4>::value )
4816 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4817 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4818 :( IsUpper<MT4>::value ? i : 0UL ) );
// Here only the lower structure of A limits the upper k bound.
4819 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
4821 IntrinsicType xmm1( (~C).load(i ,j) );
4822 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
4823 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
4824 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
4826 for(
size_t k=kbegin; k<kend; ++k ) {
4827 const IntrinsicType b1(
set( B(k,j) ) );
4828 xmm1 = xmm1 - A.load(i ,k) * b1;
4829 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
4830 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
4831 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
// Write the four accumulators back to column j of C.
4834 (~C).store( i , j, xmm1 );
4835 (~C).store( i+
IT::size , j, xmm2 );
4836 (~C).store( i+
IT::size*2UL, j, xmm3 );
4837 (~C).store( i+
IT::size*3UL, j, xmm4 );
// Tile of two IntrinsicType vectors (rows i and i+IT::size) by two columns (j, j+1).
// NOTE(review): the enclosing row loop is on a line not shown in this listing.
4845 for( ; (j+2UL) <= N; j+=2UL )
// k range limited by the triangular structure of A (MT4) and B (MT5).
4847 const size_t kbegin( ( IsLower<MT5>::value )
4848 ?( ( IsUpper<MT4>::value )
4849 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4850 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4851 :( IsUpper<MT4>::value ? i : 0UL ) );
4852 const size_t kend( ( IsUpper<MT5>::value )
4853 ?( ( IsLower<MT4>::value )
4854 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4855 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4856 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 hold column j, xmm3/xmm4 hold column j+1 of the C tile.
4858 IntrinsicType xmm1( (~C).load(i ,j ) );
4859 IntrinsicType xmm2( (~C).load(i+
IT::size,j ) );
4860 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
4861 IntrinsicType xmm4( (~C).load(i+
IT::size,j+1UL) );
// Two A vectors and two broadcast B values per k feed four accumulators.
4863 for(
size_t k=kbegin; k<kend; ++k ) {
4864 const IntrinsicType a1( A.load(i ,k) );
4865 const IntrinsicType a2( A.load(i+
IT::size,k) );
4866 const IntrinsicType b1(
set( B(k,j ) ) );
4867 const IntrinsicType b2(
set( B(k,j+1UL) ) );
4868 xmm1 = xmm1 - a1 * b1;
4869 xmm2 = xmm2 - a2 * b1;
4870 xmm3 = xmm3 - a1 * b2;
4871 xmm4 = xmm4 - a2 * b2;
// Write the 2x2 tile of accumulators back to C.
4874 (~C).store( i , j , xmm1 );
4875 (~C).store( i+
IT::size, j , xmm2 );
4876 (~C).store( i , j+1UL, xmm3 );
4877 (~C).store( i+
IT::size, j+1UL, xmm4 );
// Single remaining column j for the tile of two IntrinsicType vectors
// (rows i and i+IT::size).
// NOTE(review): the guard that selects this case is not visible in this listing.
// Lower k bound from the triangular structure of A (MT4) and B (MT5).
4882 const size_t kbegin( ( IsLower<MT5>::value )
4883 ?( ( IsUpper<MT4>::value )
4884 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4885 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4886 :( IsUpper<MT4>::value ? i : 0UL ) );
// Only the lower structure of A limits the upper k bound here.
4887 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
// Load both vectors of column j of C.
4889 IntrinsicType xmm1( (~C).load(i ,j) );
4890 IntrinsicType xmm2( (~C).load(i+
IT::size,j) );
// Broadcast B(k,j) and subtract its product with two vectors of column k of A.
4892 for(
size_t k=kbegin; k<kend; ++k ) {
4893 const IntrinsicType b1(
set( B(k,j) ) );
4894 xmm1 = xmm1 - A.load(i ,k) * b1;
4895 xmm2 = xmm2 - A.load(i+
IT::size,k) * b1;
// Write back BOTH accumulators. The store of xmm2 was missing, so the update
// of rows i+IT::size .. i+IT::size*2-1 in this column was computed and then
// discarded; every other tile of this kernel stores all of its accumulators.
4898 (~C).store( i , j, xmm1 );
4899 (~C).store( i+
IT::size, j, xmm2 );
// Tile of one IntrinsicType vector (rows starting at i) by two columns (j, j+1).
// NOTE(review): the enclosing row loop is on a line not shown in this listing.
4907 for( ; (j+2UL) <= N; j+=2UL )
4909 const size_t kbegin( ( IsLower<MT5>::value )
4910 ?( ( IsUpper<MT4>::value )
4911 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4912 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4913 :( IsUpper<MT4>::value ? i : 0UL ) );
// Upper k bound from the upper structure of B; the alternative branch is not shown.
4914 const size_t kend( ( IsUpper<MT5>::value )
4915 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4918 IntrinsicType xmm1( (~C).load(i,j ) );
4919 IntrinsicType xmm2( (~C).load(i,j+1UL) );
// One A vector per k is combined with two broadcast B values.
4921 for(
size_t k=kbegin; k<kend; ++k ) {
4922 const IntrinsicType a1( A.load(i,k) );
4923 xmm1 = xmm1 - a1 *
set( B(k,j ) );
4924 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
4927 (~C).store( i, j , xmm1 );
4928 (~C).store( i, j+1UL, xmm2 );
// Single remaining column for the one-vector row tile; k runs up to K.
4933 const size_t kbegin( ( IsLower<MT5>::value )
4934 ?( ( IsUpper<MT4>::value )
4935 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4936 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4937 :( IsUpper<MT4>::value ? i : 0UL ) );
4939 IntrinsicType xmm1( (~C).load(i,j) );
4941 for(
size_t k=kbegin; k<K; ++k ) {
4942 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
4945 (~C).store( i, j, xmm1 );
// Scalar remainder pass: runs only when `remainder` is set and handles the
// rows i < M left over by the vectorized passes, element by element.
4949 for( ; remainder && i<M; ++i )
// Two columns at a time.
4953 for( ; (j+2UL) <= N; j+=2UL )
4955 const size_t kbegin( ( IsLower<MT5>::value )
4956 ?( ( IsUpper<MT4>::value )
4957 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4958 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4959 :( IsUpper<MT4>::value ? i : 0UL ) );
4960 const size_t kend( ( IsUpper<MT5>::value )
4961 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
// Accumulate in local scalars and write the results back afterwards.
4964 ElementType value1( (~C)(i,j ) );
4965 ElementType value2( (~C)(i,j+1UL) );
4967 for(
size_t k=kbegin; k<kend; ++k ) {
4968 value1 -= A(i,k) * B(k,j );
4969 value2 -= A(i,k) * B(k,j+1UL);
4972 (~C)(i,j ) = value1;
4973 (~C)(i,j+1UL) = value2;
// Last remaining column of the scalar pass.
// NOTE(review): the write-back of `value` is not visible in this listing.
4978 const size_t kbegin( ( IsLower<MT5>::value )
4979 ?( ( IsUpper<MT4>::value )
4980 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4981 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4982 :( IsUpper<MT4>::value ? i : 0UL ) );
4984 ElementType value( (~C)(i,j) );
4986 for(
size_t k=kbegin; k<K; ++k ) {
4987 value -= A(i,k) * B(k,j);
// Fallback "large" subtraction-assignment kernel: chosen when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf) and simply
// forwards to selectDefaultSubAssignKernel with the same arguments.
// NOTE(review): the remaining template parameters are on lines not shown here.
5011 template<
typename MT3
5014 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5015 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5017 selectDefaultSubAssignKernel( C, A, B );
// Vectorized "large" subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>). The iteration space is blocked over jj / ii / kk
// using the DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE constants; inside a block,
// C and B are accessed with vector load/store and A is read element-wise and
// broadcast with set().
// NOTE(review): this listing omits several lines (remaining template
// parameters, braces, the column-panel loops and the definitions of i, j, j1,
// j2, j3); comments describe only what is visible.
5037 template<
typename MT3
5040 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5041 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
5043 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the contraction length is K.
5045 const size_t M( A.rows() );
5046 const size_t N( B.columns() );
5047 const size_t K( A.columns() );
// A scalar remainder pass is needed unless both vector-accessed operands (C and B) are padded.
5049 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Outer blocking over columns.
5051 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
5053 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Column limit for the vectorized passes of this block (masked with size_t(-IT::size) when a remainder pass exists).
5055 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Blocking over rows.
5058 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
5060 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Blocking over the contraction index; ktmp is the end of the current k block.
5062 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
5064 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Tile of two rows (i, i+1) by four IntrinsicType vectors (columns j, j1, j2, j3).
// NOTE(review): j1..j3 are defined on lines not shown; presumably j plus multiples of IT::size.
5076 for( ; (i+2UL) <= iend; i+=2UL )
// Clamp the k range to the current block and to the triangular structure of A (MT4) and B (MT5).
5078 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5079 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5080 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5081 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// xmm1..xmm4 hold row i, xmm5..xmm8 hold row i+1 of the C tile.
5083 IntrinsicType xmm1( (~C).load(i ,j ) );
5084 IntrinsicType xmm2( (~C).load(i ,j1) );
5085 IntrinsicType xmm3( (~C).load(i ,j2) );
5086 IntrinsicType xmm4( (~C).load(i ,j3) );
5087 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
5088 IntrinsicType xmm6( (~C).load(i+1UL,j1) );
5089 IntrinsicType xmm7( (~C).load(i+1UL,j2) );
5090 IntrinsicType xmm8( (~C).load(i+1UL,j3) );
// Two broadcast A values and four B vectors per k feed eight accumulators.
5092 for(
size_t k=kbegin; k<kend; ++k ) {
5093 const IntrinsicType a1(
set( A(i ,k) ) );
5094 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5095 const IntrinsicType b1( B.load(k,j ) );
5096 const IntrinsicType b2( B.load(k,j1) );
5097 const IntrinsicType b3( B.load(k,j2) );
5098 const IntrinsicType b4( B.load(k,j3) );
5099 xmm1 = xmm1 - a1 * b1;
5100 xmm2 = xmm2 - a1 * b2;
5101 xmm3 = xmm3 - a1 * b3;
5102 xmm4 = xmm4 - a1 * b4;
5103 xmm5 = xmm5 - a2 * b1;
5104 xmm6 = xmm6 - a2 * b2;
5105 xmm7 = xmm7 - a2 * b3;
5106 xmm8 = xmm8 - a2 * b4;
// Write the 2x4 tile of accumulators back to C.
5109 (~C).store( i , j , xmm1 );
5110 (~C).store( i , j1, xmm2 );
5111 (~C).store( i , j2, xmm3 );
5112 (~C).store( i , j3, xmm4 );
5113 (~C).store( i+1UL, j , xmm5 );
5114 (~C).store( i+1UL, j1, xmm6 );
5115 (~C).store( i+1UL, j2, xmm7 );
5116 (~C).store( i+1UL, j3, xmm8 );
// Single remaining row i for the four-vector column panel.
// NOTE(review): the guard that selects this case is not visible in this listing.
5121 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5122 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5123 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5124 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
5126 IntrinsicType xmm1( (~C).load(i,j ) );
5127 IntrinsicType xmm2( (~C).load(i,j1) );
5128 IntrinsicType xmm3( (~C).load(i,j2) );
5129 IntrinsicType xmm4( (~C).load(i,j3) );
5131 for(
size_t k=kbegin; k<kend; ++k ) {
5132 const IntrinsicType a1(
set( A(i,k) ) );
5133 xmm1 = xmm1 - a1 * B.load(k,j );
5134 xmm2 = xmm2 - a1 * B.load(k,j1);
5135 xmm3 = xmm3 - a1 * B.load(k,j2);
5136 xmm4 = xmm4 - a1 * B.load(k,j3);
5139 (~C).store( i, j , xmm1 );
5140 (~C).store( i, j1, xmm2 );
5141 (~C).store( i, j2, xmm3 );
5142 (~C).store( i, j3, xmm4 );
// Column panel of two IntrinsicType vectors (columns j and j1).
// NOTE(review): the enclosing column loop and the definition of j1 are on lines not shown here.
// Tile of four rows (i .. i+3) by two vectors.
5152 for( ; (i+4UL) <= iend; i+=4UL )
// Clamp the k range to the current block and to the triangular structure of A (MT4) and B (MT5).
5154 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5155 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5156 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5157 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
// Two accumulators per row: (row, j) and (row, j1).
5159 IntrinsicType xmm1( (~C).load(i ,j ) );
5160 IntrinsicType xmm2( (~C).load(i ,j1) );
5161 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
5162 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
5163 IntrinsicType xmm5( (~C).load(i+2UL,j ) );
5164 IntrinsicType xmm6( (~C).load(i+2UL,j1) );
5165 IntrinsicType xmm7( (~C).load(i+3UL,j ) );
5166 IntrinsicType xmm8( (~C).load(i+3UL,j1) );
// Four broadcast A values and two B vectors per k feed eight accumulators.
5168 for(
size_t k=kbegin; k<kend; ++k ) {
5169 const IntrinsicType a1(
set( A(i ,k) ) );
5170 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5171 const IntrinsicType a3(
set( A(i+2UL,k) ) );
5172 const IntrinsicType a4(
set( A(i+3UL,k) ) );
5173 const IntrinsicType b1( B.load(k,j ) );
5174 const IntrinsicType b2( B.load(k,j1) );
5175 xmm1 = xmm1 - a1 * b1;
5176 xmm2 = xmm2 - a1 * b2;
5177 xmm3 = xmm3 - a2 * b1;
5178 xmm4 = xmm4 - a2 * b2;
5179 xmm5 = xmm5 - a3 * b1;
5180 xmm6 = xmm6 - a3 * b2;
5181 xmm7 = xmm7 - a4 * b1;
5182 xmm8 = xmm8 - a4 * b2;
// Write the 4x2 tile of accumulators back to C.
5185 (~C).store( i , j , xmm1 );
5186 (~C).store( i , j1, xmm2 );
5187 (~C).store( i+1UL, j , xmm3 );
5188 (~C).store( i+1UL, j1, xmm4 );
5189 (~C).store( i+2UL, j , xmm5 );
5190 (~C).store( i+2UL, j1, xmm6 );
5191 (~C).store( i+3UL, j , xmm7 );
5192 (~C).store( i+3UL, j1, xmm8 );
// Tile of two rows (i, i+1) by two vectors.
5195 for( ; (i+2UL) <= iend; i+=2UL )
5197 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5198 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5199 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5200 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5202 IntrinsicType xmm1( (~C).load(i ,j ) );
5203 IntrinsicType xmm2( (~C).load(i ,j1) );
5204 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
5205 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
5207 for(
size_t k=kbegin; k<kend; ++k ) {
5208 const IntrinsicType a1(
set( A(i ,k) ) );
5209 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5210 const IntrinsicType b1( B.load(k,j ) );
5211 const IntrinsicType b2( B.load(k,j1) );
5212 xmm1 = xmm1 - a1 * b1;
5213 xmm2 = xmm2 - a1 * b2;
5214 xmm3 = xmm3 - a2 * b1;
5215 xmm4 = xmm4 - a2 * b2;
5218 (~C).store( i , j , xmm1 );
5219 (~C).store( i , j1, xmm2 );
5220 (~C).store( i+1UL, j , xmm3 );
5221 (~C).store( i+1UL, j1, xmm4 );
// Single remaining row i for the two-vector column panel.
// NOTE(review): the guard that selects this case is not visible in this listing.
5226 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5227 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5228 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5229 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5231 IntrinsicType xmm1( (~C).load(i,j ) );
5232 IntrinsicType xmm2( (~C).load(i,j1) );
5234 for(
size_t k=kbegin; k<kend; ++k ) {
5235 const IntrinsicType a1(
set( A(i,k) ) );
5236 xmm1 = xmm1 - a1 * B.load(k,j );
5237 xmm2 = xmm2 - a1 * B.load(k,j1);
5240 (~C).store( i, j , xmm1 );
5241 (~C).store( i, j1, xmm2 );
// Column panel of a single IntrinsicType vector: every row of the current row block is processed.
// NOTE(review): the enclosing column loop is on a line not shown in this listing.
5247 for(
size_t i=ii; i<iend; ++i )
// Clamp the k range to the current block and to the triangular structure of A (MT4) and B (MT5).
5249 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5250 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5251 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5252 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
5254 IntrinsicType xmm1( (~C).load(i,j) );
// Broadcast A(i,k) and subtract its product with one vector of row k of B.
5256 for(
size_t k=kbegin; k<kend; ++k ) {
5257 const IntrinsicType a1(
set( A(i,k) ) );
5258 xmm1 = xmm1 - a1 * B.load(k,j);
5261 (~C).store( i, j, xmm1 );
// Scalar remainder pass: runs only when `remainder` is set and handles the
// columns j < jend left over by the vectorized panels, element by element.
5265 for( ; remainder && j<jend; ++j )
5267 for(
size_t i=ii; i<iend; ++i )
5269 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5270 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5271 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5272 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Accumulate in a local scalar.
// NOTE(review): the write-back of `value` is not visible in this listing.
5274 ElementType value( (~C)(i,j) );
5276 for(
size_t k=kbegin; k<kend; ++k ) {
5277 value -= A(i,k) * B(k,j);
// Vectorized "large" subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>). The iteration space is blocked over ii / jj / kk
// using the TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants; inside a block,
// C and A are accessed with vector load/store and B is read element-wise and
// broadcast with set().
// NOTE(review): this listing omits several lines (remaining template
// parameters, braces, the row-panel loops and the definitions of i, j, i1,
// i2, i3); comments describe only what is visible.
5305 template<
typename MT3
5308 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5309 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
5311 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the contraction length is K.
5313 const size_t M( A.rows() );
5314 const size_t N( B.columns() );
5315 const size_t K( A.columns() );
// A scalar remainder pass is needed unless both vector-accessed operands are
// padded. In this kernel those are C and A (A.load); B is only read
// element-wise, so its padding is irrelevant. The check previously tested MT5
// (B), which would skip the remainder pass for an unpadded A paired with a
// padded B and let A.load run past A's rows. The column-major small kernel
// tests MT4 for the same access pattern.
5317 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Outer blocking over rows.
5319 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
5321 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Row limit for the vectorized passes of this block (masked with size_t(-IT::size) when a remainder pass exists).
5323 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Blocking over columns.
5326 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
5328 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Blocking over the contraction index; ktmp is the end of the current k block.
5330 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
5332 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Row panel of four IntrinsicType vectors (rows i, i1, i2, i3).
// NOTE(review): the enclosing row loop and the definitions of i1..i3 are on
// lines not shown; presumably i plus multiples of IT::size.
// Tile of four vectors by two columns (j, j+1).
5344 for( ; (j+2UL) <= jend; j+=2UL )
// Clamp the k range to the current block and to the triangular structure of A (MT4) and B (MT5).
5346 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5347 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5348 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5349 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// xmm1..xmm4 hold column j, xmm5..xmm8 hold column j+1 of the C tile.
5351 IntrinsicType xmm1( (~C).load(i ,j ) );
5352 IntrinsicType xmm2( (~C).load(i1,j ) );
5353 IntrinsicType xmm3( (~C).load(i2,j ) );
5354 IntrinsicType xmm4( (~C).load(i3,j ) );
5355 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
5356 IntrinsicType xmm6( (~C).load(i1,j+1UL) );
5357 IntrinsicType xmm7( (~C).load(i2,j+1UL) );
5358 IntrinsicType xmm8( (~C).load(i3,j+1UL) );
// Four A vectors and two broadcast B values per k feed eight accumulators.
5360 for(
size_t k=kbegin; k<kend; ++k ) {
5361 const IntrinsicType a1( A.load(i ,k) );
5362 const IntrinsicType a2( A.load(i1,k) );
5363 const IntrinsicType a3( A.load(i2,k) );
5364 const IntrinsicType a4( A.load(i3,k) );
5365 const IntrinsicType b1(
set( B(k,j ) ) );
5366 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5367 xmm1 = xmm1 - a1 * b1;
5368 xmm2 = xmm2 - a2 * b1;
5369 xmm3 = xmm3 - a3 * b1;
5370 xmm4 = xmm4 - a4 * b1;
5371 xmm5 = xmm5 - a1 * b2;
5372 xmm6 = xmm6 - a2 * b2;
5373 xmm7 = xmm7 - a3 * b2;
5374 xmm8 = xmm8 - a4 * b2;
// Write the 4x2 tile of accumulators back to C.
5377 (~C).store( i , j , xmm1 );
5378 (~C).store( i1, j , xmm2 );
5379 (~C).store( i2, j , xmm3 );
5380 (~C).store( i3, j , xmm4 );
5381 (~C).store( i , j+1UL, xmm5 );
5382 (~C).store( i1, j+1UL, xmm6 );
5383 (~C).store( i2, j+1UL, xmm7 );
5384 (~C).store( i3, j+1UL, xmm8 );
// Single remaining column j for the four-vector row panel.
// NOTE(review): the guard that selects this case is not visible in this listing.
5389 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5390 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5391 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5392 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5394 IntrinsicType xmm1( (~C).load(i ,j) );
5395 IntrinsicType xmm2( (~C).load(i1,j) );
5396 IntrinsicType xmm3( (~C).load(i2,j) );
5397 IntrinsicType xmm4( (~C).load(i3,j) );
5399 for(
size_t k=kbegin; k<kend; ++k ) {
5400 const IntrinsicType b1(
set( B(k,j) ) );
5401 xmm1 = xmm1 - A.load(i ,k) * b1;
5402 xmm2 = xmm2 - A.load(i1,k) * b1;
5403 xmm3 = xmm3 - A.load(i2,k) * b1;
5404 xmm4 = xmm4 - A.load(i3,k) * b1;
5407 (~C).store( i , j, xmm1 );
5408 (~C).store( i1, j, xmm2 );
5409 (~C).store( i2, j, xmm3 );
5410 (~C).store( i3, j, xmm4 );
// Row panel of two IntrinsicType vectors (rows i and i1).
// NOTE(review): the enclosing row loop and the definition of i1 are on lines not shown here.
// Tile of two vectors by four columns (j .. j+3).
5420 for( ; (j+4UL) <= jend; j+=4UL )
// Clamp the k range to the current block and to the triangular structure of A (MT4) and B (MT5).
5422 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5423 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5424 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5425 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column: (i, col) and (i1, col).
5427 IntrinsicType xmm1( (~C).load(i ,j ) );
5428 IntrinsicType xmm2( (~C).load(i1,j ) );
5429 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
5430 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
5431 IntrinsicType xmm5( (~C).load(i ,j+2UL) );
5432 IntrinsicType xmm6( (~C).load(i1,j+2UL) );
5433 IntrinsicType xmm7( (~C).load(i ,j+3UL) );
5434 IntrinsicType xmm8( (~C).load(i1,j+3UL) );
// Two A vectors and four broadcast B values per k feed eight accumulators.
5436 for(
size_t k=kbegin; k<kend; ++k ) {
5437 const IntrinsicType a1( A.load(i ,k) );
5438 const IntrinsicType a2( A.load(i1,k) );
5439 const IntrinsicType b1(
set( B(k,j ) ) );
5440 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5441 const IntrinsicType b3(
set( B(k,j+2UL) ) );
5442 const IntrinsicType b4(
set( B(k,j+3UL) ) );
5443 xmm1 = xmm1 - a1 * b1;
5444 xmm2 = xmm2 - a2 * b1;
5445 xmm3 = xmm3 - a1 * b2;
5446 xmm4 = xmm4 - a2 * b2;
5447 xmm5 = xmm5 - a1 * b3;
5448 xmm6 = xmm6 - a2 * b3;
5449 xmm7 = xmm7 - a1 * b4;
5450 xmm8 = xmm8 - a2 * b4;
// Write the 2x4 tile of accumulators back to C.
5453 (~C).store( i , j , xmm1 );
5454 (~C).store( i1, j , xmm2 );
5455 (~C).store( i , j+1UL, xmm3 );
5456 (~C).store( i1, j+1UL, xmm4 );
5457 (~C).store( i , j+2UL, xmm5 );
5458 (~C).store( i1, j+2UL, xmm6 );
5459 (~C).store( i , j+3UL, xmm7 );
5460 (~C).store( i1, j+3UL, xmm8 );
// Tile of two vectors by two columns (j, j+1).
5463 for( ; (j+2UL) <= jend; j+=2UL )
5465 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5466 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5467 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5468 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5470 IntrinsicType xmm1( (~C).load(i ,j ) );
5471 IntrinsicType xmm2( (~C).load(i1,j ) );
5472 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
5473 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
5475 for(
size_t k=kbegin; k<kend; ++k ) {
5476 const IntrinsicType a1( A.load(i ,k) );
5477 const IntrinsicType a2( A.load(i1,k) );
5478 const IntrinsicType b1(
set( B(k,j ) ) );
5479 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5480 xmm1 = xmm1 - a1 * b1;
5481 xmm2 = xmm2 - a2 * b1;
5482 xmm3 = xmm3 - a1 * b2;
5483 xmm4 = xmm4 - a2 * b2;
5486 (~C).store( i , j , xmm1 );
5487 (~C).store( i1, j , xmm2 );
5488 (~C).store( i , j+1UL, xmm3 );
5489 (~C).store( i1, j+1UL, xmm4 );
// Single remaining column j for the two-vector row panel.
// NOTE(review): the guard that selects this case is not visible in this listing.
5494 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5495 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5496 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5497 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5499 IntrinsicType xmm1( (~C).load(i ,j) );
5500 IntrinsicType xmm2( (~C).load(i1,j) );
5502 for(
size_t k=kbegin; k<kend; ++k ) {
5503 const IntrinsicType b1(
set( B(k,j) ) );
5504 xmm1 = xmm1 - A.load(i ,k) * b1;
5505 xmm2 = xmm2 - A.load(i1,k) * b1;
5508 (~C).store( i , j, xmm1 );
5509 (~C).store( i1, j, xmm2 );
// Row panel of a single IntrinsicType vector: every column of the current column block is processed.
// NOTE(review): the enclosing row loop is on a line not shown in this listing.
5515 for(
size_t j=jj; j<jend; ++j )
// Clamp the k range to the current block and to the triangular structure of A (MT4) and B (MT5).
5517 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5518 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5519 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
5520 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5522 IntrinsicType xmm1( (~C).load(i,j) );
// Broadcast B(k,j) and subtract its product with one vector of column k of A.
5524 for(
size_t k=kbegin; k<kend; ++k ) {
5525 const IntrinsicType b1(
set( B(k,j) ) );
5526 xmm1 = xmm1 - A.load(i,k) * b1;
5529 (~C).store( i, j, xmm1 );
// Scalar remainder pass: runs only when `remainder` is set and handles the
// rows i < iend left over by the vectorized panels, element by element.
5533 for( ; remainder && i<iend; ++i )
5535 for(
size_t j=jj; j<jend; ++j )
5537 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5538 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5539 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
5540 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
// Accumulate in a local scalar.
// NOTE(review): the write-back of `value` is not visible in this listing.
5542 ElementType value( (~C)(i,j) );
5544 for(
size_t k=kbegin; k<kend; ++k ) {
5545 value -= A(i,k) * B(k,j);
// Fallback for the BLAS-based subtraction-assignment kernel: chosen when
// UseBlasKernel<MT3,MT4,MT5> does NOT hold (DisableIf) and forwards to
// selectLargeSubAssignKernel with the same arguments.
// NOTE(review): the remaining template parameters are on lines not shown here.
5572 template<
typename MT3
5575 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
5576 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5578 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction-assignment kernel: chosen when
// UseBlasKernel<MT3,MT4,MT5> holds (EnableIf).
// NOTE(review): the declarations of `tmp` and `ET` are on lines not shown in
// this listing; only the dispatch below is visible.
5598 template<
typename MT3
5601 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
5602 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular A: trmm is applied to tmp from the left with scaling ET(1), then
// the result is subtracted from C.
5606 if( IsTriangular<MT4>::value ) {
5608 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5609 subAssign( C, tmp );
// Triangular B: same scheme with trmm applied from the right.
5611 else if( IsTriangular<MT5>::value ) {
5613 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5614 subAssign( C, tmp );
// General case: a single gemm call with factors ET(-1) and ET(1).
5617 gemm( C, A, B, ET(-1), ET(1) );
// Four friend function templates that are enabled only when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): their names and parameter lists are on lines not shown in this
// listing (presumably the SMP assignment family - confirm against the full file).
// First fragment: separate early-out checks for an empty target (zero rows or
// columns) and for a zero-length contraction dimension (rhs.lhs_.columns()).
5652 template<
typename MT
5654 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5662 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
5665 else if( rhs.lhs_.columns() == 0UL ) {
// Second fragment: evaluates rhs into a temporary whose type is selected by
// the storage-order flag SO between ResultType and OppositeType.
5701 template<
typename MT
5703 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5708 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
5720 const TmpType tmp( rhs );
// Third fragment: single combined check for an empty target or a zero-length contraction dimension.
5742 template<
typename MT
5744 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5752 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Fourth fragment: same combined emptiness check as the third.
5791 template<
typename MT
5793 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
5801 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Specialization of DMatScalarMultExpr for a scaled TDMatDMatMultExpr<MT1,MT2>
// (a matrix product multiplied by a scalar of type ST). It derives from
// DenseMatrix with storage flag `true` and privately from MatScalarMultExpr
// and Computation.
// NOTE(review): the remaining template parameters and the class keyword line
// are not shown in this listing.
5861 template<
typename MT1
5865 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
5866 ,
private MatScalarMultExpr
5867 ,
private Computation
// Short alias for the wrapped matrix product expression.
5871 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// Compile-time flags: an operand must be evaluated first if it is itself a
// computation or reports RequiresEvaluation.
5883 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
5888 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Helper trait: true if either operand needs an intermediate evaluation. The
// template parameters T1..T3 do not enter the computed value.
5896 template<
typename T1,
typename T2,
typename T3 >
5897 struct IsEvaluationRequired {
5898 enum { value = ( evaluateLeft || evaluateRight ) };
// Helper trait deciding whether the BLAS kernel can be used for target type
// T1, operand types T2/T3 and scalar type T4.
// NOTE(review): the opening of the enum (and its first condition, if any) is
// on a line not shown in this listing.
// Visible conditions: mutable data access on the target and const data access
// on both operands, no diagonal operand, all three types vectorizable, all
// element types BLAS compatible and identical, and not the combination of a
// builtin element type with a complex scalar.
5906 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5907 struct UseBlasKernel {
5909 HasMutableDataAccess<T1>::value &&
5910 HasConstDataAccess<T2>::value &&
5911 HasConstDataAccess<T3>::value &&
5912 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5913 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5914 IsBlasCompatible<typename T1::ElementType>::value &&
5915 IsBlasCompatible<typename T2::ElementType>::value &&
5916 IsBlasCompatible<typename T3::ElementType>::value &&
5917 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
5918 IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
5919 !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
// Helper trait deciding whether the vectorized default kernels can be used for
// target type T1, operand types T2/T3 and scalar type T4.
// NOTE(review): the opening of the enum (and its first condition, if any) is
// on a line not shown in this listing.
// Visible conditions: not both operands diagonal, no diagonal left operand with
// a column-major target, no diagonal right operand with a row-major target,
// all three types vectorizable, identical element types (including the scalar
// type T4), and intrinsic support for addition, subtraction and multiplication.
5927 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5928 struct UseVectorizedDefaultKernel {
5930 !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
5931 !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
5932 !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
5933 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5934 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
5935 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
5936 IsSame<typename T1::ElementType,T4>::value &&
5937 IntrinsicTrait<typename T1::ElementType>::addition &&
5938 IntrinsicTrait<typename T1::ElementType>::subtraction &&
5939 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type definitions of the scaled-product expression.
// NOTE(review): several typedefs between these lines (e.g. RES, ElementType,
// RT1/RT2, CT1/CT2, ET1/ET2, RightOperand) are not shown in this listing.
// Type of this expression instance.
5945 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type of the whole expression: product result type RES combined with the scalar type ST.
5946 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with the element type.
5950 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the (const) matrix product.
5955 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Types used for the operands of the product inside the kernels: an evaluated
// result type if the operand needs evaluation, otherwise the composite type.
5961 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
5964 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: requires that not both operands are diagonal, both are
// vectorizable, both element types and the scalar type are identical, and
// intrinsic addition and multiplication are available.
5969 enum { vectorizable = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
5970 MT1::vectorizable && MT2::vectorizable &&
5971 IsSame<ET1,ET2>::value &&
5972 IsSame<ET1,ST>::value &&
5973 IntrinsicTrait<ET1>::addition &&
5974 IntrinsicTrait<ET1>::multiplication };
// SMP flag: set only if neither operand needs evaluation and both are SMP assignable.
5977 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
5978 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix product expression and the scalar factor.
// NOTE(review): the member initializer list and body are not shown in this listing.
5987 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns element (i,j) of the wrapped matrix product
// multiplied by the scalar.
//   i - row index, j - column index.
// The return type is ReturnType, the same type at() uses when it forwards to
// this operator. It was previously declared as ResultType, which is
// MultTrait<RES,ST>::Type, i.e. the result type of the whole expression rather
// than of one scaled element.
// NOTE(review): any index checks and the closing brace are on lines not shown
// in this listing.
6000 inline ReturnType
operator()(
size_t i,
size_t j )
const {
6003 return matrix_(i,j) * scalar_;
// Checked element access: compares i against matrix_.rows() and j against
// matrix_.columns() before forwarding to operator()(i,j).
// NOTE(review): the bodies of the two range checks are not shown in this
// listing (presumably they raise an out-of-range error - confirm).
6015 inline ReturnType
at(
size_t i,
size_t j )
const {
6016 if( i >= matrix_.rows() ) {
6019 if( j >= matrix_.columns() ) {
6022 return (*
this)(i,j);
// Number of rows of the expression: forwarded from the wrapped matrix product.
6031 inline size_t rows()
const {
6032 return matrix_.rows();
// Number of columns of the expression: forwarded from the wrapped matrix product.
6041 inline size_t columns()
const {
6042 return matrix_.columns();
// Aliasing query: forwards the given address to matrix_.canAlias().
6072 template<
typename T >
6073 inline bool canAlias(
const T* alias )
const {
6074 return matrix_.canAlias( alias );
// Aliasing query: forwards the given address to matrix_.isAliased().
6084 template<
typename T >
6085 inline bool isAliased(
const T* alias )
const {
6086 return matrix_.isAliased( alias );
// Alignment query: forwarded from the wrapped matrix product.
// NOTE(review): the enclosing function header is not shown in this listing.
6096 return matrix_.isAligned();
// SMP assignment query.
// NOTE(review): the function header and the first part of the return
// expression are on lines not shown in this listing.
// The visible part requires rows()*columns() to be below TDMATDMATMULT_THRESHOLD
// (within a parenthesized alternative whose other branch is not shown) and the
// right operand of the product to have more than SMP_TDMATDMATMULT_THRESHOLD columns.
6106 typename MMM::RightOperand B( matrix_.rightOperand() );
6108 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
6109 ( B.columns() > SMP_TDMATDMATMULT_THRESHOLD );
// Data members: the wrapped matrix product and the scalar factor.
6115 LeftOperand matrix_;
6116 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix.
// NOTE(review): the definitions of A and B, the early-return bodies and the
// braces are on lines not shown in this listing.
6131 template<
typename MT
6133 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped product.
6140 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6141 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special cases: empty target, or a zero-length contraction dimension.
6143 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
6146 else if( left.columns() == 0UL ) {
// Dispatch to the kernel selection with the operands A, B and the scalar.
6161 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled assignment C = scalar * (A * B).
// The small kernel is used when both operands are diagonal or when the target
// has fewer than TDMATDMATMULT_THRESHOLD elements; otherwise the BLAS kernel
// selection is called.
// NOTE(review): the remaining template parameters and the `else` line are not
// shown in this listing.
6176 template<
typename MT3
6180 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6182 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
6183 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
6184 selectSmallAssignKernel( C, A, B, scalar );
6186 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) scaled assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when neither A (MT4) nor B (MT5) is diagonal.
// For every row i it assigns the first product term, accumulates the remaining
// terms and finally scales the row by `scalar`.
// NOTE(review): this listing omits several lines (alternative branches of the
// conditional bounds, bodies of the reset loops, braces); comments describe
// only what is visible.
6204 template<
typename MT3
6208 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6209 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the contraction length is K.
6211 const size_t M( A.rows() );
6212 const size_t N( B.columns() );
6213 const size_t K( A.columns() );
6215 for(
size_t i=0UL; i<M; ++i )
// k range of row i, derived from the triangular structure of A.
6217 const size_t kbegin( ( IsUpper<MT4>::value )
6218 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6220 const size_t kend( ( IsLower<MT4>::value )
6221 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k range: the whole row is visited.
// NOTE(review): the loop body is not shown (presumably the elements are reset).
6225 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
6226 for(
size_t j=0UL; j<N; ++j ) {
// First term k = kbegin: column range from the triangular structure of B.
6233 const size_t jbegin( ( IsUpper<MT5>::value )
6234 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
6236 const size_t jend( ( IsLower<MT5>::value )
6237 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
// Columns in front of jbegin (loop body not shown), or only the first element for a strictly upper B.
6241 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6242 for(
size_t j=0UL; j<jbegin; ++j ) {
6246 else if( IsStrictlyUpper<MT5>::value ) {
6247 reset( (~C)(i,0UL) );
// Plain assignment of the first product term.
6249 for(
size_t j=jbegin; j<jend; ++j ) {
6250 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards (loop body not shown), or only the last element for a strictly lower B.
6252 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6253 for(
size_t j=jend; j<N; ++j ) {
6257 else if( IsStrictlyLower<MT5>::value ) {
6258 reset( (~C)(i,N-1UL) );
// Remaining terms k = kbegin+1 .. kend-1 are accumulated with +=.
6262 for(
size_t k=kbegin+1UL; k<kend; ++k )
6264 const size_t jbegin( ( IsUpper<MT5>::value )
6265 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
6267 const size_t jend( ( IsLower<MT5>::value )
6268 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
6272 for(
size_t j=jbegin; j<jend; ++j ) {
6273 (~C)(i,j) += A(i,k) * B(k,j);
// For a lower B the element at column jend is assigned (not accumulated) for this k.
6275 if( IsLower<MT5>::value ) {
6276 (~C)(i,jend) = A(i,k) * B(k,jend);
// Final pass: scale the column range of row i selected by the combined
// structure of A and B.
6281 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6282 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
6284 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
6285 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
6289 for(
size_t j=jbegin; j<jend; ++j ) {
6290 (~C)(i,j) *= scalar;
// Default (non-vectorized) scaled assignment kernel for a column-major target
// (DenseMatrix<MT3,true>), enabled when neither A (MT4) nor B (MT5) is diagonal.
// For every column j it assigns the first product term, accumulates the
// remaining terms and finally scales the column by `scalar`.
// NOTE(review): this listing omits several lines (alternative branches of the
// conditional bounds, bodies of the reset loops, braces); comments describe
// only what is visible.
6311 template<
typename MT3
6315 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6316 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the contraction length is K.
6318 const size_t M( A.rows() );
6319 const size_t N( B.columns() );
6320 const size_t K( A.columns() );
6322 for(
size_t j=0UL; j<N; ++j )
// k range of column j, derived from the triangular structure of B.
6324 const size_t kbegin( ( IsLower<MT5>::value )
6325 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6327 const size_t kend( ( IsUpper<MT5>::value )
6328 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Strictly triangular B with an empty k range: the whole column is visited.
// NOTE(review): the loop body is not shown (presumably the elements are reset).
6332 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6333 for(
size_t i=0UL; i<M; ++i ) {
// First term k = kbegin: row range from the triangular structure of A.
6340 const size_t ibegin( ( IsLower<MT4>::value )
6341 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
6343 const size_t iend( ( IsUpper<MT4>::value )
6344 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
// Rows in front of ibegin (loop body not shown), or only the first element for a strictly lower A.
6348 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6349 for(
size_t i=0UL; i<ibegin; ++i ) {
6353 else if( IsStrictlyLower<MT4>::value ) {
6354 reset( (~C)(0UL,j) );
// Plain assignment of the first product term.
6356 for(
size_t i=ibegin; i<iend; ++i ) {
6357 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend onwards (loop body not shown), or only the last element for a strictly upper A.
6359 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6360 for(
size_t i=iend; i<M; ++i ) {
6364 else if( IsStrictlyUpper<MT4>::value ) {
6365 reset( (~C)(M-1UL,j) );
// Remaining terms k = kbegin+1 .. kend-1 are accumulated with +=.
6369 for(
size_t k=kbegin+1UL; k<kend; ++k )
6371 const size_t ibegin( ( IsLower<MT4>::value )
6372 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
6374 const size_t iend( ( IsUpper<MT4>::value )
6375 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
6379 for(
size_t i=ibegin; i<iend; ++i ) {
6380 (~C)(i,j) += A(i,k) * B(k,j);
// For an upper A the element at row iend is assigned (not accumulated) for this k.
6382 if( IsUpper<MT4>::value ) {
6383 (~C)(iend,j) = A(iend,k) * B(k,j);
// Final pass: scale the row range of column j selected by the combined
// structure of A and B.
6388 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
6389 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
6391 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6392 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
6396 for(
size_t i=ibegin; i<iend; ++i ) {
6397 (~C)(i,j) *= scalar;
// Default assignment kernel for a row-major target (DenseMatrix<MT3,false>)
// when the right operand MT5 is diagonal and the left operand MT4 is not.
// Each computed element is A(i,j) * B(j,j) * scalar; the target is walked in
// square tiles of BLOCK_SIZE rows and columns.
6418 template<
typename MT3
6422 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6423 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6425 const size_t M( A.rows() );
6426 const size_t N( B.columns() );
6428 const size_t block( BLOCK_SIZE );
// Tile loops: ii/jj are the tile origins, iend/jend the clamped tile ends.
6430 for(
size_t ii=0UL; ii<M; ii+=block ) {
6431 const size_t iend(
min( M, ii+block ) );
6432 for(
size_t jj=0UL; jj<N; jj+=block ) {
6433 const size_t jend(
min( N, jj+block ) );
6434 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the tile, restricted by the traits of MT4.
6436 const size_t jbegin( ( IsUpper<MT4>::value )
6437 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6439 const size_t jpos( ( IsLower<MT4>::value )
6440 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
// Tile columns in front of jbegin (upper MT4 only).
6443 if( IsUpper<MT4>::value ) {
6444 for(
size_t j=jj; j<jbegin; ++j ) {
// Only the diagonal element B(j,j) of the right operand is read.
6448 for(
size_t j=jbegin; j<jpos; ++j ) {
6449 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Tile columns from jpos onwards (lower MT4 only).
6451 if( IsLower<MT4>::value ) {
6452 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel for a column-major target (DenseMatrix<MT3,true>)
// when the right operand MT5 is diagonal and the left operand MT4 is not.
// Each computed element is A(i,j) * B(j,j) * scalar.
6476 template<
typename MT3
6480 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6481 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6483 const size_t M( A.rows() );
6484 const size_t N( B.columns() );
// The target is processed one column at a time.
6486 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, restricted by the lower/upper traits of MT4.
6488 const size_t ibegin( ( IsLower<MT4>::value )
6489 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6491 const size_t iend( ( IsUpper<MT4>::value )
6492 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows in front of ibegin (lower MT4 only).
6496 if( IsLower<MT4>::value ) {
6497 for(
size_t i=0UL; i<ibegin; ++i ) {
// Only the diagonal element B(j,j) of the right operand is read.
6501 for(
size_t i=ibegin; i<iend; ++i ) {
6502 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend onwards (upper MT4 only).
6504 if( IsUpper<MT4>::value ) {
6505 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for a row-major target (DenseMatrix<MT3,false>)
// when the left operand MT4 is diagonal and the right operand MT5 is not.
// Each computed element is A(i,i) * B(i,j) * scalar.
6527 template<
typename MT3
6531 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6532 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6534 const size_t M( A.rows() );
6535 const size_t N( B.columns() );
// The target is processed one row at a time.
6537 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, restricted by the upper/lower traits of MT5.
6539 const size_t jbegin( ( IsUpper<MT5>::value )
6540 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6542 const size_t jend( ( IsLower<MT5>::value )
6543 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// Columns in front of jbegin (upper MT5 only).
6547 if( IsUpper<MT5>::value ) {
6548 for(
size_t j=0UL; j<jbegin; ++j ) {
// Only the diagonal element A(i,i) of the left operand is read.
6552 for(
size_t j=jbegin; j<jend; ++j ) {
6553 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Columns from jend onwards (lower MT5 only).
6555 if( IsLower<MT5>::value ) {
6556 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for a column-major target (DenseMatrix<MT3,true>)
// when the left operand MT4 is diagonal and the right operand MT5 is not.
// Each computed element is A(i,i) * B(i,j) * scalar; the target is walked in
// square tiles of BLOCK_SIZE columns and rows.
6578 template<
typename MT3
6582 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6583 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6585 const size_t M( A.rows() );
6586 const size_t N( B.columns() );
6588 const size_t block( BLOCK_SIZE );
// Tile loops: jj/ii are the tile origins, jend/iend the clamped tile ends.
6590 for(
size_t jj=0UL; jj<N; jj+=block ) {
6591 const size_t jend(
min( N, jj+block ) );
6592 for(
size_t ii=0UL; ii<M; ii+=block ) {
6593 const size_t iend(
min( M, ii+block ) );
6594 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the tile, restricted by the traits of MT5.
6596 const size_t ibegin( ( IsLower<MT5>::value )
6597 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6599 const size_t ipos( ( IsUpper<MT5>::value )
6600 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
// Tile rows in front of ibegin (lower MT5 only).
6603 if( IsLower<MT5>::value ) {
6604 for(
size_t i=ii; i<ibegin; ++i ) {
// Only the diagonal element A(i,i) of the left operand is read.
6608 for(
size_t i=ibegin; i<ipos; ++i ) {
6609 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Tile rows from ipos onwards (upper MT5 only).
6611 if( IsUpper<MT5>::value ) {
6612 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel for the case that both operands are diagonal.
// The target is taken as a plain MT3& (no storage-order specific overload)
// and only its diagonal elements are written.
6636 template<
typename MT3
6640 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6641 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// One scaled product of the two diagonal entries per row of A.
6645 for(
size_t i=0UL; i<A.rows(); ++i ) {
6646 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel used when UseVectorizedDefaultKernel does not
// hold for <MT3,MT4,MT5,ST2>: the work is forwarded unchanged to
// selectDefaultAssignKernel().
6665 template<
typename MT3
6669 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6670 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6672 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel holds.
// Products are accumulated in IntrinsicType registers: elements of A are
// passed through set(), B is read with load(), and every accumulator is
// multiplied by the scalar factor before it is stored into C. The tiles
// shrink from eight intrinsic vectors per row down to a single vector, and
// a plain scalar loop handles the trailing columns.
6691 template<
typename MT3
6695 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6696 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6698 typedef IntrinsicTrait<ElementType> IT;
6700 const size_t M( A.rows() );
6701 const size_t N( B.columns() );
6702 const size_t K( A.columns() );
// The scalar clean-up pass over trailing columns is required unless both MT3
// and MT5 are padded.
6704 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the vectorized column range: N with the bits below IT::size masked
// off when a remainder pass is needed (i.e. rounded down to a multiple of
// IT::size, assuming IT::size is a power of two), otherwise N itself.
6706 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// The scalar converted to an IntrinsicType by set(); applied to every
// accumulator right before the store.
6709 const IntrinsicType factor(
set( scalar ) );
// Tile: one row, eight intrinsic vectors (column offsets j .. j+IT::size*7).
6714 for(
size_t i=0UL; i<M; ++i )
// k-range restricted by the triangular traits of MT4 (row i) and of MT5
// (columns covered by the tile).
6716 const size_t kbegin( ( IsUpper<MT4>::value )
6717 ?( ( IsLower<MT5>::value )
6718 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6719 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6720 :( IsLower<MT5>::value ? j : 0UL ) );
6721 const size_t kend( ( IsLower<MT4>::value )
6722 ?( ( IsUpper<MT5>::value )
6723 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
6724 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6725 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
6727 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6729 for(
size_t k=kbegin; k<kend; ++k ) {
6730 const IntrinsicType a1(
set( A(i,k) ) );
6731 xmm1 = xmm1 + a1 * B.load(k,j );
6732 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
6733 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
6734 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
6735 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
6736 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
6737 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
6738 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Scale and write back the eight accumulators of row i.
6741 (~C).store( i, j , xmm1 * factor );
6742 (~C).store( i, j+
IT::size , xmm2 * factor );
6743 (~C).store( i, j+
IT::size*2UL, xmm3 * factor );
6744 (~C).store( i, j+
IT::size*3UL, xmm4 * factor );
6745 (~C).store( i, j+
IT::size*4UL, xmm5 * factor );
6746 (~C).store( i, j+
IT::size*5UL, xmm6 * factor );
6747 (~C).store( i, j+
IT::size*6UL, xmm7 * factor );
6748 (~C).store( i, j+
IT::size*7UL, xmm8 * factor );
// Tile: two rows (i, i+1), four intrinsic vectors per row.
6756 for( ; (i+2UL) <= M; i+=2UL )
6758 const size_t kbegin( ( IsUpper<MT4>::value )
6759 ?( ( IsLower<MT5>::value )
6760 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6761 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6762 :( IsLower<MT5>::value ? j : 0UL ) );
6763 const size_t kend( ( IsLower<MT4>::value )
6764 ?( ( IsUpper<MT5>::value )
6765 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
6766 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6767 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
6769 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6771 for(
size_t k=kbegin; k<kend; ++k ) {
6772 const IntrinsicType a1(
set( A(i ,k) ) );
6773 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6774 const IntrinsicType b1( B.load(k,j ) );
6775 const IntrinsicType b2( B.load(k,j+
IT::size ) );
6776 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
6777 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
6778 xmm1 = xmm1 + a1 * b1;
6779 xmm2 = xmm2 + a1 * b2;
6780 xmm3 = xmm3 + a1 * b3;
6781 xmm4 = xmm4 + a1 * b4;
6782 xmm5 = xmm5 + a2 * b1;
6783 xmm6 = xmm6 + a2 * b2;
6784 xmm7 = xmm7 + a2 * b3;
6785 xmm8 = xmm8 + a2 * b4;
6788 (~C).store( i , j , xmm1 * factor );
6789 (~C).store( i , j+
IT::size , xmm2 * factor );
6790 (~C).store( i , j+
IT::size*2UL, xmm3 * factor );
6791 (~C).store( i , j+
IT::size*3UL, xmm4 * factor );
6792 (~C).store( i+1UL, j , xmm5 * factor );
6793 (~C).store( i+1UL, j+
IT::size , xmm6 * factor );
6794 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 * factor );
6795 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 * factor );
// Tile: a single row, four intrinsic vectors.
6800 const size_t kbegin( ( IsUpper<MT4>::value )
6801 ?( ( IsLower<MT5>::value )
6802 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6803 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6804 :( IsLower<MT5>::value ? j : 0UL ) );
6805 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
6807 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6809 for(
size_t k=kbegin; k<kend; ++k ) {
6810 const IntrinsicType a1(
set( A(i,k) ) );
6811 xmm1 = xmm1 + a1 * B.load(k,j );
6812 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
6813 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
6814 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
6817 (~C).store( i, j , xmm1 * factor );
6818 (~C).store( i, j+
IT::size , xmm2 * factor );
6819 (~C).store( i, j+
IT::size*2UL, xmm3 * factor );
6820 (~C).store( i, j+
IT::size*3UL, xmm4 * factor );
// Tile: two rows (i, i+1), two intrinsic vectors per row.
6828 for( ; (i+2UL) <= M; i+=2UL )
6830 const size_t kbegin( ( IsUpper<MT4>::value )
6831 ?( ( IsLower<MT5>::value )
6832 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6833 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6834 :( IsLower<MT5>::value ? j : 0UL ) );
6835 const size_t kend( ( IsLower<MT4>::value )
6836 ?( ( IsUpper<MT5>::value )
6837 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
6838 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6839 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
6841 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6843 for(
size_t k=kbegin; k<kend; ++k ) {
6844 const IntrinsicType a1(
set( A(i ,k) ) );
6845 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6846 const IntrinsicType b1( B.load(k,j ) );
6847 const IntrinsicType b2( B.load(k,j+
IT::size) );
6848 xmm1 = xmm1 + a1 * b1;
6849 xmm2 = xmm2 + a1 * b2;
6850 xmm3 = xmm3 + a2 * b1;
6851 xmm4 = xmm4 + a2 * b2;
6854 (~C).store( i , j , xmm1 * factor );
6855 (~C).store( i , j+
IT::size, xmm2 * factor );
6856 (~C).store( i+1UL, j , xmm3 * factor );
6857 (~C).store( i+1UL, j+
IT::size, xmm4 * factor );
// Tile: a single row, two intrinsic vectors.
6862 const size_t kbegin( ( IsUpper<MT4>::value )
6863 ?( ( IsLower<MT5>::value )
6864 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6865 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6866 :( IsLower<MT5>::value ? j : 0UL ) );
6867 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
6869 IntrinsicType xmm1, xmm2;
6871 for(
size_t k=kbegin; k<kend; ++k ) {
6872 const IntrinsicType a1(
set( A(i,k) ) );
6873 xmm1 = xmm1 + a1 * B.load(k,j );
6874 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
6877 (~C).store( i, j , xmm1 * factor );
6878 (~C).store( i, j+
IT::size, xmm2 * factor );
// Tile: two rows (i, i+1), one intrinsic vector per row.
6886 for( ; (i+2UL) <= M; i+=2UL )
6888 const size_t kbegin( ( IsUpper<MT4>::value )
6889 ?( ( IsLower<MT5>::value )
6890 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6891 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6892 :( IsLower<MT5>::value ? j : 0UL ) );
6893 const size_t kend( ( IsLower<MT4>::value )
6894 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6897 IntrinsicType xmm1, xmm2;
// The vector loaded from B is shared by both rows.
6899 for(
size_t k=kbegin; k<kend; ++k ) {
6900 const IntrinsicType b1( B.load(k,j) );
6901 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
6902 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
6905 (~C).store( i , j, xmm1 * factor );
6906 (~C).store( i+1UL, j, xmm2 * factor );
// Tile: a single row, one intrinsic vector; k runs up to K.
6911 const size_t kbegin( ( IsUpper<MT4>::value )
6912 ?( ( IsLower<MT5>::value )
6913 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6914 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6915 :( IsLower<MT5>::value ? j : 0UL ) );
6919 for(
size_t k=kbegin; k<K; ++k ) {
6920 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
6923 (~C).store( i, j, xmm1 * factor );
// Scalar (non-vectorized) pass over the remaining columns; it only runs when
// remainder is set.
6927 for( ; remainder && j<N; ++j )
// Two rows at a time with plain scalar accumulators.
6931 for( ; (i+2UL) <= M; i+=2UL )
6933 const size_t kbegin( ( IsUpper<MT4>::value )
6934 ?( ( IsLower<MT5>::value )
6935 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6936 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6937 :( IsLower<MT5>::value ? j : 0UL ) );
6938 const size_t kend( ( IsLower<MT4>::value )
6939 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6945 for(
size_t k=kbegin; k<kend; ++k ) {
6946 value1 += A(i ,k) * B(k,j);
6947 value2 += A(i+1UL,k) * B(k,j);
6950 (~C)(i ,j) = value1 * scalar;
6951 (~C)(i+1UL,j) = value2 * scalar;
// A single row of the remainder column.
6956 const size_t kbegin( ( IsUpper<MT4>::value )
6957 ?( ( IsLower<MT5>::value )
6958 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6959 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6960 :( IsLower<MT5>::value ? j : 0UL ) );
6964 for(
size_t k=kbegin; k<K; ++k ) {
6965 value += A(i,k) * B(k,j);
6968 (~C)(i,j) = value * scalar;
// Vectorized small-matrix assignment kernel for a column-major target
// (DenseMatrix<MT3,true>), enabled when UseVectorizedDefaultKernel holds.
// Products are accumulated in IntrinsicType registers: A is read with
// load(), elements of B are passed through set(), and every accumulator is
// multiplied by the scalar factor before it is stored into C. The tiles
// shrink from eight intrinsic vectors per column down to a single vector,
// and a plain scalar loop handles the trailing rows.
6989 template<
typename MT3
6993 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6994 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6996 typedef IntrinsicTrait<ElementType> IT;
6998 const size_t M( A.rows() );
6999 const size_t N( B.columns() );
7000 const size_t K( A.columns() );
// The scalar clean-up pass over trailing rows is required unless both MT3 and
// MT4 are padded.
7002 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// End of the vectorized row range: M with the bits below IT::size masked off
// when a remainder pass is needed (i.e. rounded down to a multiple of
// IT::size, assuming IT::size is a power of two), otherwise M itself.
7004 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// The scalar converted to an IntrinsicType by set(); applied to every
// accumulator right before the store.
7007 const IntrinsicType factor(
set( scalar ) );
// Tile: one column, eight intrinsic vectors (row offsets i .. i+IT::size*7).
7012 for(
size_t j=0UL; j<N; ++j )
// k-range restricted by the triangular traits of MT5 (column j) and of MT4
// (rows covered by the tile).
7014 const size_t kbegin( ( IsLower<MT5>::value )
7015 ?( ( IsUpper<MT4>::value )
7016 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7017 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7018 :( IsUpper<MT4>::value ? i : 0UL ) );
7019 const size_t kend( ( IsUpper<MT5>::value )
7020 ?( ( IsLower<MT4>::value )
7021 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7022 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7023 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
7025 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7027 for(
size_t k=kbegin; k<kend; ++k ) {
7028 const IntrinsicType b1(
set( B(k,j) ) );
7029 xmm1 = xmm1 + A.load(i ,k) * b1;
7030 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
7031 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
7032 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
7033 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
7034 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
7035 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
7036 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Scale and write back the eight accumulators of column j.
7039 (~C).store( i , j, xmm1 * factor );
7040 (~C).store( i+
IT::size , j, xmm2 * factor );
7041 (~C).store( i+
IT::size*2UL, j, xmm3 * factor );
7042 (~C).store( i+
IT::size*3UL, j, xmm4 * factor );
7043 (~C).store( i+
IT::size*4UL, j, xmm5 * factor );
7044 (~C).store( i+
IT::size*5UL, j, xmm6 * factor );
7045 (~C).store( i+
IT::size*6UL, j, xmm7 * factor );
7046 (~C).store( i+
IT::size*7UL, j, xmm8 * factor );
// Tile: two columns (j, j+1), four intrinsic vectors per column.
7054 for( ; (j+2UL) <= N; j+=2UL )
7056 const size_t kbegin( ( IsLower<MT5>::value )
7057 ?( ( IsUpper<MT4>::value )
7058 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7059 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7060 :( IsUpper<MT4>::value ? i : 0UL ) );
7061 const size_t kend( ( IsUpper<MT5>::value )
7062 ?( ( IsLower<MT4>::value )
7063 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7064 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7065 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
7067 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7069 for(
size_t k=kbegin; k<kend; ++k ) {
7070 const IntrinsicType a1( A.load(i ,k) );
7071 const IntrinsicType a2( A.load(i+
IT::size ,k) );
7072 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
7073 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
7074 const IntrinsicType b1(
set( B(k,j ) ) );
7075 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7076 xmm1 = xmm1 + a1 * b1;
7077 xmm2 = xmm2 + a2 * b1;
7078 xmm3 = xmm3 + a3 * b1;
7079 xmm4 = xmm4 + a4 * b1;
7080 xmm5 = xmm5 + a1 * b2;
7081 xmm6 = xmm6 + a2 * b2;
7082 xmm7 = xmm7 + a3 * b2;
7083 xmm8 = xmm8 + a4 * b2;
7086 (~C).store( i , j , xmm1 * factor );
7087 (~C).store( i+
IT::size , j , xmm2 * factor );
7088 (~C).store( i+
IT::size*2UL, j , xmm3 * factor );
7089 (~C).store( i+
IT::size*3UL, j , xmm4 * factor );
7090 (~C).store( i , j+1UL, xmm5 * factor );
7091 (~C).store( i+
IT::size , j+1UL, xmm6 * factor );
7092 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 * factor );
7093 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 * factor );
// Tile: a single column, four intrinsic vectors.
7098 const size_t kbegin( ( IsLower<MT5>::value )
7099 ?( ( IsUpper<MT4>::value )
7100 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7101 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7102 :( IsUpper<MT4>::value ? i : 0UL ) );
7103 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
7105 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7107 for(
size_t k=kbegin; k<kend; ++k ) {
7108 const IntrinsicType b1(
set( B(k,j) ) );
7109 xmm1 = xmm1 + A.load(i ,k) * b1;
7110 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
7111 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
7112 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
7115 (~C).store( i , j, xmm1 * factor );
7116 (~C).store( i+
IT::size , j, xmm2 * factor );
7117 (~C).store( i+
IT::size*2UL, j, xmm3 * factor );
7118 (~C).store( i+
IT::size*3UL, j, xmm4 * factor );
// Tile: two columns (j, j+1), two intrinsic vectors per column.
7126 for( ; (j+2UL) <= N; j+=2UL )
7128 const size_t kbegin( ( IsLower<MT5>::value )
7129 ?( ( IsUpper<MT4>::value )
7130 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7131 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7132 :( IsUpper<MT4>::value ? i : 0UL ) );
7133 const size_t kend( ( IsUpper<MT5>::value )
7134 ?( ( IsLower<MT4>::value )
7135 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7136 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7137 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
7139 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7141 for(
size_t k=kbegin; k<kend; ++k ) {
7142 const IntrinsicType a1( A.load(i ,k) );
7143 const IntrinsicType a2( A.load(i+
IT::size,k) );
7144 const IntrinsicType b1(
set( B(k,j ) ) );
7145 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7146 xmm1 = xmm1 + a1 * b1;
7147 xmm2 = xmm2 + a2 * b1;
7148 xmm3 = xmm3 + a1 * b2;
7149 xmm4 = xmm4 + a2 * b2;
7152 (~C).store( i , j , xmm1 * factor );
7153 (~C).store( i+
IT::size, j , xmm2 * factor );
7154 (~C).store( i , j+1UL, xmm3 * factor );
7155 (~C).store( i+
IT::size, j+1UL, xmm4 * factor );
// Tile: a single column, two intrinsic vectors.
7160 const size_t kbegin( ( IsLower<MT5>::value )
7161 ?( ( IsUpper<MT4>::value )
7162 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7163 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7164 :( IsUpper<MT4>::value ? i : 0UL ) );
7165 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
7167 IntrinsicType xmm1, xmm2;
7169 for(
size_t k=kbegin; k<kend; ++k ) {
7170 const IntrinsicType b1(
set( B(k,j) ) );
7171 xmm1 = xmm1 + A.load(i ,k) * b1;
7172 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
7175 (~C).store( i , j, xmm1 * factor );
7176 (~C).store( i+
IT::size, j, xmm2 * factor );
// Tile: two columns (j, j+1), one intrinsic vector per column.
7184 for( ; (j+2UL) <= N; j+=2UL )
7186 const size_t kbegin( ( IsLower<MT5>::value )
7187 ?( ( IsUpper<MT4>::value )
7188 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7189 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7190 :( IsUpper<MT4>::value ? i : 0UL ) );
7191 const size_t kend( ( IsUpper<MT5>::value )
7192 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7195 IntrinsicType xmm1, xmm2;
// The vector loaded from A is shared by both columns.
7197 for(
size_t k=kbegin; k<kend; ++k ) {
7198 const IntrinsicType a1( A.load(i,k) );
7199 xmm1 = xmm1 + a1 *
set( B(k,j ) );
7200 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
7203 (~C).store( i, j , xmm1 * factor );
7204 (~C).store( i, j+1UL, xmm2 * factor );
// Tile: a single column, one intrinsic vector; k runs up to K.
7209 const size_t kbegin( ( IsLower<MT5>::value )
7210 ?( ( IsUpper<MT4>::value )
7211 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7212 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7213 :( IsUpper<MT4>::value ? i : 0UL ) );
7217 for(
size_t k=kbegin; k<K; ++k ) {
7218 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
7221 (~C).store( i, j, xmm1 * factor );
// Scalar (non-vectorized) pass over the remaining rows; it only runs when
// remainder is set.
7225 for( ; remainder && i<M; ++i )
// Two columns at a time with plain scalar accumulators.
7229 for( ; (j+2UL) <= N; j+=2UL )
7231 const size_t kbegin( ( IsLower<MT5>::value )
7232 ?( ( IsUpper<MT4>::value )
7233 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7234 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7235 :( IsUpper<MT4>::value ? i : 0UL ) );
7236 const size_t kend( ( IsUpper<MT5>::value )
7237 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7243 for(
size_t k=kbegin; k<kend; ++k ) {
7244 value1 += A(i,k) * B(k,j );
7245 value2 += A(i,k) * B(k,j+1UL);
7248 (~C)(i,j ) = value1 * scalar;
7249 (~C)(i,j+1UL) = value2 * scalar;
// A single column of the remainder row.
7254 const size_t kbegin( ( IsLower<MT5>::value )
7255 ?( ( IsUpper<MT4>::value )
7256 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7257 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7258 :( IsUpper<MT4>::value ? i : 0UL ) );
7262 for(
size_t k=kbegin; k<K; ++k ) {
7263 value += A(i,k) * B(k,j);
7266 (~C)(i,j) = value * scalar;
// Large-matrix assignment kernel used when UseVectorizedDefaultKernel does not
// hold for <MT3,MT4,MT5,ST2>: the work is forwarded unchanged to
// selectDefaultAssignKernel().
7286 template<
typename MT3
7290 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7291 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7293 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized large-matrix assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel holds.
// The iteration space is cut into blocks of DMATDMATMULT_DEFAULT_JBLOCK_SIZE
// columns, DMATDMATMULT_DEFAULT_IBLOCK_SIZE rows and
// DMATDMATMULT_DEFAULT_KBLOCK_SIZE inner indices. Within a k-block the
// partial products are summed in IntrinsicType registers, multiplied by the
// scalar factor and added to the value already held in C.
7312 template<
typename MT3
7316 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7317 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7319 typedef IntrinsicTrait<ElementType> IT;
7321 const size_t M( A.rows() );
7322 const size_t N( B.columns() );
7323 const size_t K( A.columns() );
// The scalar clean-up pass over trailing columns is required unless both MT3
// and MT5 are padded.
7325 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// The scalar converted to an IntrinsicType by set(); applied to every
// accumulator before it is added to C.
7327 const IntrinsicType factor(
set( scalar ) );
// Column blocks.
7329 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
7331 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// End of the vectorized column range of this block: jend with the bits below
// IT::size masked off when a remainder pass is needed (a multiple of IT::size
// if IT::size is a power of two), otherwise jend itself.
7333 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Row blocks.
7336 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
7338 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Pass over every element of the current (ii,jj) tile ahead of the k-block
// accumulation.
7340 for(
size_t i=ii; i<iend; ++i ) {
7341 for(
size_t j=jj; j<jend; ++j ) {
// Blocks along the inner dimension; ktmp is the clamped end of the k-block.
7346 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
7348 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Tile: two rows (i, i+1), four intrinsic vectors per row (columns j, j1,
// j2, j3).
7360 for( ; (i+2UL) <= iend; i+=2UL )
// k-range of the tile: the k-block clipped by the triangular traits of MT4
// and MT5.
7362 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7363 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7364 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7365 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
7367 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7369 for(
size_t k=kbegin; k<kend; ++k ) {
7370 const IntrinsicType a1(
set( A(i ,k) ) );
7371 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7372 const IntrinsicType b1( B.load(k,j ) );
7373 const IntrinsicType b2( B.load(k,j1) );
7374 const IntrinsicType b3( B.load(k,j2) );
7375 const IntrinsicType b4( B.load(k,j3) );
7376 xmm1 = xmm1 + a1 * b1;
7377 xmm2 = xmm2 + a1 * b2;
7378 xmm3 = xmm3 + a1 * b3;
7379 xmm4 = xmm4 + a1 * b4;
7380 xmm5 = xmm5 + a2 * b1;
7381 xmm6 = xmm6 + a2 * b2;
7382 xmm7 = xmm7 + a2 * b3;
7383 xmm8 = xmm8 + a2 * b4;
// Read-modify-write: the scaled partial sums of this k-block are added to
// the current content of C.
7386 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7387 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7388 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
7389 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
7390 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
7391 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
7392 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
7393 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Tile: a single row, four intrinsic vectors.
7398 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7399 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7400 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7401 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
7403 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7405 for(
size_t k=kbegin; k<kend; ++k ) {
7406 const IntrinsicType a1(
set( A(i,k) ) );
7407 xmm1 = xmm1 + a1 * B.load(k,j );
7408 xmm2 = xmm2 + a1 * B.load(k,j1);
7409 xmm3 = xmm3 + a1 * B.load(k,j2);
7410 xmm4 = xmm4 + a1 * B.load(k,j3);
7413 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7414 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
7415 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
7416 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// Tile: four rows (i .. i+3), two intrinsic vectors per row (columns j, j1).
7426 for( ; (i+4UL) <= iend; i+=4UL )
7428 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7429 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7430 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7431 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
// One pair of accumulators per row: (xmm1,xmm2) for i up to (xmm7,xmm8) for i+3.
7433 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7435 for(
size_t k=kbegin; k<kend; ++k ) {
7436 const IntrinsicType a1(
set( A(i ,k) ) );
7437 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7438 const IntrinsicType a3(
set( A(i+2UL,k) ) );
7439 const IntrinsicType a4(
set( A(i+3UL,k) ) );
7440 const IntrinsicType b1( B.load(k,j ) );
7441 const IntrinsicType b2( B.load(k,j1) );
7442 xmm1 = xmm1 + a1 * b1;
7443 xmm2 = xmm2 + a1 * b2;
7444 xmm3 = xmm3 + a2 * b1;
7445 xmm4 = xmm4 + a2 * b2;
7446 xmm5 = xmm5 + a3 * b1;
7447 xmm6 = xmm6 + a3 * b2;
7448 xmm7 = xmm7 + a4 * b1;
7449 xmm8 = xmm8 + a4 * b2;
7452 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7453 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7454 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7455 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
7456 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
7457 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
7458 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
7459 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Tile: two rows (i, i+1), two intrinsic vectors per row.
7462 for( ; (i+2UL) <= iend; i+=2UL )
7464 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7465 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7466 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7467 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7469 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7471 for(
size_t k=kbegin; k<kend; ++k ) {
7472 const IntrinsicType a1(
set( A(i ,k) ) );
7473 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7474 const IntrinsicType b1( B.load(k,j ) );
7475 const IntrinsicType b2( B.load(k,j1) );
7476 xmm1 = xmm1 + a1 * b1;
7477 xmm2 = xmm2 + a1 * b2;
7478 xmm3 = xmm3 + a2 * b1;
7479 xmm4 = xmm4 + a2 * b2;
7482 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7483 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7484 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7485 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Tile: a single row, two intrinsic vectors.
7490 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7491 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7492 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7493 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7495 IntrinsicType xmm1, xmm2;
7497 for(
size_t k=kbegin; k<kend; ++k ) {
7498 const IntrinsicType a1(
set( A(i,k) ) );
7499 xmm1 = xmm1 + a1 * B.load(k,j );
7500 xmm2 = xmm2 + a1 * B.load(k,j1);
7503 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7504 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// Tile: one intrinsic vector wide; every row of the row block is handled
// individually.
7510 for(
size_t i=ii; i<iend; ++i )
7512 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7513 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7514 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7515 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
7519 for(
size_t k=kbegin; k<kend; ++k ) {
7520 const IntrinsicType a1(
set( A(i,k) ) );
7521 xmm1 = xmm1 + a1 * B.load(k,j);
7524 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar (non-vectorized) pass over the remaining columns of the block; it
// only runs when remainder is set.
7528 for( ; remainder && j<jend; ++j )
7530 for(
size_t i=ii; i<iend; ++i )
7532 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7533 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7534 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7535 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
7539 for(
size_t k=kbegin; k<kend; ++k ) {
7540 value += A(i,k) * B(k,j);
// The scaled partial sum is added to C, matching the vectorized tiles above.
7543 (~C)(i,j) += value * scalar;
// Vectorized "large" assignment kernel for a column-major dense target
// (DenseMatrix<MT3,true>). It is only enabled when UseVectorizedDefaultKernel
// holds for <MT3,MT4,MT5,ST2>.
// NOTE(review): this listing omits several original lines (braces, some loop
// headers, the remaining template parameters). The comments below describe only
// what the visible lines show.
// Visible work: the i/j/k index space is walked in blocks of
// TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE and, per tile, products of A and B
// are summed over k, multiplied by 'factor' (built from 'scalar') and added
// onto the current contents of C.
7567 template<
typename MT3
7571 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7572 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7574 typedef IntrinsicTrait<ElementType> IT;
7576 const size_t M( A.rows() );
7577 const size_t N( B.columns() );
7578 const size_t K( A.columns() );
// A scalar remainder pass is required unless both C (MT3) and A (MT4) are padded.
7580 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// 'scalar' converted to an intrinsic value; presumably replicated in every lane - confirm set().
7582 const IntrinsicType factor(
set( scalar ) );
// Outer blocking over the rows of C.
7584 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
7586 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// With a remainder, ipos is iend with the low bits masked off (a multiple of
// IT::size, assuming IT::size is a power of two); otherwise it equals iend.
7588 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Blocking over the columns of C.
7591 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
7593 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Double loop over the current (ii,jj) tile. Its body is not visible in this
// listing; presumably it clears the tile of C before accumulation - TODO confirm.
7595 for(
size_t j=jj; j<jend; ++j ) {
7596 for(
size_t i=ii; i<iend; ++i ) {
// Blocking over the shared dimension; ktmp is the end of the current k-block.
7601 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
7603 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Tile: four intrinsic row-vectors (i, i1, i2, i3) times two columns (j, j+1).
// The enclosing i-loop and the definitions of i1..i3 are on lines not shown here.
7615 for( ; (j+2UL) <= jend; j+=2UL )
// k-range: limited to the current k-block and tightened by i for an upper A,
// by j for a lower B (kbegin), and by the tile extent for a lower A / upper B (kend).
7617 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7618 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7619 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7620 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7622 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7624 for(
size_t k=kbegin; k<kend; ++k ) {
7625 const IntrinsicType a1( A.load(i ,k) );
7626 const IntrinsicType a2( A.load(i1,k) );
7627 const IntrinsicType a3( A.load(i2,k) );
7628 const IntrinsicType a4( A.load(i3,k) );
7629 const IntrinsicType b1(
set( B(k,j ) ) );
7630 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7631 xmm1 = xmm1 + a1 * b1;
7632 xmm2 = xmm2 + a2 * b1;
7633 xmm3 = xmm3 + a3 * b1;
7634 xmm4 = xmm4 + a4 * b1;
7635 xmm5 = xmm5 + a1 * b2;
7636 xmm6 = xmm6 + a2 * b2;
7637 xmm7 = xmm7 + a3 * b2;
7638 xmm8 = xmm8 + a4 * b2;
// Add the scaled partial sums of this k-block onto the values already held in C.
7641 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7642 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7643 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
7644 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
7645 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
7646 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
7647 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
7648 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// Tile: four row-vectors times a single column j (the guarding condition is not visible).
7653 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7654 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7655 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7656 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7658 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7660 for(
size_t k=kbegin; k<kend; ++k ) {
7661 const IntrinsicType b1(
set( B(k,j) ) );
7662 xmm1 = xmm1 + A.load(i ,k) * b1;
7663 xmm2 = xmm2 + A.load(i1,k) * b1;
7664 xmm3 = xmm3 + A.load(i2,k) * b1;
7665 xmm4 = xmm4 + A.load(i3,k) * b1;
7668 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7669 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
7670 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
7671 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// Tile: two row-vectors (i, i1) times four columns (j .. j+3).
7681 for( ; (j+4UL) <= jend; j+=4UL )
7683 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7684 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7685 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7686 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7688 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7690 for(
size_t k=kbegin; k<kend; ++k ) {
7691 const IntrinsicType a1( A.load(i ,k) );
7692 const IntrinsicType a2( A.load(i1,k) );
7693 const IntrinsicType b1(
set( B(k,j ) ) );
7694 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7695 const IntrinsicType b3(
set( B(k,j+2UL) ) );
7696 const IntrinsicType b4(
set( B(k,j+3UL) ) );
7697 xmm1 = xmm1 + a1 * b1;
7698 xmm2 = xmm2 + a2 * b1;
7699 xmm3 = xmm3 + a1 * b2;
7700 xmm4 = xmm4 + a2 * b2;
7701 xmm5 = xmm5 + a1 * b3;
7702 xmm6 = xmm6 + a2 * b3;
7703 xmm7 = xmm7 + a1 * b4;
7704 xmm8 = xmm8 + a2 * b4;
7707 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7708 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7709 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7710 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
7711 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
7712 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
7713 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
7714 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// Tile: two row-vectors times two columns.
7717 for( ; (j+2UL) <= jend; j+=2UL )
7719 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7720 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7721 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7722 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7724 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7726 for(
size_t k=kbegin; k<kend; ++k ) {
7727 const IntrinsicType a1( A.load(i ,k) );
7728 const IntrinsicType a2( A.load(i1,k) );
7729 const IntrinsicType b1(
set( B(k,j ) ) );
7730 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7731 xmm1 = xmm1 + a1 * b1;
7732 xmm2 = xmm2 + a2 * b1;
7733 xmm3 = xmm3 + a1 * b2;
7734 xmm4 = xmm4 + a2 * b2;
7737 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7738 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7739 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7740 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// Tile: two row-vectors times a single column j (the guarding condition is not visible).
7745 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7746 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7747 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7748 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7750 IntrinsicType xmm1, xmm2;
7752 for(
size_t k=kbegin; k<kend; ++k ) {
7753 const IntrinsicType b1(
set( B(k,j) ) );
7754 xmm1 = xmm1 + A.load(i ,k) * b1;
7755 xmm2 = xmm2 + A.load(i1,k) * b1;
7758 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7759 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// Tile: a single row-vector at i, one column at a time over the whole j-block.
7765 for(
size_t j=jj; j<jend; ++j )
7767 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7768 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7769 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
7770 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
// NOTE(review): the declaration of xmm1 for this tile is not visible in the listing.
7774 for(
size_t k=kbegin; k<kend; ++k ) {
7775 const IntrinsicType b1(
set( B(k,j) ) );
7776 xmm1 = xmm1 + A.load(i,k) * b1;
7779 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar pass over the rows left after the vectorized tiles; runs only when
// 'remainder' is true.
7783 for( ; remainder && i<iend; ++i )
7785 for(
size_t j=jj; j<jend; ++j )
7787 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7788 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7789 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
7790 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
// NOTE(review): the declaration of 'value' is not visible in the listing.
7794 for(
size_t k=kbegin; k<kend; ++k ) {
7795 value += A(i,k) * B(k,j);
7798 (~C)(i,j) += value * scalar;
// Overload selected when UseBlasKernel does NOT hold for <MT3,MT4,MT5,ST2>
// (DisableIf): it simply forwards to selectLargeAssignKernel with the same
// arguments.
// NOTE(review): the remaining template parameters and the braces are on lines
// missing from this listing.
7821 template<
typename MT3
7825 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7826 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7828 selectLargeAssignKernel( C, A, B, scalar );
// Overload selected when UseBlasKernel holds for <MT3,MT4,MT5,ST2> (EnableIf).
// Visible dispatch:
//   - A triangular: trmm with A from the left (CblasLeft), lower/upper chosen by IsLower<MT4>
//   - else B triangular: trmm with B from the right (CblasRight), lower/upper chosen by IsLower<MT5>
//   - gemm( C, A, B, ET(scalar), ET(0) )
// NOTE(review): the lines between the branches (including whatever prepares C
// before each trmm call, the definition of ET, and the 'else' in front of gemm)
// are missing from this listing - confirm against the original file.
7847 template<
typename MT3
7851 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7852 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7856 if( IsTriangular<MT4>::value ) {
7858 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7860 else if( IsTriangular<MT5>::value ) {
7862 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// The third argument pair scales the product by 'scalar'; the trailing ET(0) is
// presumably the factor applied to the previous contents of C - confirm gemm().
7865 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled matrix product (DMatScalarMultExpr) to a sparse
// matrix: the expression is first evaluated serially into a dense temporary,
// which is then assigned to the sparse target.
// NOTE(review): the second template parameter (SO) and the checks between the
// typedef and the evaluation are on lines missing from this listing.
7883 template<
typename MT
7885 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type is picked from ResultType / OppositeType depending on SO.
7889 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// serial() wraps the expression before it is evaluated into the temporary.
7901 const TmpType tmp(
serial( rhs ) );
7902 assign( ~lhs, tmp );
// Addition assignment of the scaled matrix product (DMatScalarMultExpr) to a
// dense matrix. It fetches both operands of the wrapped product, checks for
// empty dimensions, and hands over to selectAddAssignKernel together with the
// scalar.
// NOTE(review): the body of the empty-dimension check (presumably an early
// return) and the definitions of MMM, A and B are on lines missing from this
// listing - confirm.
7918 template<
typename MT
7920 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7927 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7928 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Nothing to add if the target or the shared dimension is empty.
7930 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7944 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += scalar * A * B.
// The "small" kernel is used when both operands are diagonal or when the number
// of elements of C (rows * columns) is below TDMATDMATMULT_THRESHOLD; the other
// visible call goes to selectBlasAddAssignKernel.
// NOTE(review): the 'else' between the two calls is on a line missing from this
// listing.
7959 template<
typename MT3
7963 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7965 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
7966 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7967 selectSmallAddAssignKernel( C, A, B, scalar );
7969 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel for the general case in which neither A
// nor B is diagonal: the scaled product A * B * scalar is evaluated serially
// into a temporary of type ResultType, which is then added to C.
7987 template<
typename MT3
7991 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
7992 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7994 const ResultType tmp(
serial( A * B * scalar ) );
7995 addAssign( C, tmp );
// Default addition-assignment kernel for a row-major target when A is not
// diagonal and B is diagonal. Because only B(j,j) is used, every element needs
// a single product: C(i,j) += A(i,j) * B(j,j) * scalar.
// The (i,j) space is traversed in square blocks of BLOCK_SIZE.
// NOTE(review): the ':' alternatives of the two conditionals below (used when A
// is not upper / not lower) are on lines missing from this listing.
8013 template<
typename MT3
8017 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8018 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8020 const size_t M( A.rows() );
8021 const size_t N( B.columns() );
8023 const size_t block( BLOCK_SIZE );
8025 for(
size_t ii=0UL; ii<M; ii+=block ) {
8026 const size_t iend(
min( M, ii+block ) );
8027 for(
size_t jj=0UL; jj<N; jj+=block ) {
8028 const size_t jend(
min( N, jj+block ) );
8029 for(
size_t i=ii; i<iend; ++i )
// For an upper A the columns start at the diagonal (one past it if strictly
// upper), but never before the current block start jj.
8031 const size_t jbegin( ( IsUpper<MT4>::value )
8032 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
// For a lower A the columns stop after the diagonal (before it if strictly
// lower), but never beyond the current block end jend.
8034 const size_t jpos( ( IsLower<MT4>::value )
8035 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
8038 for(
size_t j=jbegin; j<jpos; ++j ) {
8039 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition-assignment kernel for a column-major target when A is not
// diagonal and B is diagonal: C(i,j) += A(i,j) * B(j,j) * scalar, processed
// column by column with the row loop unrolled by two.
// NOTE(review): the ':' alternatives of the ibegin/iend conditionals and the
// guard in front of the trailing single-element update are on lines missing
// from this listing.
8061 template<
typename MT3
8065 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8066 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8068 const size_t M( A.rows() );
8069 const size_t N( B.columns() );
8071 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed for a (strictly) lower or upper A.
8073 const size_t ibegin( ( IsLower<MT4>::value )
8074 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
8076 const size_t iend( ( IsUpper<MT4>::value )
8077 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos marks the end of the part of the range that holds an even number of rows.
8081 const size_t inum( iend - ibegin );
8082 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
8084 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
8085 (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
8086 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update for the row at ipos; presumably only executed when ipos < iend - TODO confirm.
8089 (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition-assignment kernel for a row-major target when A is diagonal
// and B is not: C(i,j) += A(i,i) * B(i,j) * scalar, processed row by row with
// the column loop unrolled by two.
// NOTE(review): the ':' alternatives of the jbegin/jend conditionals and the
// guard in front of the trailing single-element update are on lines missing
// from this listing.
8109 template<
typename MT3
8113 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8114 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8116 const size_t M( A.rows() );
8117 const size_t N( B.columns() );
8119 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed for a (strictly) upper or lower B.
8121 const size_t jbegin( ( IsUpper<MT5>::value )
8122 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
8124 const size_t jend( ( IsLower<MT5>::value )
8125 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos marks the end of the part of the range that holds an even number of columns.
8129 const size_t jnum( jend - jbegin );
8130 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
8132 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
8133 (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
8134 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update for the column at jpos; presumably only executed when jpos < jend - TODO confirm.
8137 (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition-assignment kernel for a column-major target when A is
// diagonal and B is not: C(i,j) += A(i,i) * B(i,j) * scalar.
// The (i,j) space is traversed in square blocks of BLOCK_SIZE, columns outermost.
// NOTE(review): the ':' alternatives of the two conditionals below (used when B
// is not lower / not upper) are on lines missing from this listing.
8157 template<
typename MT3
8161 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8162 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8164 const size_t M( A.rows() );
8165 const size_t N( B.columns() );
8167 const size_t block( BLOCK_SIZE );
8169 for(
size_t jj=0UL; jj<N; jj+=block ) {
8170 const size_t jend(
min( N, jj+block ) );
8171 for(
size_t ii=0UL; ii<M; ii+=block ) {
8172 const size_t iend(
min( M, ii+block ) );
8173 for(
size_t j=jj; j<jend; ++j )
// For a lower B the rows start at the diagonal (one past it if strictly lower),
// but never before the current block start ii.
8175 const size_t ibegin( ( IsLower<MT5>::value )
8176 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
// For an upper B the rows stop after the diagonal (before it if strictly upper),
// but never beyond the current block end iend.
8178 const size_t ipos( ( IsUpper<MT5>::value )
8179 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
8182 for(
size_t i=ibegin; i<ipos; ++i ) {
8183 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition-assignment kernel when both A and B are diagonal: only the
// diagonal of C is touched, C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
8205 template<
typename MT3
8209 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
8210 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8212 for(
size_t i=0UL; i<A.rows(); ++i ) {
8213 C(i,i) += A(i,i) * B(i,i) * scalar;
// "Small" addition-assignment overload selected when UseVectorizedDefaultKernel
// does NOT hold for <MT3,MT4,MT5,ST2> (DisableIf): it forwards to
// selectDefaultAddAssignKernel with the same arguments.
8232 template<
typename MT3
8236 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8237 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8239 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized "small" addition-assignment kernel for a row-major dense target
// (DenseMatrix<MT3,false>), enabled only when UseVectorizedDefaultKernel holds.
// Visible work: for tiles of C that are 8, 4, 2 or 1 intrinsic vectors wide
// (along j), products of an element of A (passed through set()) and vectors
// loaded from B are summed over k, multiplied by 'factor' and added onto C.
// A scalar pass handles the columns left over when 'remainder' is true.
// NOTE(review): this listing omits several original lines (braces, the
// enclosing j-loop headers, the declaration of j and i, some ':' alternatives).
// It also shows no store for xmm2 in several tiles; those places are flagged
// below and must be checked against the original file.
8258 template<
typename MT3
8262 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8263 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8265 typedef IntrinsicTrait<ElementType> IT;
8267 const size_t M( A.rows() );
8268 const size_t N( B.columns() );
8269 const size_t K( A.columns() );
// A scalar remainder pass is required unless both C (MT3) and B (MT5) are padded.
8271 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With a remainder, jpos is N with the low bits masked off (a multiple of
// IT::size, assuming IT::size is a power of two); otherwise it equals N.
8273 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
8276 const IntrinsicType factor(
set( scalar ) );
// Tile: one row of C, eight intrinsic vectors wide (columns j .. j+8*IT::size-1).
8281 for(
size_t i=0UL; i<M; ++i )
// k-range: narrowed by i for an upper/lower A and by j (or the tile width) for
// a lower/upper B; the strict variants skip the diagonal of A.
8283 const size_t kbegin( ( IsUpper<MT4>::value )
8284 ?( ( IsLower<MT5>::value )
8285 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8286 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8287 :( IsLower<MT5>::value ? j : 0UL ) );
8288 const size_t kend( ( IsLower<MT4>::value )
8289 ?( ( IsUpper<MT5>::value )
8290 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
8291 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
8292 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
8294 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8296 for(
size_t k=kbegin; k<kend; ++k ) {
8297 const IntrinsicType a1(
set( A(i,k) ) );
8298 xmm1 = xmm1 + a1 * B.load(k,j );
8299 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
8300 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
8301 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
8302 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
8303 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
8304 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
8305 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Add the scaled sums onto C.
// NOTE(review): no store of xmm2 (column j+IT::size) is visible here - verify.
8308 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8310 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) + xmm3 * factor );
8311 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) + xmm4 * factor );
8312 (~C).store( i, j+
IT::size*4UL, (~C).load(i,j+
IT::size*4UL) + xmm5 * factor );
8313 (~C).store( i, j+
IT::size*5UL, (~C).load(i,j+
IT::size*5UL) + xmm6 * factor );
8314 (~C).store( i, j+
IT::size*6UL, (~C).load(i,j+
IT::size*6UL) + xmm7 * factor );
8315 (~C).store( i, j+
IT::size*7UL, (~C).load(i,j+
IT::size*7UL) + xmm8 * factor );
// Tile: two rows (i, i+1) times four intrinsic vectors.
8323 for( ; (i+2UL) <= M; i+=2UL )
8325 const size_t kbegin( ( IsUpper<MT4>::value )
8326 ?( ( IsLower<MT5>::value )
8327 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8328 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8329 :( IsLower<MT5>::value ? j : 0UL ) );
8330 const size_t kend( ( IsLower<MT4>::value )
8331 ?( ( IsUpper<MT5>::value )
8332 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
8333 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8334 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
8336 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8338 for(
size_t k=kbegin; k<kend; ++k ) {
8339 const IntrinsicType a1(
set( A(i ,k) ) );
8340 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8341 const IntrinsicType b1( B.load(k,j ) );
8342 const IntrinsicType b2( B.load(k,j+
IT::size ) );
8343 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
8344 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
8345 xmm1 = xmm1 + a1 * b1;
8346 xmm2 = xmm2 + a1 * b2;
8347 xmm3 = xmm3 + a1 * b3;
8348 xmm4 = xmm4 + a1 * b4;
8349 xmm5 = xmm5 + a2 * b1;
8350 xmm6 = xmm6 + a2 * b2;
8351 xmm7 = xmm7 + a2 * b3;
8352 xmm8 = xmm8 + a2 * b4;
// NOTE(review): no store of xmm2 (row i, column j+IT::size) is visible here - verify.
8355 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8357 (~C).store( i , j+
IT::size*2UL, (~C).load(i ,j+
IT::size*2UL) + xmm3 * factor );
8358 (~C).store( i , j+
IT::size*3UL, (~C).load(i ,j+
IT::size*3UL) + xmm4 * factor );
8359 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8360 (~C).store( i+1UL, j+
IT::size , (~C).load(i+1UL,j+
IT::size ) + xmm6 * factor );
8361 (~C).store( i+1UL, j+
IT::size*2UL, (~C).load(i+1UL,j+
IT::size*2UL) + xmm7 * factor );
8362 (~C).store( i+1UL, j+
IT::size*3UL, (~C).load(i+1UL,j+
IT::size*3UL) + xmm8 * factor );
// Tile: a single remaining row times four intrinsic vectors (guard not visible).
8367 const size_t kbegin( ( IsUpper<MT4>::value )
8368 ?( ( IsLower<MT5>::value )
8369 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8370 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8371 :( IsLower<MT5>::value ? j : 0UL ) );
8372 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
8374 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8376 for(
size_t k=kbegin; k<kend; ++k ) {
8377 const IntrinsicType a1(
set( A(i,k) ) );
8378 xmm1 = xmm1 + a1 * B.load(k,j );
8379 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
8380 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
8381 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
// NOTE(review): no store of xmm2 (column j+IT::size) is visible here - verify.
8384 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8386 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) + xmm3 * factor );
8387 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) + xmm4 * factor );
// Tile: two rows times two intrinsic vectors.
8395 for( ; (i+2UL) <= M; i+=2UL )
8397 const size_t kbegin( ( IsUpper<MT4>::value )
8398 ?( ( IsLower<MT5>::value )
8399 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8400 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8401 :( IsLower<MT5>::value ? j : 0UL ) );
8402 const size_t kend( ( IsLower<MT4>::value )
8403 ?( ( IsUpper<MT5>::value )
8404 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
8405 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8406 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
8408 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8410 for(
size_t k=kbegin; k<kend; ++k ) {
8411 const IntrinsicType a1(
set( A(i ,k) ) );
8412 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8413 const IntrinsicType b1( B.load(k,j ) );
8414 const IntrinsicType b2( B.load(k,j+
IT::size) );
8415 xmm1 = xmm1 + a1 * b1;
8416 xmm2 = xmm2 + a1 * b2;
8417 xmm3 = xmm3 + a2 * b1;
8418 xmm4 = xmm4 + a2 * b2;
// NOTE(review): no store of xmm2 (row i, column j+IT::size) is visible here - verify.
8421 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8423 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8424 (~C).store( i+1UL, j+
IT::size, (~C).load(i+1UL,j+
IT::size) + xmm4 * factor );
// Tile: a single remaining row times two intrinsic vectors (guard not visible).
8429 const size_t kbegin( ( IsUpper<MT4>::value )
8430 ?( ( IsLower<MT5>::value )
8431 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8432 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8433 :( IsLower<MT5>::value ? j : 0UL ) );
8434 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
8436 IntrinsicType xmm1, xmm2;
8438 for(
size_t k=kbegin; k<kend; ++k ) {
8439 const IntrinsicType a1(
set( A(i,k) ) );
8440 xmm1 = xmm1 + a1 * B.load(k,j );
8441 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
// NOTE(review): no store of xmm2 (column j+IT::size) is visible here - verify.
8444 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
// Tile: two rows times one intrinsic vector.
8453 for( ; (i+2UL) <= M; i+=2UL )
8455 const size_t kbegin( ( IsUpper<MT4>::value )
8456 ?( ( IsLower<MT5>::value )
8457 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8458 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8459 :( IsLower<MT5>::value ? j : 0UL ) );
// NOTE(review): the ':' alternative of this conditional is on a line missing from the listing.
8460 const size_t kend( ( IsLower<MT4>::value )
8461 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8464 IntrinsicType xmm1, xmm2;
8466 for(
size_t k=kbegin; k<kend; ++k ) {
8467 const IntrinsicType b1( B.load(k,j) );
8468 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
8469 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
8472 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8473 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Tile: a single remaining row times one intrinsic vector; k runs up to K.
8478 const size_t kbegin( ( IsUpper<MT4>::value )
8479 ?( ( IsLower<MT5>::value )
8480 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8481 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8482 :( IsLower<MT5>::value ? j : 0UL ) );
8486 for(
size_t k=kbegin; k<K; ++k ) {
8487 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
8490 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar pass over the columns left after the vectorized tiles; runs only when
// 'remainder' is true.
8494 for( ; remainder && j<N; ++j )
// Two rows at a time.
8498 for( ; (i+2UL) <= M; i+=2UL )
8500 const size_t kbegin( ( IsUpper<MT4>::value )
8501 ?( ( IsLower<MT5>::value )
8502 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8503 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8504 :( IsLower<MT5>::value ? j : 0UL ) );
8505 const size_t kend( ( IsLower<MT4>::value )
8506 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
// NOTE(review): the declarations of value1/value2 are not visible in the listing.
8512 for(
size_t k=kbegin; k<kend; ++k ) {
8513 value1 += A(i ,k) * B(k,j);
8514 value2 += A(i+1UL,k) * B(k,j);
8517 (~C)(i ,j) += value1 * scalar;
8518 (~C)(i+1UL,j) += value2 * scalar;
// Single remaining row (guard not visible).
8523 const size_t kbegin( ( IsUpper<MT4>::value )
8524 ?( ( IsLower<MT5>::value )
8525 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8526 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8527 :( IsLower<MT5>::value ? j : 0UL ) );
8531 for(
size_t k=kbegin; k<K; ++k ) {
8532 value += A(i,k) * B(k,j);
8535 (~C)(i,j) += value * scalar;
// Vectorized "small" addition-assignment kernel for a column-major dense target
// (DenseMatrix<MT3,true>), enabled only when UseVectorizedDefaultKernel holds.
// Visible work: for tiles of C that are 8, 4, 2 or 1 intrinsic vectors tall
// (along i), products of vectors loaded from A and an element of B (passed
// through set()) are summed over k, multiplied by 'factor' and added onto C.
// A scalar pass handles the rows left over when 'remainder' is true.
// NOTE(review): this listing omits several original lines (braces, the
// enclosing i-loop headers, the declaration of i and j, some ':' alternatives).
// It also shows no store for xmm2 in several tiles; those places are flagged
// below and must be checked against the original file.
8556 template<
typename MT3
8560 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8561 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8563 typedef IntrinsicTrait<ElementType> IT;
8565 const size_t M( A.rows() );
8566 const size_t N( B.columns() );
8567 const size_t K( A.columns() );
// A scalar remainder pass is required unless both C (MT3) and A (MT4) are padded.
8569 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With a remainder, ipos is M with the low bits masked off (a multiple of
// IT::size, assuming IT::size is a power of two); otherwise it equals M.
8571 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
8574 const IntrinsicType factor(
set( scalar ) );
// Tile: one column of C, eight intrinsic vectors tall (rows i .. i+8*IT::size-1).
8579 for(
size_t j=0UL; j<N; ++j )
// k-range: narrowed by j for a lower/upper B and by i (or the tile height) for
// an upper/lower A; the strict variants skip the diagonal of B.
8581 const size_t kbegin( ( IsLower<MT5>::value )
8582 ?( ( IsUpper<MT4>::value )
8583 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8584 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8585 :( IsUpper<MT4>::value ? i : 0UL ) );
8586 const size_t kend( ( IsUpper<MT5>::value )
8587 ?( ( IsLower<MT4>::value )
8588 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8589 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8590 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
8592 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8594 for(
size_t k=kbegin; k<kend; ++k ) {
8595 const IntrinsicType b1(
set( B(k,j) ) );
8596 xmm1 = xmm1 + A.load(i ,k) * b1;
8597 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
8598 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
8599 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
8600 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
8601 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
8602 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
8603 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Add the scaled sums onto C.
// NOTE(review): no store of xmm2 (row i+IT::size) is visible here - verify.
8606 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8608 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) + xmm3 * factor );
8609 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) + xmm4 * factor );
8610 (~C).store( i+
IT::size*4UL, j, (~C).load(i+
IT::size*4UL,j) + xmm5 * factor );
8611 (~C).store( i+
IT::size*5UL, j, (~C).load(i+
IT::size*5UL,j) + xmm6 * factor );
8612 (~C).store( i+
IT::size*6UL, j, (~C).load(i+
IT::size*6UL,j) + xmm7 * factor );
8613 (~C).store( i+
IT::size*7UL, j, (~C).load(i+
IT::size*7UL,j) + xmm8 * factor );
// Tile: four intrinsic vectors times two columns (j, j+1).
8621 for( ; (j+2UL) <= N; j+=2UL )
8623 const size_t kbegin( ( IsLower<MT5>::value )
8624 ?( ( IsUpper<MT4>::value )
8625 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8626 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8627 :( IsUpper<MT4>::value ? i : 0UL ) );
8628 const size_t kend( ( IsUpper<MT5>::value )
8629 ?( ( IsLower<MT4>::value )
8630 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8631 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8632 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
8634 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8636 for(
size_t k=kbegin; k<kend; ++k ) {
8637 const IntrinsicType a1( A.load(i ,k) );
8638 const IntrinsicType a2( A.load(i+
IT::size ,k) );
8639 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
8640 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
8641 const IntrinsicType b1(
set( B(k,j ) ) );
8642 const IntrinsicType b2(
set( B(k,j+1UL) ) );
8643 xmm1 = xmm1 + a1 * b1;
8644 xmm2 = xmm2 + a2 * b1;
8645 xmm3 = xmm3 + a3 * b1;
8646 xmm4 = xmm4 + a4 * b1;
8647 xmm5 = xmm5 + a1 * b2;
8648 xmm6 = xmm6 + a2 * b2;
8649 xmm7 = xmm7 + a3 * b2;
8650 xmm8 = xmm8 + a4 * b2;
// NOTE(review): no store of xmm2 (row i+IT::size, column j) is visible here - verify.
8653 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8655 (~C).store( i+
IT::size*2UL, j , (~C).load(i+
IT::size*2UL,j ) + xmm3 * factor );
8656 (~C).store( i+
IT::size*3UL, j , (~C).load(i+
IT::size*3UL,j ) + xmm4 * factor );
8657 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8658 (~C).store( i+
IT::size , j+1UL, (~C).load(i+
IT::size ,j+1UL) + xmm6 * factor );
8659 (~C).store( i+
IT::size*2UL, j+1UL, (~C).load(i+
IT::size*2UL,j+1UL) + xmm7 * factor );
8660 (~C).store( i+
IT::size*3UL, j+1UL, (~C).load(i+
IT::size*3UL,j+1UL) + xmm8 * factor );
// Tile: four intrinsic vectors times a single remaining column (guard not visible).
8665 const size_t kbegin( ( IsLower<MT5>::value )
8666 ?( ( IsUpper<MT4>::value )
8667 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8668 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8669 :( IsUpper<MT4>::value ? i : 0UL ) );
8670 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
8672 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8674 for(
size_t k=kbegin; k<kend; ++k ) {
8675 const IntrinsicType b1(
set( B(k,j) ) );
8676 xmm1 = xmm1 + A.load(i ,k) * b1;
8677 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
8678 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
8679 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
// NOTE(review): no store of xmm2 (row i+IT::size) is visible here - verify.
8682 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8684 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) + xmm3 * factor );
8685 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) + xmm4 * factor );
// Tile: two intrinsic vectors times two columns.
8693 for( ; (j+2UL) <= N; j+=2UL )
8695 const size_t kbegin( ( IsLower<MT5>::value )
8696 ?( ( IsUpper<MT4>::value )
8697 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8698 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8699 :( IsUpper<MT4>::value ? i : 0UL ) );
8700 const size_t kend( ( IsUpper<MT5>::value )
8701 ?( ( IsLower<MT4>::value )
8702 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8703 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8704 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
8706 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8708 for(
size_t k=kbegin; k<kend; ++k ) {
8709 const IntrinsicType a1( A.load(i ,k) );
8710 const IntrinsicType a2( A.load(i+
IT::size,k) );
8711 const IntrinsicType b1(
set( B(k,j ) ) );
8712 const IntrinsicType b2(
set( B(k,j+1UL) ) );
8713 xmm1 = xmm1 + a1 * b1;
8714 xmm2 = xmm2 + a2 * b1;
8715 xmm3 = xmm3 + a1 * b2;
8716 xmm4 = xmm4 + a2 * b2;
// NOTE(review): no store of xmm2 (row i+IT::size, column j) is visible here - verify.
8719 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8721 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8722 (~C).store( i+
IT::size, j+1UL, (~C).load(i+
IT::size,j+1UL) + xmm4 * factor );
// Tile: two intrinsic vectors times a single remaining column (guard not visible).
8727 const size_t kbegin( ( IsLower<MT5>::value )
8728 ?( ( IsUpper<MT4>::value )
8729 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8730 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8731 :( IsUpper<MT4>::value ? i : 0UL ) );
8732 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
8734 IntrinsicType xmm1, xmm2;
8736 for(
size_t k=kbegin; k<kend; ++k ) {
8737 const IntrinsicType b1(
set( B(k,j) ) );
8738 xmm1 = xmm1 + A.load(i ,k) * b1;
8739 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
// NOTE(review): no store of xmm2 (row i+IT::size) is visible here - verify.
8742 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
// Tile: one intrinsic vector times two columns.
8751 for( ; (j+2UL) <= N; j+=2UL )
8753 const size_t kbegin( ( IsLower<MT5>::value )
8754 ?( ( IsUpper<MT4>::value )
8755 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8756 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8757 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the ':' alternative of this conditional is on a line missing from the listing.
8758 const size_t kend( ( IsUpper<MT5>::value )
8759 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8762 IntrinsicType xmm1, xmm2;
8764 for(
size_t k=kbegin; k<kend; ++k ) {
8765 const IntrinsicType a1( A.load(i,k) );
8766 xmm1 = xmm1 + a1 *
set( B(k,j ) );
8767 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
8770 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8771 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// Tile: one intrinsic vector times a single remaining column; k runs up to K.
8776 const size_t kbegin( ( IsLower<MT5>::value )
8777 ?( ( IsUpper<MT4>::value )
8778 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8779 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8780 :( IsUpper<MT4>::value ? i : 0UL ) );
8784 for(
size_t k=kbegin; k<K; ++k ) {
8785 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
8788 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar pass over the rows left after the vectorized tiles; runs only when
// 'remainder' is true.
8792 for( ; remainder && i<M; ++i )
// Two columns at a time.
8796 for( ; (j+2UL) <= N; j+=2UL )
8798 const size_t kbegin( ( IsLower<MT5>::value )
8799 ?( ( IsUpper<MT4>::value )
8800 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8801 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8802 :( IsUpper<MT4>::value ? i : 0UL ) );
8803 const size_t kend( ( IsUpper<MT5>::value )
8804 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
// NOTE(review): the declarations of value1/value2 are not visible in the listing.
8810 for(
size_t k=kbegin; k<kend; ++k ) {
8811 value1 += A(i,k) * B(k,j );
8812 value2 += A(i,k) * B(k,j+1UL);
8815 (~C)(i,j ) += value1 * scalar;
8816 (~C)(i,j+1UL) += value2 * scalar;
// Single remaining column (guard not visible).
8821 const size_t kbegin( ( IsLower<MT5>::value )
8822 ?( ( IsUpper<MT4>::value )
8823 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8824 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8825 :( IsUpper<MT4>::value ? i : 0UL ) );
8829 for(
size_t k=kbegin; k<K; ++k ) {
8830 value += A(i,k) * B(k,j);
8833 (~C)(i,j) += value * scalar;
// "Large" addition-assignment overload selected when UseVectorizedDefaultKernel
// does NOT hold for <MT3,MT4,MT5,ST2> (DisableIf): it forwards to
// selectDefaultAddAssignKernel with the same arguments.
8853 template<
typename MT3
8857 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8858 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8860 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large add-assignment kernel, vectorized overload for a row-major target
// (DenseMatrix<MT3,false>). Takes part in overload resolution only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Adds (A * B) * scalar to C,
// walking the operands in j/i/k blocks.
// NOTE(review): this listing omits some original lines (the remaining template
// parameters, braces, several loop headers and the declarations of j, j1, j2, j3,
// i, xmm1 and value); the comments below describe only what is visible here.
8879 template<
typename MT3
8883 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8884 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8886 typedef IntrinsicTrait<ElementType> IT;
// Extents used below: i runs up to M, j up to N and k up to K.
8888 const size_t M( A.rows() );
8889 const size_t N( B.columns() );
8890 const size_t K( A.columns() );
// The scalar clean-up loop at the end is only needed unless both C and B are padded.
8892 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// SIMD value built from 'scalar' (presumably a broadcast - confirm against set());
// it is applied once per accumulator when the tile is written back.
8894 const IntrinsicType factor(
set( scalar ) );
// Blocking over the columns of C and B.
8896 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
8898 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// With 'remainder' set, jpos is jend masked with -IT::size (rounds down to a multiple
// of IT::size, assuming IT::size is a power of two - TODO confirm).
8900 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Blocking over the rows of C and A.
8903 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
8905 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Blocking over the contraction index; [kk,ktmp) is the current k-block.
8907 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
8909 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Tile of 2 rows x 4 SIMD columns (column offsets j, j1, j2, j3), 8 accumulators.
8921 for( ; (i+2UL) <= iend; i+=2UL )
// k-range: clamped to the current k-block; additionally starts no earlier than i for an
// upper A or j for a lower B, and ends no later than i+2 for a lower A or
// j+IT::size*4 for an upper B.
8923 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8924 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8925 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8926 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// NOTE(review): the accumulators are not explicitly initialized here; presumably the
// default constructor of IntrinsicType zero-initializes - confirm.
8928 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8930 for(
size_t k=kbegin; k<kend; ++k ) {
8931 const IntrinsicType a1(
set( A(i ,k) ) );
8932 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8933 const IntrinsicType b1( B.load(k,j ) );
8934 const IntrinsicType b2( B.load(k,j1) );
8935 const IntrinsicType b3( B.load(k,j2) );
8936 const IntrinsicType b4( B.load(k,j3) );
8937 xmm1 = xmm1 + a1 * b1;
8938 xmm2 = xmm2 + a1 * b2;
8939 xmm3 = xmm3 + a1 * b3;
8940 xmm4 = xmm4 + a1 * b4;
8941 xmm5 = xmm5 + a2 * b1;
8942 xmm6 = xmm6 + a2 * b2;
8943 xmm7 = xmm7 + a2 * b3;
8944 xmm8 = xmm8 + a2 * b4;
// Write-back: C += accumulator * factor for every element of the tile.
8947 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8948 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8949 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
8950 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
8951 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8952 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
8953 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
8954 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Single row x 4 SIMD columns (the enclosing guard is not visible in this listing).
8959 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8960 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8961 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
8962 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
8964 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8966 for(
size_t k=kbegin; k<kend; ++k ) {
8967 const IntrinsicType a1(
set( A(i,k) ) );
8968 xmm1 = xmm1 + a1 * B.load(k,j );
8969 xmm2 = xmm2 + a1 * B.load(k,j1);
8970 xmm3 = xmm3 + a1 * B.load(k,j2);
8971 xmm4 = xmm4 + a1 * B.load(k,j3);
8974 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8975 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
8976 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
8977 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// Tile of 4 rows x 2 SIMD columns (column offsets j and j1), 8 accumulators.
8987 for( ; (i+4UL) <= iend; i+=4UL )
8989 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8990 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8991 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
8992 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
8994 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8996 for(
size_t k=kbegin; k<kend; ++k ) {
8997 const IntrinsicType a1(
set( A(i ,k) ) );
8998 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8999 const IntrinsicType a3(
set( A(i+2UL,k) ) );
9000 const IntrinsicType a4(
set( A(i+3UL,k) ) );
9001 const IntrinsicType b1( B.load(k,j ) );
9002 const IntrinsicType b2( B.load(k,j1) );
9003 xmm1 = xmm1 + a1 * b1;
9004 xmm2 = xmm2 + a1 * b2;
9005 xmm3 = xmm3 + a2 * b1;
9006 xmm4 = xmm4 + a2 * b2;
9007 xmm5 = xmm5 + a3 * b1;
9008 xmm6 = xmm6 + a3 * b2;
9009 xmm7 = xmm7 + a4 * b1;
9010 xmm8 = xmm8 + a4 * b2;
9013 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9014 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9015 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9016 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
9017 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
9018 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
9019 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
9020 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Tile of 2 rows x 2 SIMD columns for the rows left over by the 4-row loop.
9023 for( ; (i+2UL) <= iend; i+=2UL )
9025 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9026 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9027 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
9028 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
9030 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9032 for(
size_t k=kbegin; k<kend; ++k ) {
9033 const IntrinsicType a1(
set( A(i ,k) ) );
9034 const IntrinsicType a2(
set( A(i+1UL,k) ) );
9035 const IntrinsicType b1( B.load(k,j ) );
9036 const IntrinsicType b2( B.load(k,j1) );
9037 xmm1 = xmm1 + a1 * b1;
9038 xmm2 = xmm2 + a1 * b2;
9039 xmm3 = xmm3 + a2 * b1;
9040 xmm4 = xmm4 + a2 * b2;
9043 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9044 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9045 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9046 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Single row x 2 SIMD columns (the enclosing guard is not visible in this listing).
9051 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9052 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9053 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9054 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
9056 IntrinsicType xmm1, xmm2;
9058 for(
size_t k=kbegin; k<kend; ++k ) {
9059 const IntrinsicType a1(
set( A(i,k) ) );
9060 xmm1 = xmm1 + a1 * B.load(k,j );
9061 xmm2 = xmm2 + a1 * B.load(k,j1);
9064 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9065 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// One SIMD column at a time, visiting every row of the current i-block.
9071 for(
size_t i=ii; i<iend; ++i )
9073 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9074 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9075 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9076 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
9080 for(
size_t k=kbegin; k<kend; ++k ) {
9081 const IntrinsicType a1(
set( A(i,k) ) );
9082 xmm1 = xmm1 + a1 * B.load(k,j);
9085 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar clean-up for the columns up to jend that the SIMD loops did not cover;
// runs only when 'remainder' is set.
9089 for( ; remainder && j<jend; ++j )
9091 for(
size_t i=ii; i<iend; ++i )
9093 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9094 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9095 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9096 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
9100 for(
size_t k=kbegin; k<kend; ++k ) {
9101 value += A(i,k) * B(k,j);
9104 (~C)(i,j) += value * scalar;
// Large add-assignment kernel, vectorized overload for a column-major target
// (DenseMatrix<MT3,true>). Takes part in overload resolution only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Adds (A * B) * scalar to C,
// walking the operands in i/j/k blocks; here A is read with SIMD loads along the
// row index and the elements of B are passed through set().
// NOTE(review): this listing omits some original lines (the remaining template
// parameters, braces, several loop headers and the declarations of i, i1, i2, i3,
// j, xmm1 and value); the comments below describe only what is visible here.
9128 template<
typename MT3
9132 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9133 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9135 typedef IntrinsicTrait<ElementType> IT;
// Extents used below: i runs up to M, j up to N and k up to K.
9137 const size_t M( A.rows() );
9138 const size_t N( B.columns() );
9139 const size_t K( A.columns() );
// The scalar clean-up loop at the end is only needed unless both C and A are padded.
9141 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// SIMD value built from 'scalar' (presumably a broadcast - confirm against set());
// it is applied once per accumulator when the tile is written back.
9143 const IntrinsicType factor(
set( scalar ) );
// Blocking over the rows of C and A.
9145 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
9147 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// With 'remainder' set, ipos is iend masked with -IT::size (rounds down to a multiple
// of IT::size, assuming IT::size is a power of two - TODO confirm).
9149 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Blocking over the columns of C and B.
9152 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
9154 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Blocking over the contraction index; [kk,ktmp) is the current k-block.
9156 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
9158 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Tile of 4 SIMD rows (row offsets i, i1, i2, i3) x 2 columns, 8 accumulators.
9170 for( ; (j+2UL) <= jend; j+=2UL )
// k-range: clamped to the current k-block; additionally starts no earlier than i for an
// upper A or j for a lower B, and ends no later than i+IT::size*4 for a lower A or
// j+2 for an upper B.
9172 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9173 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9174 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
9175 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// NOTE(review): the accumulators are not explicitly initialized here; presumably the
// default constructor of IntrinsicType zero-initializes - confirm.
9177 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9179 for(
size_t k=kbegin; k<kend; ++k ) {
9180 const IntrinsicType a1( A.load(i ,k) );
9181 const IntrinsicType a2( A.load(i1,k) );
9182 const IntrinsicType a3( A.load(i2,k) );
9183 const IntrinsicType a4( A.load(i3,k) );
9184 const IntrinsicType b1(
set( B(k,j ) ) );
9185 const IntrinsicType b2(
set( B(k,j+1UL) ) );
9186 xmm1 = xmm1 + a1 * b1;
9187 xmm2 = xmm2 + a2 * b1;
9188 xmm3 = xmm3 + a3 * b1;
9189 xmm4 = xmm4 + a4 * b1;
9190 xmm5 = xmm5 + a1 * b2;
9191 xmm6 = xmm6 + a2 * b2;
9192 xmm7 = xmm7 + a3 * b2;
9193 xmm8 = xmm8 + a4 * b2;
// Write-back: C += accumulator * factor for every element of the tile.
9196 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9197 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9198 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
9199 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
9200 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
9201 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
9202 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
9203 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// 4 SIMD rows x single column (the enclosing guard is not visible in this listing).
9208 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9209 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9210 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
9211 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9213 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9215 for(
size_t k=kbegin; k<kend; ++k ) {
9216 const IntrinsicType b1(
set( B(k,j) ) );
9217 xmm1 = xmm1 + A.load(i ,k) * b1;
9218 xmm2 = xmm2 + A.load(i1,k) * b1;
9219 xmm3 = xmm3 + A.load(i2,k) * b1;
9220 xmm4 = xmm4 + A.load(i3,k) * b1;
9223 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9224 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9225 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
9226 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// Tile of 2 SIMD rows (row offsets i and i1) x 4 columns, 8 accumulators.
9236 for( ; (j+4UL) <= jend; j+=4UL )
9238 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9239 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9240 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
9241 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
9243 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9245 for(
size_t k=kbegin; k<kend; ++k ) {
9246 const IntrinsicType a1( A.load(i ,k) );
9247 const IntrinsicType a2( A.load(i1,k) );
9248 const IntrinsicType b1(
set( B(k,j ) ) );
9249 const IntrinsicType b2(
set( B(k,j+1UL) ) );
9250 const IntrinsicType b3(
set( B(k,j+2UL) ) );
9251 const IntrinsicType b4(
set( B(k,j+3UL) ) );
9252 xmm1 = xmm1 + a1 * b1;
9253 xmm2 = xmm2 + a2 * b1;
9254 xmm3 = xmm3 + a1 * b2;
9255 xmm4 = xmm4 + a2 * b2;
9256 xmm5 = xmm5 + a1 * b3;
9257 xmm6 = xmm6 + a2 * b3;
9258 xmm7 = xmm7 + a1 * b4;
9259 xmm8 = xmm8 + a2 * b4;
9262 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9263 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9264 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9265 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9266 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
9267 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
9268 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
9269 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// Tile of 2 SIMD rows x 2 columns for the columns left over by the 4-column loop.
9272 for( ; (j+2UL) <= jend; j+=2UL )
9274 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9275 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9276 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
9277 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9279 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9281 for(
size_t k=kbegin; k<kend; ++k ) {
9282 const IntrinsicType a1( A.load(i ,k) );
9283 const IntrinsicType a2( A.load(i1,k) );
9284 const IntrinsicType b1(
set( B(k,j ) ) );
9285 const IntrinsicType b2(
set( B(k,j+1UL) ) );
9286 xmm1 = xmm1 + a1 * b1;
9287 xmm2 = xmm2 + a2 * b1;
9288 xmm3 = xmm3 + a1 * b2;
9289 xmm4 = xmm4 + a2 * b2;
9292 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9293 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9294 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9295 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// 2 SIMD rows x single column (the enclosing guard is not visible in this listing).
9300 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9301 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9302 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
9303 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9305 IntrinsicType xmm1, xmm2;
9307 for(
size_t k=kbegin; k<kend; ++k ) {
9308 const IntrinsicType b1(
set( B(k,j) ) );
9309 xmm1 = xmm1 + A.load(i ,k) * b1;
9310 xmm2 = xmm2 + A.load(i1,k) * b1;
9313 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9314 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// One SIMD row group at a time, visiting every column of the current j-block.
9320 for(
size_t j=jj; j<jend; ++j )
9322 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9323 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9324 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
9325 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9329 for(
size_t k=kbegin; k<kend; ++k ) {
9330 const IntrinsicType b1(
set( B(k,j) ) );
9331 xmm1 = xmm1 + A.load(i,k) * b1;
9334 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar clean-up for the rows up to iend that the SIMD loops did not cover;
// runs only when 'remainder' is set.
9338 for( ; remainder && i<iend; ++i )
9340 for(
size_t j=jj; j<jend; ++j )
9342 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9343 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9344 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
9345 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9349 for(
size_t k=kbegin; k<kend; ++k ) {
9350 value += A(i,k) * B(k,j);
9353 (~C)(i,j) += value * scalar;
9376 template<
typename MT3
9380 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
9381 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9383 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based add-assignment kernel. Takes part in overload resolution only when
// UseBlasKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the remaining template parameters, the braces and the declarations
// of 'ET' and 'tmp' are on lines that are not part of this listing.
9402 template<
typename MT3
9406 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
9407 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular A: trmm is applied to 'tmp' with A on the left side (lower/upper chosen
// from IsLower<MT4>) and the converted scalar; the result is then added to C.
9411 if( IsTriangular<MT4>::value ) {
9413 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9414 addAssign( C, tmp );
// Triangular B: same scheme with B on the right side.
9416 else if( IsTriangular<MT5>::value ) {
9418 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9419 addAssign( C, tmp );
// Remaining case: gemm directly on C; the last two arguments are presumably the
// BLAS alpha (the scalar) and beta (1, i.e. accumulate into C) - confirm against gemm().
9422 gemm( C, A, B, ET(scalar), ET(1) );
// Subtraction assignment of a scaled matrix product (DMatScalarMultExpr) from a
// dense matrix 'lhs'.
// NOTE(review): large parts of this function (the second template parameter, the
// braces, the body of the early-exit branch and the definitions of A and B) are on
// lines that are not part of this listing.
9444 template<
typename MT
9446 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix multiplication expression.
9453 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
9454 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Empty target or empty contraction dimension (branch body not visible here;
// presumably an early return - confirm).
9456 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the unwrapped target and the scalar of 'rhs'.
9470 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
9485 template<
typename MT3
9489 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9491 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
9492 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9493 selectSmallSubAssignKernel( C, A, B, scalar );
9495 selectBlasSubAssignKernel( C, A, B, scalar );
9513 template<
typename MT3
9517 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
9518 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9520 const ResultType tmp(
serial( A * B * scalar ) );
9521 subAssign( C, tmp );
// Default subtraction assignment kernel for a row-major target when only B is
// diagonal (enabled for Not<IsDiagonal<MT4>> and IsDiagonal<MT5>): only B(j,j) is
// read from B, so each update is C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): the remaining template parameters, the braces and the else-branches
// of the two range ternaries are on lines that are not part of this listing.
9539 template<
typename MT3
9543 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
9544 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9546 const size_t M( A.rows() );
9547 const size_t N( B.columns() );
// Edge length of the square tiles the iteration space is split into.
9549 const size_t block( BLOCK_SIZE );
9551 for(
size_t ii=0UL; ii<M; ii+=block ) {
9552 const size_t iend(
min( M, ii+block ) );
9553 for(
size_t jj=0UL; jj<N; jj+=block ) {
9554 const size_t jend(
min( N, jj+block ) );
9555 for(
size_t i=ii; i<iend; ++i )
// Column range of row i within the tile: for an upper A it starts no earlier than
// the diagonal (one past it when strictly upper), for a lower A it ends no later
// than one past the diagonal (at the diagonal when strictly lower).
9557 const size_t jbegin( ( IsUpper<MT4>::value )
9558 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9560 const size_t jpos( ( IsLower<MT4>::value )
9561 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9564 for(
size_t j=jbegin; j<jpos; ++j ) {
9565 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a column-major target when only B is
// diagonal (enabled for Not<IsDiagonal<MT4>> and IsDiagonal<MT5>): processes C
// column by column with C(i,j) -= A(i,j) * B(j,j) * scalar, two rows per iteration.
// NOTE(review): the remaining template parameters, the braces, the else-branches of
// the two range ternaries and the guard of the final single-row update are on lines
// that are not part of this listing.
9587 template<
typename MT3
9591 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
9592 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9594 const size_t M( A.rows() );
9595 const size_t N( B.columns() );
9597 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed for a lower A (start) and an upper A (end).
9599 const size_t ibegin( ( IsLower<MT4>::value )
9600 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9602 const size_t iend( ( IsUpper<MT4>::value )
9603 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the row count rounded down to an even number, so the loop below
// can handle two rows per iteration.
9607 const size_t inum( iend - ibegin );
9608 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
9610 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9611 (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9612 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update for row ipos, i.e. the left-over row of an odd row count (presumably guarded
// by a check on a line that is not visible here - confirm).
9615 (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a row-major target when only A is
// diagonal (enabled for IsDiagonal<MT4> and Not<IsDiagonal<MT5>>): processes C row
// by row with C(i,j) -= A(i,i) * B(i,j) * scalar, two columns per iteration.
// NOTE(review): the remaining template parameters, the braces, the else-branches of
// the two range ternaries and the guard of the final single-column update are on
// lines that are not part of this listing.
9635 template<
typename MT3
9639 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
9640 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9642 const size_t M( A.rows() );
9643 const size_t N( B.columns() );
9645 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed for an upper B (start) and a lower B (end).
9647 const size_t jbegin( ( IsUpper<MT5>::value )
9648 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9650 const size_t jend( ( IsLower<MT5>::value )
9651 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin plus the column count rounded down to an even number, so the loop
// below can handle two columns per iteration.
9655 const size_t jnum( jend - jbegin );
9656 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
9658 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9659 (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9660 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update for column jpos, i.e. the left-over column of an odd column count (presumably
// guarded by a check on a line that is not visible here - confirm).
9663 (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction assignment kernel for a column-major target when only A is
// diagonal (enabled for IsDiagonal<MT4> and Not<IsDiagonal<MT5>>): only A(i,i) is
// read from A, so each update is C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): the remaining template parameters, the braces and the else-branches
// of the two range ternaries are on lines that are not part of this listing.
9683 template<
typename MT3
9687 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
9688 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9690 const size_t M( A.rows() );
9691 const size_t N( B.columns() );
// Edge length of the square tiles the iteration space is split into.
9693 const size_t block( BLOCK_SIZE );
9695 for(
size_t jj=0UL; jj<N; jj+=block ) {
9696 const size_t jend(
min( N, jj+block ) );
9697 for(
size_t ii=0UL; ii<M; ii+=block ) {
9698 const size_t iend(
min( M, ii+block ) );
9699 for(
size_t j=jj; j<jend; ++j )
// Row range of column j within the tile: for a lower B it starts no earlier than the
// diagonal (one past it when strictly lower), for an upper B it ends no later than
// one past the diagonal (at the diagonal when strictly upper).
9701 const size_t ibegin( ( IsLower<MT5>::value )
9702 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9704 const size_t ipos( ( IsUpper<MT5>::value )
9705 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9708 for(
size_t i=ibegin; i<ipos; ++i ) {
9709 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
9731 template<
typename MT3
9735 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
9736 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9738 for(
size_t i=0UL; i<A.rows(); ++i ) {
9739 C(i,i) -= A(i,i) * B(i,i) * scalar;
9758 template<
typename MT3
9762 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9763 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9765 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small subtraction assignment kernel, vectorized overload for a row-major target
// (DenseMatrix<MT3,false>). Takes part in overload resolution only when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds. Subtracts (A * B) * scalar from
// C without blocking, using progressively narrower SIMD column tiles.
// NOTE(review): this listing omits some original lines (the remaining template
// parameters, braces, the j-loop headers and the declarations of j, i, value,
// value1 and value2); the comments below describe only what is visible here.
9784 template<
typename MT3
9788 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9789 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9791 typedef IntrinsicTrait<ElementType> IT;
// Extents used below: i runs up to M, j up to N and k up to K.
9793 const size_t M( A.rows() );
9794 const size_t N( B.columns() );
9795 const size_t K( A.columns() );
// The scalar clean-up loop at the end is only needed unless both C and B are padded.
9797 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With 'remainder' set, jpos is N masked with -IT::size (rounds down to a multiple of
// IT::size, assuming IT::size is a power of two - TODO confirm).
9799 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// SIMD value built from 'scalar' (presumably a broadcast - confirm against set()).
9802 const IntrinsicType factor(
set( scalar ) );
// Tile of 1 row x 8 SIMD columns.
9807 for(
size_t i=0UL; i<M; ++i )
// k-range narrowed by the triangular structure of the operands: start from an upper A
// and/or a lower B, end from a lower A and/or an upper B (never beyond K).
9809 const size_t kbegin( ( IsUpper<MT4>::value )
9810 ?( ( IsLower<MT5>::value )
9811 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9812 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9813 :( IsLower<MT5>::value ? j : 0UL ) );
9814 const size_t kend( ( IsLower<MT4>::value )
9815 ?( ( IsUpper<MT5>::value )
9816 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
9817 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9818 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// NOTE(review): the accumulators are not explicitly initialized here; presumably the
// default constructor of IntrinsicType zero-initializes - confirm.
9820 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9822 for(
size_t k=kbegin; k<kend; ++k ) {
9823 const IntrinsicType a1(
set( A(i,k) ) );
9824 xmm1 = xmm1 + a1 * B.load(k,j );
9825 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
9826 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
9827 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
9828 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
9829 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
9830 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
9831 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Write-back: C -= accumulator * factor.
// NOTE(review): no store of xmm2 (column j+IT::size) is visible in this listing and the
// embedded numbering skips a line here - confirm against the original header.
9834 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9836 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) - xmm3 * factor );
9837 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) - xmm4 * factor );
9838 (~C).store( i, j+
IT::size*4UL, (~C).load(i,j+
IT::size*4UL) - xmm5 * factor );
9839 (~C).store( i, j+
IT::size*5UL, (~C).load(i,j+
IT::size*5UL) - xmm6 * factor );
9840 (~C).store( i, j+
IT::size*6UL, (~C).load(i,j+
IT::size*6UL) - xmm7 * factor );
9841 (~C).store( i, j+
IT::size*7UL, (~C).load(i,j+
IT::size*7UL) - xmm8 * factor );
// Tile of 2 rows x 4 SIMD columns, 8 accumulators.
9849 for( ; (i+2UL) <= M; i+=2UL )
9851 const size_t kbegin( ( IsUpper<MT4>::value )
9852 ?( ( IsLower<MT5>::value )
9853 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9854 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9855 :( IsLower<MT5>::value ? j : 0UL ) );
9856 const size_t kend( ( IsLower<MT4>::value )
9857 ?( ( IsUpper<MT5>::value )
9858 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
9859 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9860 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
9862 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9864 for(
size_t k=kbegin; k<kend; ++k ) {
9865 const IntrinsicType a1(
set( A(i ,k) ) );
9866 const IntrinsicType a2(
set( A(i+1UL,k) ) );
9867 const IntrinsicType b1( B.load(k,j ) );
9868 const IntrinsicType b2( B.load(k,j+
IT::size ) );
9869 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
9870 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
9871 xmm1 = xmm1 + a1 * b1;
9872 xmm2 = xmm2 + a1 * b2;
9873 xmm3 = xmm3 + a1 * b3;
9874 xmm4 = xmm4 + a1 * b4;
9875 xmm5 = xmm5 + a2 * b1;
9876 xmm6 = xmm6 + a2 * b2;
9877 xmm7 = xmm7 + a2 * b3;
9878 xmm8 = xmm8 + a2 * b4;
// NOTE(review): no store of xmm2 (row i, column j+IT::size) is visible in this listing
// and the embedded numbering skips a line here - confirm against the original header.
9881 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9883 (~C).store( i , j+
IT::size*2UL, (~C).load(i ,j+
IT::size*2UL) - xmm3 * factor );
9884 (~C).store( i , j+
IT::size*3UL, (~C).load(i ,j+
IT::size*3UL) - xmm4 * factor );
9885 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9886 (~C).store( i+1UL, j+
IT::size , (~C).load(i+1UL,j+
IT::size ) - xmm6 * factor );
9887 (~C).store( i+1UL, j+
IT::size*2UL, (~C).load(i+1UL,j+
IT::size*2UL) - xmm7 * factor );
9888 (~C).store( i+1UL, j+
IT::size*3UL, (~C).load(i+1UL,j+
IT::size*3UL) - xmm8 * factor );
// Single row x 4 SIMD columns (the enclosing guard is not visible in this listing).
9893 const size_t kbegin( ( IsUpper<MT4>::value )
9894 ?( ( IsLower<MT5>::value )
9895 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9896 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9897 :( IsLower<MT5>::value ? j : 0UL ) );
9898 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
9900 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9902 for(
size_t k=kbegin; k<kend; ++k ) {
9903 const IntrinsicType a1(
set( A(i,k) ) );
9904 xmm1 = xmm1 + a1 * B.load(k,j );
9905 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
9906 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
9907 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
// NOTE(review): no store of xmm2 is visible in this listing and the embedded numbering
// skips a line here - confirm against the original header.
9910 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9912 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) - xmm3 * factor );
9913 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) - xmm4 * factor );
// Tile of 2 rows x 2 SIMD columns, 4 accumulators.
9921 for( ; (i+2UL) <= M; i+=2UL )
9923 const size_t kbegin( ( IsUpper<MT4>::value )
9924 ?( ( IsLower<MT5>::value )
9925 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9926 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9927 :( IsLower<MT5>::value ? j : 0UL ) );
9928 const size_t kend( ( IsLower<MT4>::value )
9929 ?( ( IsUpper<MT5>::value )
9930 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
9931 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9932 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
9934 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9936 for(
size_t k=kbegin; k<kend; ++k ) {
9937 const IntrinsicType a1(
set( A(i ,k) ) );
9938 const IntrinsicType a2(
set( A(i+1UL,k) ) );
9939 const IntrinsicType b1( B.load(k,j ) );
9940 const IntrinsicType b2( B.load(k,j+
IT::size) );
9941 xmm1 = xmm1 + a1 * b1;
9942 xmm2 = xmm2 + a1 * b2;
9943 xmm3 = xmm3 + a2 * b1;
9944 xmm4 = xmm4 + a2 * b2;
// NOTE(review): no store of xmm2 (row i, column j+IT::size) is visible in this listing
// and the embedded numbering skips a line here - confirm against the original header.
9947 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9949 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
9950 (~C).store( i+1UL, j+
IT::size, (~C).load(i+1UL,j+
IT::size) - xmm4 * factor );
// Single row x 2 SIMD columns (the enclosing guard is not visible in this listing).
9955 const size_t kbegin( ( IsUpper<MT4>::value )
9956 ?( ( IsLower<MT5>::value )
9957 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9958 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9959 :( IsLower<MT5>::value ? j : 0UL ) );
9960 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
9962 IntrinsicType xmm1, xmm2;
9964 for(
size_t k=kbegin; k<kend; ++k ) {
9965 const IntrinsicType a1(
set( A(i,k) ) );
9966 xmm1 = xmm1 + a1 * B.load(k,j );
9967 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
// NOTE(review): no store of xmm2 is visible in this listing - confirm against the
// original header.
9970 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
// Tile of 2 rows x 1 SIMD column.
9979 for( ; (i+2UL) <= M; i+=2UL )
9981 const size_t kbegin( ( IsUpper<MT4>::value )
9982 ?( ( IsLower<MT5>::value )
9983 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9984 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9985 :( IsLower<MT5>::value ? j : 0UL ) );
9986 const size_t kend( ( IsLower<MT4>::value )
9987 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9990 IntrinsicType xmm1, xmm2;
9992 for(
size_t k=kbegin; k<kend; ++k ) {
9993 const IntrinsicType b1( B.load(k,j) );
9994 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
9995 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
9998 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9999 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// Single row x 1 SIMD column (the enclosing guard is not visible in this listing).
10004 const size_t kbegin( ( IsUpper<MT4>::value )
10005 ?( ( IsLower<MT5>::value )
10006 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10007 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10008 :( IsLower<MT5>::value ? j : 0UL ) );
10010 IntrinsicType xmm1;
10012 for(
size_t k=kbegin; k<K; ++k ) {
10013 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
10016 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar clean-up for the columns up to N that the SIMD loops did not cover; runs only
// when 'remainder' is set. Rows are handled in pairs, then a possible single row.
10020 for( ; remainder && j<N; ++j )
10024 for( ; (i+2UL) <= M; i+=2UL )
10026 const size_t kbegin( ( IsUpper<MT4>::value )
10027 ?( ( IsLower<MT5>::value )
10028 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10029 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10030 :( IsLower<MT5>::value ? j : 0UL ) );
10031 const size_t kend( ( IsLower<MT4>::value )
10032 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10038 for(
size_t k=kbegin; k<kend; ++k ) {
10039 value1 += A(i ,k) * B(k,j);
10040 value2 += A(i+1UL,k) * B(k,j);
10043 (~C)(i ,j) -= value1 * scalar;
10044 (~C)(i+1UL,j) -= value2 * scalar;
// Single remaining row of the scalar clean-up (the enclosing guard is not visible in
// this listing).
10049 const size_t kbegin( ( IsUpper<MT4>::value )
10050 ?( ( IsLower<MT5>::value )
10051 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10052 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10053 :( IsLower<MT5>::value ? j : 0UL ) );
10057 for(
size_t k=kbegin; k<K; ++k ) {
10058 value += A(i,k) * B(k,j);
10061 (~C)(i,j) -= value * scalar;
10082 template<
typename MT3
10086 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10087 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10089 typedef IntrinsicTrait<ElementType> IT;
10091 const size_t M( A.rows() );
10092 const size_t N( B.columns() );
10093 const size_t K( A.columns() );
10095 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10097 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
10100 const IntrinsicType factor(
set( scalar ) );
10105 for(
size_t j=0UL; j<N; ++j )
10107 const size_t kbegin( ( IsLower<MT5>::value )
10108 ?( ( IsUpper<MT4>::value )
10109 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10110 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10111 :( IsUpper<MT4>::value ? i : 0UL ) );
10112 const size_t kend( ( IsUpper<MT5>::value )
10113 ?( ( IsLower<MT4>::value )
10114 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10115 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10116 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
10118 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10120 for(
size_t k=kbegin; k<kend; ++k ) {
10121 const IntrinsicType b1(
set( B(k,j) ) );
10122 xmm1 = xmm1 + A.load(i ,k) * b1;
10123 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
10124 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
10125 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
10126 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
10127 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
10128 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
10129 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
10132 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10134 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) - xmm3 * factor );
10135 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) - xmm4 * factor );
10136 (~C).store( i+
IT::size*4UL, j, (~C).load(i+
IT::size*4UL,j) - xmm5 * factor );
10137 (~C).store( i+
IT::size*5UL, j, (~C).load(i+
IT::size*5UL,j) - xmm6 * factor );
10138 (~C).store( i+
IT::size*6UL, j, (~C).load(i+
IT::size*6UL,j) - xmm7 * factor );
10139 (~C).store( i+
IT::size*7UL, j, (~C).load(i+
IT::size*7UL,j) - xmm8 * factor );
10147 for( ; (j+2UL) <= N; j+=2UL )
10149 const size_t kbegin( ( IsLower<MT5>::value )
10150 ?( ( IsUpper<MT4>::value )
10151 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10152 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10153 :( IsUpper<MT4>::value ? i : 0UL ) );
10154 const size_t kend( ( IsUpper<MT5>::value )
10155 ?( ( IsLower<MT4>::value )
10156 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10157 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10158 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
10160 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10162 for(
size_t k=kbegin; k<kend; ++k ) {
10163 const IntrinsicType a1( A.load(i ,k) );
10164 const IntrinsicType a2( A.load(i+
IT::size ,k) );
10165 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
10166 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
10167 const IntrinsicType b1(
set( B(k,j ) ) );
10168 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10169 xmm1 = xmm1 + a1 * b1;
10170 xmm2 = xmm2 + a2 * b1;
10171 xmm3 = xmm3 + a3 * b1;
10172 xmm4 = xmm4 + a4 * b1;
10173 xmm5 = xmm5 + a1 * b2;
10174 xmm6 = xmm6 + a2 * b2;
10175 xmm7 = xmm7 + a3 * b2;
10176 xmm8 = xmm8 + a4 * b2;
10179 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10181 (~C).store( i+
IT::size*2UL, j , (~C).load(i+
IT::size*2UL,j ) - xmm3 * factor );
10182 (~C).store( i+
IT::size*3UL, j , (~C).load(i+
IT::size*3UL,j ) - xmm4 * factor );
10183 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10184 (~C).store( i+
IT::size , j+1UL, (~C).load(i+
IT::size ,j+1UL) - xmm6 * factor );
10185 (~C).store( i+
IT::size*2UL, j+1UL, (~C).load(i+
IT::size*2UL,j+1UL) - xmm7 * factor );
10186 (~C).store( i+
IT::size*3UL, j+1UL, (~C).load(i+
IT::size*3UL,j+1UL) - xmm8 * factor );
10191 const size_t kbegin( ( IsLower<MT5>::value )
10192 ?( ( IsUpper<MT4>::value )
10193 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10194 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10195 :( IsUpper<MT4>::value ? i : 0UL ) );
10196 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
10198 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10200 for(
size_t k=kbegin; k<kend; ++k ) {
10201 const IntrinsicType b1(
set( B(k,j) ) );
10202 xmm1 = xmm1 + A.load(i ,k) * b1;
10203 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
10204 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
10205 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
10208 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10210 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) - xmm3 * factor );
10211 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) - xmm4 * factor );
10219 for( ; (j+2UL) <= N; j+=2UL )
10221 const size_t kbegin( ( IsLower<MT5>::value )
10222 ?( ( IsUpper<MT4>::value )
10223 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10224 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10225 :( IsUpper<MT4>::value ? i : 0UL ) );
10226 const size_t kend( ( IsUpper<MT5>::value )
10227 ?( ( IsLower<MT4>::value )
10228 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10229 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10230 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
10232 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10234 for(
size_t k=kbegin; k<kend; ++k ) {
10235 const IntrinsicType a1( A.load(i ,k) );
10236 const IntrinsicType a2( A.load(i+
IT::size,k) );
10237 const IntrinsicType b1(
set( B(k,j ) ) );
10238 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10239 xmm1 = xmm1 + a1 * b1;
10240 xmm2 = xmm2 + a2 * b1;
10241 xmm3 = xmm3 + a1 * b2;
10242 xmm4 = xmm4 + a2 * b2;
10245 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10247 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10248 (~C).store( i+
IT::size, j+1UL, (~C).load(i+
IT::size,j+1UL) - xmm4 * factor );
10253 const size_t kbegin( ( IsLower<MT5>::value )
10254 ?( ( IsUpper<MT4>::value )
10255 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10256 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10257 :( IsUpper<MT4>::value ? i : 0UL ) );
10258 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
10260 IntrinsicType xmm1, xmm2;
10262 for(
size_t k=kbegin; k<kend; ++k ) {
10263 const IntrinsicType b1(
set( B(k,j) ) );
10264 xmm1 = xmm1 + A.load(i ,k) * b1;
10265 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
10268 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10277 for( ; (j+2UL) <= N; j+=2UL )
10279 const size_t kbegin( ( IsLower<MT5>::value )
10280 ?( ( IsUpper<MT4>::value )
10281 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10282 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10283 :( IsUpper<MT4>::value ? i : 0UL ) );
10284 const size_t kend( ( IsUpper<MT5>::value )
10285 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10288 IntrinsicType xmm1, xmm2;
10290 for(
size_t k=kbegin; k<kend; ++k ) {
10291 const IntrinsicType a1( A.load(i,k) );
10292 xmm1 = xmm1 + a1 *
set( B(k,j ) );
10293 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
10296 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10297 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
10302 const size_t kbegin( ( IsLower<MT5>::value )
10303 ?( ( IsUpper<MT4>::value )
10304 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10305 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10306 :( IsUpper<MT4>::value ? i : 0UL ) );
10308 IntrinsicType xmm1;
10310 for(
size_t k=kbegin; k<K; ++k ) {
10311 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
10314 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10318 for( ; remainder && i<M; ++i )
10322 for( ; (j+2UL) <= N; j+=2UL )
10324 const size_t kbegin( ( IsLower<MT5>::value )
10325 ?( ( IsUpper<MT4>::value )
10326 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10327 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10328 :( IsUpper<MT4>::value ? i : 0UL ) );
10329 const size_t kend( ( IsUpper<MT5>::value )
10330 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10336 for(
size_t k=kbegin; k<kend; ++k ) {
10337 value1 += A(i,k) * B(k,j );
10338 value2 += A(i,k) * B(k,j+1UL);
10341 (~C)(i,j ) -= value1 * scalar;
10342 (~C)(i,j+1UL) -= value2 * scalar;
10347 const size_t kbegin( ( IsLower<MT5>::value )
10348 ?( ( IsUpper<MT4>::value )
10349 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10350 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10351 :( IsUpper<MT4>::value ? i : 0UL ) );
10355 for(
size_t k=kbegin; k<K; ++k ) {
10356 value += A(i,k) * B(k,j);
10359 (~C)(i,j) -= value * scalar;
// Fallback overload of the "large matrix" subtraction-assignment kernel for a scaled
// matrix product. Its return type is DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type,
// so it is presumably the overload picked when the vectorized default kernel does not apply
// (the EnableIf overloads with the same condition take DenseMatrix<MT3,SO>& instead).
//   C      - target matrix of the subtraction assignment
//   A, B   - left and right operands of the product
//   scalar - scaling factor
// NOTE(review): the remaining template parameters and the braces are missing from this listing.
10379 template<
typename MT3
10383 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10384 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No blocked/vectorized work here: all four arguments are forwarded unchanged.
10386 selectDefaultSubAssignKernel( C, A, B, scalar );
10405 template<
typename MT3
10409 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10410 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10412 typedef IntrinsicTrait<ElementType> IT;
10414 const size_t M( A.rows() );
10415 const size_t N( B.columns() );
10416 const size_t K( A.columns() );
10418 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
10420 const IntrinsicType factor(
set( scalar ) );
10422 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
10424 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
10426 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
10429 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
10431 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
10433 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
10435 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
10442 const size_t j2( j+
IT::size*2UL );
10443 const size_t j3( j+
IT::size*3UL );
10447 for( ; (i+2UL) <= iend; i+=2UL )
10449 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10450 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10451 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10452 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
10454 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10456 for(
size_t k=kbegin; k<kend; ++k ) {
10457 const IntrinsicType a1(
set( A(i ,k) ) );
10458 const IntrinsicType a2(
set( A(i+1UL,k) ) );
10459 const IntrinsicType b1( B.load(k,j ) );
10460 const IntrinsicType b2( B.load(k,j1) );
10461 const IntrinsicType b3( B.load(k,j2) );
10462 const IntrinsicType b4( B.load(k,j3) );
10463 xmm1 = xmm1 + a1 * b1;
10464 xmm2 = xmm2 + a1 * b2;
10465 xmm3 = xmm3 + a1 * b3;
10466 xmm4 = xmm4 + a1 * b4;
10467 xmm5 = xmm5 + a2 * b1;
10468 xmm6 = xmm6 + a2 * b2;
10469 xmm7 = xmm7 + a2 * b3;
10470 xmm8 = xmm8 + a2 * b4;
10473 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10474 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10475 (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
10476 (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
10477 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
10478 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
10479 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
10480 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
10485 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10486 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10487 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10488 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
10490 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10492 for(
size_t k=kbegin; k<kend; ++k ) {
10493 const IntrinsicType a1(
set( A(i,k) ) );
10494 xmm1 = xmm1 + a1 * B.load(k,j );
10495 xmm2 = xmm2 + a1 * B.load(k,j1);
10496 xmm3 = xmm3 + a1 * B.load(k,j2);
10497 xmm4 = xmm4 + a1 * B.load(k,j3);
10500 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10501 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10502 (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
10503 (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
10513 for( ; (i+4UL) <= iend; i+=4UL )
10515 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10516 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10517 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
10518 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
10520 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10522 for(
size_t k=kbegin; k<kend; ++k ) {
10523 const IntrinsicType a1(
set( A(i ,k) ) );
10524 const IntrinsicType a2(
set( A(i+1UL,k) ) );
10525 const IntrinsicType a3(
set( A(i+2UL,k) ) );
10526 const IntrinsicType a4(
set( A(i+3UL,k) ) );
10527 const IntrinsicType b1( B.load(k,j ) );
10528 const IntrinsicType b2( B.load(k,j1) );
10529 xmm1 = xmm1 + a1 * b1;
10530 xmm2 = xmm2 + a1 * b2;
10531 xmm3 = xmm3 + a2 * b1;
10532 xmm4 = xmm4 + a2 * b2;
10533 xmm5 = xmm5 + a3 * b1;
10534 xmm6 = xmm6 + a3 * b2;
10535 xmm7 = xmm7 + a4 * b1;
10536 xmm8 = xmm8 + a4 * b2;
10539 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10540 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10541 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10542 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10543 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
10544 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
10545 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
10546 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
10549 for( ; (i+2UL) <= iend; i+=2UL )
10551 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10552 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10553 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10554 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
10556 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10558 for(
size_t k=kbegin; k<kend; ++k ) {
10559 const IntrinsicType a1(
set( A(i ,k) ) );
10560 const IntrinsicType a2(
set( A(i+1UL,k) ) );
10561 const IntrinsicType b1( B.load(k,j ) );
10562 const IntrinsicType b2( B.load(k,j1) );
10563 xmm1 = xmm1 + a1 * b1;
10564 xmm2 = xmm2 + a1 * b2;
10565 xmm3 = xmm3 + a2 * b1;
10566 xmm4 = xmm4 + a2 * b2;
10569 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10570 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10571 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10572 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10577 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10578 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10579 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10580 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
10582 IntrinsicType xmm1, xmm2;
10584 for(
size_t k=kbegin; k<kend; ++k ) {
10585 const IntrinsicType a1(
set( A(i,k) ) );
10586 xmm1 = xmm1 + a1 * B.load(k,j );
10587 xmm2 = xmm2 + a1 * B.load(k,j1);
10590 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10591 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10597 for(
size_t i=ii; i<iend; ++i )
10599 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10600 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10601 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10602 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
10604 IntrinsicType xmm1;
10606 for(
size_t k=kbegin; k<kend; ++k ) {
10607 const IntrinsicType a1(
set( A(i,k) ) );
10608 xmm1 = xmm1 + a1 * B.load(k,j);
10611 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10615 for( ; remainder && j<jend; ++j )
10617 for(
size_t i=ii; i<iend; ++i )
10619 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10620 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10621 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10622 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
10626 for(
size_t k=kbegin; k<kend; ++k ) {
10627 value += A(i,k) * B(k,j);
10630 (~C)(i,j) -= value * scalar;
10654 template<
typename MT3
10658 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10659 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10661 typedef IntrinsicTrait<ElementType> IT;
10663 const size_t M( A.rows() );
10664 const size_t N( B.columns() );
10665 const size_t K( A.columns() );
10667 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10669 const IntrinsicType factor(
set( scalar ) );
10671 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
10673 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
10675 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
10678 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
10680 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
10682 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
10684 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
10691 const size_t i2( i+
IT::size*2UL );
10692 const size_t i3( i+
IT::size*3UL );
10696 for( ; (j+2UL) <= jend; j+=2UL )
10698 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10699 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10700 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
10701 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10703 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10705 for(
size_t k=kbegin; k<kend; ++k ) {
10706 const IntrinsicType a1( A.load(i ,k) );
10707 const IntrinsicType a2( A.load(i1,k) );
10708 const IntrinsicType a3( A.load(i2,k) );
10709 const IntrinsicType a4( A.load(i3,k) );
10710 const IntrinsicType b1(
set( B(k,j ) ) );
10711 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10712 xmm1 = xmm1 + a1 * b1;
10713 xmm2 = xmm2 + a2 * b1;
10714 xmm3 = xmm3 + a3 * b1;
10715 xmm4 = xmm4 + a4 * b1;
10716 xmm5 = xmm5 + a1 * b2;
10717 xmm6 = xmm6 + a2 * b2;
10718 xmm7 = xmm7 + a3 * b2;
10719 xmm8 = xmm8 + a4 * b2;
10722 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10723 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10724 (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
10725 (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
10726 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10727 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
10728 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
10729 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
10734 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10735 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10736 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
10737 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10739 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10741 for(
size_t k=kbegin; k<kend; ++k ) {
10742 const IntrinsicType b1(
set( B(k,j) ) );
10743 xmm1 = xmm1 + A.load(i ,k) * b1;
10744 xmm2 = xmm2 + A.load(i1,k) * b1;
10745 xmm3 = xmm3 + A.load(i2,k) * b1;
10746 xmm4 = xmm4 + A.load(i3,k) * b1;
10749 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10750 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10751 (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
10752 (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
10762 for( ; (j+4UL) <= jend; j+=4UL )
10764 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10765 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10766 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
10767 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
10769 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10771 for(
size_t k=kbegin; k<kend; ++k ) {
10772 const IntrinsicType a1( A.load(i ,k) );
10773 const IntrinsicType a2( A.load(i1,k) );
10774 const IntrinsicType b1(
set( B(k,j ) ) );
10775 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10776 const IntrinsicType b3(
set( B(k,j+2UL) ) );
10777 const IntrinsicType b4(
set( B(k,j+3UL) ) );
10778 xmm1 = xmm1 + a1 * b1;
10779 xmm2 = xmm2 + a2 * b1;
10780 xmm3 = xmm3 + a1 * b2;
10781 xmm4 = xmm4 + a2 * b2;
10782 xmm5 = xmm5 + a1 * b3;
10783 xmm6 = xmm6 + a2 * b3;
10784 xmm7 = xmm7 + a1 * b4;
10785 xmm8 = xmm8 + a2 * b4;
10788 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10789 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10790 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10791 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10792 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
10793 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
10794 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
10795 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
10798 for( ; (j+2UL) <= jend; j+=2UL )
10800 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10801 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10802 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
10803 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10805 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10807 for(
size_t k=kbegin; k<kend; ++k ) {
10808 const IntrinsicType a1( A.load(i ,k) );
10809 const IntrinsicType a2( A.load(i1,k) );
10810 const IntrinsicType b1(
set( B(k,j ) ) );
10811 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10812 xmm1 = xmm1 + a1 * b1;
10813 xmm2 = xmm2 + a2 * b1;
10814 xmm3 = xmm3 + a1 * b2;
10815 xmm4 = xmm4 + a2 * b2;
10818 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10819 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10820 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10821 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10826 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10827 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10828 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
10829 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10831 IntrinsicType xmm1, xmm2;
10833 for(
size_t k=kbegin; k<kend; ++k ) {
10834 const IntrinsicType b1(
set( B(k,j) ) );
10835 xmm1 = xmm1 + A.load(i ,k) * b1;
10836 xmm2 = xmm2 + A.load(i1,k) * b1;
10839 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10840 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10846 for(
size_t j=jj; j<jend; ++j )
10848 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10849 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10850 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
10851 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10853 IntrinsicType xmm1;
10855 for(
size_t k=kbegin; k<kend; ++k ) {
10856 const IntrinsicType b1(
set( B(k,j) ) );
10857 xmm1 = xmm1 + A.load(i,k) * b1;
10860 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
10864 for( ; remainder && i<iend; ++i )
10866 for(
size_t j=jj; j<jend; ++j )
10868 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10869 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10870 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
10871 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10875 for(
size_t k=kbegin; k<kend; ++k ) {
10876 value += A(i,k) * B(k,j);
10879 (~C)(i,j) -= value * scalar;
// Fallback overload of the BLAS-based subtraction-assignment kernel for a scaled matrix
// product. Its return type is DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type, so it is
// presumably the overload picked when the BLAS kernel cannot be used for these types.
//   C      - target matrix of the subtraction assignment
//   A, B   - left and right operands of the product
//   scalar - scaling factor
// NOTE(review): the remaining template parameters and the braces are missing from this listing.
10902 template<
typename MT3
10906 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
10907 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Delegates to the large-matrix kernel selection with the arguments unchanged.
10909 selectLargeSubAssignKernel( C, A, B, scalar );
10914 #if BLAZE_BLAS_MODE
// BLAS-based subtraction-assignment kernel for a scaled matrix product; this overload is
// enabled through EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >.
//   C      - target matrix of the subtraction assignment
//   A, B   - left and right operands of the product
//   scalar - scaling factor (converted to ET before being handed to the BLAS wrappers)
// NOTE(review): the declaration of `tmp`, the remaining template parameters and the braces
// are missing from this listing; `tmp` is presumably a copy of the non-triangular operand
// that trmm overwrites in place - TODO confirm against the full header.
10928 template<
typename MT3
10932 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
10933 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Case 1: the left operand A is triangular. trmm is invoked with A on the left side
// (CblasLeft), the lower/upper flag chosen from IsLower<MT4>, and ET(scalar) as factor;
// the resulting tmp is then subtracted from C.
10937 if( IsTriangular<MT4>::value ) {
10939 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10940 subAssign( C, tmp );
// Case 2: the right operand B is triangular. Same scheme with B on the right side
// (CblasRight) and the lower/upper flag chosen from IsLower<MT5>.
10942 else if( IsTriangular<MT5>::value ) {
10944 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10945 subAssign( C, tmp );
// General case: one gemm call with the factors ET(-scalar) and ET(1). Presumably this
// evaluates C = (-scalar)*A*B + 1*C, i.e. the subtraction is folded into the negated
// factor - TODO confirm against the gemm wrapper's parameter order.
10948 gemm( C, A, B, ET(-scalar), ET(1) );
10981 template<
typename MT
10983 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
10984 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
10991 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
10992 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
10994 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
10997 else if( left.columns() == 0UL ) {
11012 smpAssign( ~lhs, A * B * rhs.scalar_ );
11031 template<
typename MT
11033 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11034 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11038 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
11050 const TmpType tmp( rhs );
11070 template<
typename MT
11072 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11073 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11080 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11081 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11083 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
11120 template<
typename MT
11122 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11123 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11130 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11131 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11133 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
11215 template<
typename T1
11217 inline const TDMatDMatMultExpr<T1,T2>
11241 template<
typename MT1,
typename MT2 >
11258 template<
typename MT1,
typename MT2 >
11275 template<
typename MT1,
typename MT2 >
11277 :
public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
11293 template<
typename MT1,
typename MT2 >
11295 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
11311 template<
typename MT1,
typename MT2 >
11313 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
11329 template<
typename MT1,
typename MT2 >
11331 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
11332 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
11348 template<
typename MT1,
typename MT2 >
11350 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
11366 template<
typename MT1,
typename MT2 >
11368 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
11384 template<
typename MT1,
typename MT2 >
11386 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
11387 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
11403 template<
typename MT1,
typename MT2,
typename VT >
11408 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11409 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11410 IsDenseVector<VT>::value && IsColumnVector<VT>::value
11411 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
11412 , INVALID_TYPE >::Type Type;
11421 template<
typename MT1,
typename MT2,
typename VT >
11426 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11427 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11428 IsSparseVector<VT>::value && IsColumnVector<VT>::value
11429 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
11430 , INVALID_TYPE >::Type Type;
11439 template<
typename VT,
typename MT1,
typename MT2 >
11444 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
11445 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11446 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11447 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11448 , INVALID_TYPE >::Type Type;
11457 template<
typename VT,
typename MT1,
typename MT2 >
11462 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
11463 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11464 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11465 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11466 , INVALID_TYPE >::Type Type;
11475 template<
typename MT1,
typename MT2,
bool AF >
11480 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
11481 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
11490 template<
typename MT1,
typename MT2 >
11495 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
11504 template<
typename MT1,
typename MT2 >
11509 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, simd_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
Compile time check whether the given type is a computational expression template. This type trait class...
Definition: IsComputation.h:89
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:244
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A = B \cdot C$ ).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given template...
Definition: IsTriangular.h:105
Header file for basic type definitions.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:437
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:226
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:436
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:383
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
Header file for the And class template.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:337
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:407
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:353
Constraint on the data type.
Header file for the IsComplexDouble type trait.
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:138
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:225
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:363
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:427
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:266
Header file for the Not class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:238
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:230
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:231
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:227
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:395
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:149
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:373
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:241
Constraints on the storage order of matrix types.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:232
Header file for the HasMutableDataAccess type trait.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
Header file for the IsDenseVector type trait.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:235
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:417
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:150
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:281
Header file for the complex data type.
Header file for the IsUpper type trait.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:229
Header file for exception macros.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:228
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.