35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
131 template<
typename MT1
133 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
134 ,
private MatMatMultExpr
135 ,
private Computation
// Compile-time flag templated on three types (T1, T2, T3), none of which is
// referenced by the lines visible here: `value` is non-zero when either
// evaluateLeft or evaluateRight is set. Both flags are declared outside this
// excerpt.
// NOTE(review): the closing brace of the struct is not part of this excerpt.
163 template<
typename T1,
typename T2,
typename T3 >
164 struct IsEvaluationRequired {
165 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate over three matrix types. The conjunction visible here
// requires: mutable data access on T1, const data access on T2 and T3, neither
// T2 nor T3 diagonal, all three types vectorizable, and IsFloat to hold for
// each of the three element types.
// NOTE(review): the line that opens the `value` expression is missing from this
// excerpt, so additional leading terms may exist - confirm against the full header.
175 template<
typename T1,
typename T2,
typename T3 >
176 struct UseSinglePrecisionKernel {
178 HasMutableDataAccess<T1>::value &&
179 HasConstDataAccess<T2>::value &&
180 HasConstDataAccess<T3>::value &&
181 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
182 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
183 IsFloat<typename T1::ElementType>::value &&
184 IsFloat<typename T2::ElementType>::value &&
185 IsFloat<typename T3::ElementType>::value };
// Compile-time predicate over three matrix types. The conjunction visible here
// requires: mutable data access on T1, const data access on T2 and T3, neither
// T2 nor T3 diagonal, all three types vectorizable, and IsDouble to hold for
// each of the three element types.
// NOTE(review): the line that opens the `value` expression is missing from this
// excerpt, so additional leading terms may exist - confirm against the full header.
195 template<
typename T1,
typename T2,
typename T3 >
196 struct UseDoublePrecisionKernel {
198 HasMutableDataAccess<T1>::value &&
199 HasConstDataAccess<T2>::value &&
200 HasConstDataAccess<T3>::value &&
201 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
202 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
203 IsDouble<typename T1::ElementType>::value &&
204 IsDouble<typename T2::ElementType>::value &&
205 IsDouble<typename T3::ElementType>::value };
// Compile-time predicate over three matrix types. The conjunction visible here
// requires: mutable data access on T1, const data access on T2 and T3, neither
// T2 nor T3 diagonal, all three types vectorizable, and every element type to
// be exactly `Type`, i.e. complex<float>.
// NOTE(review): the line that opens the `value` expression is missing from this
// excerpt, so additional leading terms may exist - confirm against the full header.
216 template<
typename T1,
typename T2,
typename T3 >
217 struct UseSinglePrecisionComplexKernel {
// Element type that all three operands must share.
218 typedef complex<float> Type;
220 HasMutableDataAccess<T1>::value &&
221 HasConstDataAccess<T2>::value &&
222 HasConstDataAccess<T3>::value &&
223 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
224 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
225 IsSame<typename T1::ElementType,Type>::value &&
226 IsSame<typename T2::ElementType,Type>::value &&
227 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate over three matrix types. The conjunction visible here
// requires: mutable data access on T1, const data access on T2 and T3, neither
// T2 nor T3 diagonal, all three types vectorizable, and every element type to
// be exactly `Type`, i.e. complex<double>.
// NOTE(review): the line that opens the `value` expression is missing from this
// excerpt, so additional leading terms may exist - confirm against the full header.
238 template<
typename T1,
typename T2,
typename T3 >
239 struct UseDoublePrecisionComplexKernel {
// Element type that all three operands must share.
240 typedef complex<double> Type;
242 HasMutableDataAccess<T1>::value &&
243 HasConstDataAccess<T2>::value &&
244 HasConstDataAccess<T3>::value &&
245 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
246 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
247 IsSame<typename T1::ElementType,Type>::value &&
248 IsSame<typename T2::ElementType,Type>::value &&
249 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate: `value` is non-zero when BLAZE_BLAS_MODE is off, or
// when none of UseSinglePrecisionKernel, UseDoublePrecisionKernel,
// UseSinglePrecisionComplexKernel and UseDoublePrecisionComplexKernel holds for
// the type triple <T1,T2,T3>.
// NOTE(review): the closing brace of the struct is not part of this excerpt.
259 template<
typename T1,
typename T2,
typename T3 >
260 struct UseDefaultKernel {
261 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
262 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
263 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
264 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate deciding whether the vectorized default kernels apply
// to the type triple <T1,T2,T3>. `value` is non-zero only if
//  - T2 and T3 are not both diagonal,
//  - T2 is not diagonal while T1 is a column-major matrix,
//  - T3 is not diagonal while T1 is a row-major matrix,
//  - all three types are vectorizable,
//  - T2 and T3 have the same element type as T1, and
//  - IntrinsicTrait reports addition, subtraction and multiplication for that
//    element type.
// NOTE(review): the closing brace of the struct is not part of this excerpt.
274 template<
typename T1,
typename T2,
typename T3 >
275 struct UseVectorizedDefaultKernel {
276 enum { value = !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
277 !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
278 !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
279 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
280 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
281 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
282 IntrinsicTrait<typename T1::ElementType>::addition &&
283 IntrinsicTrait<typename T1::ElementType>::subtraction &&
284 IntrinsicTrait<typename T1::ElementType>::multiplication };
316 MT1::vectorizable && MT2::vectorizable &&
// smpAssignable is non-zero only when both evaluateLeft and evaluateRight are
// false and both operand types (MT1, MT2) report smpAssignable themselves.
322 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
323 !evaluateRight && MT2::smpAssignable };
366 :(
lhs_.columns() ) ) );
368 if(
lhs_.columns() == 0UL ||
378 const size_t knum( kend - kbegin );
379 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
381 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
383 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
385 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
411 return rhs_.columns();
441 template<
typename T >
443 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
453 template<
typename T >
455 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
465 return lhs_.isAligned() &&
rhs_.isAligned();
500 template<
typename MT
509 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
512 else if( rhs.lhs_.columns() == 0UL ) {
517 LT A(
serial( rhs.lhs_ ) );
518 RT B(
serial( rhs.rhs_ ) );
527 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the assignment C = A * B. The two calls visible here
// forward the same three arguments to selectSmallAssignKernel and to
// selectBlasAssignKernel.
// NOTE(review): the condition that chooses between the two calls, the remaining
// template parameters (MT4, MT5) and the braces are missing from this excerpt.
543 template<
typename MT3
546 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
550 selectSmallAssignKernel( C, A, B );
552 selectBlasAssignKernel( C, A, B );
// Scalar default kernel for C = A * B, enabled (via EnableIf) only when neither
// MT4 nor MT5 is diagonal. This overload takes the target as
// DenseMatrix<MT3,false> (presumably the row-major target - confirm against
// DenseMatrix). It walks the rows i of A; the k range is narrowed by the
// triangular structure of A and the j range by the triangular structure of B.
// NOTE(review): this excerpt omits the else-arms of the conditional bounds, the
// bodies of several loops and all closing braces.
571 template<
typename MT3
574 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
575 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
577 const size_t M( A.rows() );
578 const size_t N( B.columns() );
579 const size_t K( A.columns() );
581 for(
size_t i=0UL; i<M; ++i )
// Upper A: k starts at the diagonal index i (i+1 when strictly upper).
583 const size_t kbegin( ( IsUpper<MT4>::value )
584 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
// Lower A: k stops after the diagonal index i (before it when strictly lower).
586 const size_t kend( ( IsLower<MT4>::value )
587 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k range: the whole row j=0..N-1 is
// visited (loop body not shown in this excerpt).
591 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
592 for(
size_t j=0UL; j<N; ++j ) {
// Column range for the first term k = kbegin, derived from B's structure.
599 const size_t jbegin( ( IsUpper<MT5>::value )
600 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
602 const size_t jend( ( IsLower<MT5>::value )
603 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
607 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
608 for(
size_t j=0UL; j<jbegin; ++j ) {
612 else if( IsStrictlyUpper<MT5>::value ) {
613 reset( (~C)(i,0UL) );
// The first term is written with '=', which initializes C(i,j) on [jbegin,jend).
615 for(
size_t j=jbegin; j<jend; ++j ) {
616 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
618 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
619 for(
size_t j=jend; j<N; ++j ) {
623 else if( IsStrictlyLower<MT5>::value ) {
624 reset( (~C)(i,N-1UL) );
// Remaining terms k = kbegin+1 .. kend-1 are accumulated with '+='.
628 for(
size_t k=kbegin+1UL; k<kend; ++k )
630 const size_t jbegin( ( IsUpper<MT5>::value )
631 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
633 const size_t jend( ( IsLower<MT5>::value )
634 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
638 for(
size_t j=jbegin; j<jend; ++j ) {
639 (~C)(i,j) += A(i,k) * B(k,j);
// Lower B: column jend is assigned (not accumulated) from the k-th term.
641 if( IsLower<MT5>::value ) {
642 (~C)(i,jend) = A(i,k) * B(k,jend);
// Scalar default kernel for C = A * B, enabled (via EnableIf) only when neither
// MT4 nor MT5 is diagonal. This overload takes the target as
// DenseMatrix<MT3,true> (presumably the column-major target - confirm against
// DenseMatrix). It walks the columns j of B; the k range is narrowed by the
// triangular structure of B and the i range by the triangular structure of A.
// NOTE(review): this excerpt omits the else-arms of the conditional bounds, the
// bodies of several loops and all closing braces.
664 template<
typename MT3
667 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
668 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
670 const size_t M( A.rows() );
671 const size_t N( B.columns() );
672 const size_t K( A.columns() );
674 for(
size_t j=0UL; j<N; ++j )
// Lower B: k starts at the diagonal index j (j+1 when strictly lower).
676 const size_t kbegin( ( IsLower<MT5>::value )
677 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
// Upper B: k stops after the diagonal index j (before it when strictly upper).
679 const size_t kend( ( IsUpper<MT5>::value )
680 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Strictly triangular B with an empty k range: the whole column i=0..M-1 is
// visited (loop body not shown in this excerpt).
684 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
685 for(
size_t i=0UL; i<M; ++i ) {
// Row range for the first term k = kbegin, derived from A's structure.
692 const size_t ibegin( ( IsLower<MT4>::value )
693 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
695 const size_t iend( ( IsUpper<MT4>::value )
696 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
700 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
701 for(
size_t i=0UL; i<ibegin; ++i ) {
705 else if( IsStrictlyLower<MT4>::value ) {
706 reset( (~C)(0UL,j) );
// The first term is written with '=', which initializes C(i,j) on [ibegin,iend).
708 for(
size_t i=ibegin; i<iend; ++i ) {
709 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
711 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
712 for(
size_t i=iend; i<M; ++i ) {
716 else if( IsStrictlyUpper<MT4>::value ) {
717 reset( (~C)(M-1UL,j) );
// Remaining terms k = kbegin+1 .. kend-1 are accumulated with '+='.
721 for(
size_t k=kbegin+1UL; k<kend; ++k )
723 const size_t ibegin( ( IsLower<MT4>::value )
724 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
726 const size_t iend( ( IsUpper<MT4>::value )
727 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
731 for(
size_t i=ibegin; i<iend; ++i ) {
732 (~C)(i,j) += A(i,k) * B(k,j);
// Upper A: row iend is assigned (not accumulated) from the k-th term.
734 if( IsUpper<MT4>::value ) {
735 (~C)(iend,j) = A(iend,k) * B(k,j);
// Scalar default kernel for C = A * B, enabled (via EnableIf) when MT4 is not
// diagonal and MT5 is diagonal; target passed as DenseMatrix<MT3,false>.
// Because only B(j,j) is read from B, each result element is the single product
// A(i,j) * B(j,j). The traversal is tiled into 16 x 16 blocks (ii/jj loops).
// NOTE(review): this excerpt omits the else-arms of the conditional bounds, the
// bodies of the loops guarded by IsUpper/IsLower and all closing braces.
757 template<
typename MT3
760 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
761 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
763 const size_t M( A.rows() );
764 const size_t N( B.columns() );
// Tile edge length used for both the row and the column blocking.
766 const size_t block( 16UL );
768 for(
size_t ii=0UL; ii<M; ii+=block ) {
769 const size_t iend(
min( M, ii+block ) );
770 for(
size_t jj=0UL; jj<N; jj+=block ) {
771 const size_t jend(
min( N, jj+block ) );
772 for(
size_t i=ii; i<iend; ++i )
// Upper A: the first computed column is clamped from below by the tile start jj.
774 const size_t jbegin( ( IsUpper<MT4>::value )
775 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
// Lower A: the last computed column is clamped from above by the tile end jend.
777 const size_t jpos( ( IsLower<MT4>::value )
778 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
781 if( IsUpper<MT4>::value ) {
782 for(
size_t j=jj; j<jbegin; ++j ) {
786 for(
size_t j=jbegin; j<jpos; ++j ) {
787 (~C)(i,j) = A(i,j) * B(j,j);
789 if( IsLower<MT4>::value ) {
790 for(
size_t j=jpos; j<jend; ++j ) {
// Scalar default kernel for C = A * B, enabled (via EnableIf) when MT4 is not
// diagonal and MT5 is diagonal; target passed as DenseMatrix<MT3,true>.
// Iterates columns j; each computed element is A(i,j) * B(j,j), with the row
// range [ibegin,iend) taken from the triangular structure of A.
// NOTE(review): this excerpt omits the else-arms of the conditional bounds, the
// bodies of the loops guarded by IsLower/IsUpper and all closing braces.
815 template<
typename MT3
818 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
819 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
821 const size_t M( A.rows() );
822 const size_t N( B.columns() );
824 for(
size_t j=0UL; j<N; ++j )
826 const size_t ibegin( ( IsLower<MT4>::value )
827 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
829 const size_t iend( ( IsUpper<MT4>::value )
830 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
834 if( IsLower<MT4>::value ) {
835 for(
size_t i=0UL; i<ibegin; ++i ) {
839 for(
size_t i=ibegin; i<iend; ++i ) {
840 (~C)(i,j) = A(i,j) * B(j,j);
842 if( IsUpper<MT4>::value ) {
843 for(
size_t i=iend; i<M; ++i ) {
// Scalar default kernel for C = A * B, enabled (via EnableIf) when MT4 is
// diagonal and MT5 is not; target passed as DenseMatrix<MT3,false>.
// Iterates rows i; each computed element is A(i,i) * B(i,j), with the column
// range [jbegin,jend) taken from the triangular structure of B.
// NOTE(review): this excerpt omits the else-arms of the conditional bounds, the
// bodies of the loops guarded by IsUpper/IsLower and all closing braces.
866 template<
typename MT3
869 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
870 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
872 const size_t M( A.rows() );
873 const size_t N( B.columns() );
875 for(
size_t i=0UL; i<M; ++i )
877 const size_t jbegin( ( IsUpper<MT5>::value )
878 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
880 const size_t jend( ( IsLower<MT5>::value )
881 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
885 if( IsUpper<MT5>::value ) {
886 for(
size_t j=0UL; j<jbegin; ++j ) {
890 for(
size_t j=jbegin; j<jend; ++j ) {
891 (~C)(i,j) = A(i,i) * B(i,j);
893 if( IsLower<MT5>::value ) {
894 for(
size_t j=jend; j<N; ++j ) {
// Scalar default kernel for C = A * B, enabled (via EnableIf) when MT4 is
// diagonal and MT5 is not; target passed as DenseMatrix<MT3,true>.
// Because only A(i,i) is read from A, each result element is the single product
// A(i,i) * B(i,j). The traversal is tiled into 16 x 16 blocks (jj/ii loops).
// NOTE(review): this excerpt omits the else-arms of the conditional bounds, the
// bodies of the loops guarded by IsLower/IsUpper and all closing braces.
917 template<
typename MT3
920 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
921 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
923 const size_t M( A.rows() );
924 const size_t N( B.columns() );
// Tile edge length used for both the column and the row blocking.
926 const size_t block( 16UL );
928 for(
size_t jj=0UL; jj<N; jj+=block ) {
929 const size_t jend(
min( N, jj+block ) );
930 for(
size_t ii=0UL; ii<M; ii+=block ) {
931 const size_t iend(
min( M, ii+block ) );
932 for(
size_t j=jj; j<jend; ++j )
// Lower B: the first computed row is clamped from below by the tile start ii.
934 const size_t ibegin( ( IsLower<MT5>::value )
935 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
// Upper B: the last computed row is clamped from above by the tile end iend.
937 const size_t ipos( ( IsUpper<MT5>::value )
938 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
941 if( IsLower<MT5>::value ) {
942 for(
size_t i=ii; i<ibegin; ++i ) {
946 for(
size_t i=ibegin; i<ipos; ++i ) {
947 (~C)(i,j) = A(i,i) * B(i,j);
949 if( IsUpper<MT5>::value ) {
950 for(
size_t i=ipos; i<iend; ++i ) {
// Scalar default kernel for C = A * B, enabled (via EnableIf) when both MT4 and
// MT5 are diagonal. The visible loop writes only the diagonal of C:
// C(i,i) = A(i,i) * B(i,i) for i in [0, A.rows()).
// NOTE(review): whatever precedes the loop (original lines 980-982) is not part
// of this excerpt - check there for how the off-diagonal elements of C are handled.
975 template<
typename MT3
978 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
979 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
983 for(
size_t i=0UL; i<A.rows(); ++i ) {
984 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel overload chosen (via DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold: it simply forwards to
// selectDefaultAssignKernel, passing ~C together with A and B.
1004 template<
typename MT3
1007 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1008 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1010 selectDefaultAssignKernel( ~C, A, B );
// Vectorized small-matrix kernel for C = A * B, enabled (via EnableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target passed as
// DenseMatrix<MT3,false>. Visible pattern in every section: a scalar A(i,k) is
// turned into an IntrinsicType with set() (presumably a broadcast to all lanes
// - confirm), multiplied with packed loads B.load(k, j + n*IT::size), summed
// into xmm accumulators over k in [kbegin,kend) and written back with
// (~C).store(). The k bounds are tightened by the upper/lower structure of A
// and B. Sections go from the widest unrolling to the narrowest.
// NOTE(review): the loops over j, the declarations of i and j, most store()
// calls, some else-arms and all closing braces are missing from this excerpt.
1030 template<
typename MT3
1033 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1034 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1036 typedef IntrinsicTrait<ElementType> IT;
1038 const size_t M( A.rows() );
1039 const size_t N( B.columns() );
1040 const size_t K( A.columns() );
// Section: one row i, eight packed column groups (offsets 0..7 * IT::size).
1045 for(
size_t i=0UL; i<M; ++i )
1047 const size_t kbegin( ( IsUpper<MT4>::value )
1048 ?( ( IsLower<MT5>::value )
1049 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1050 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1051 :( IsLower<MT5>::value ? j : 0UL ) );
1052 const size_t kend( ( IsLower<MT4>::value )
1053 ?( ( IsUpper<MT5>::value )
1054 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
1055 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1056 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
1058 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1060 for(
size_t k=kbegin; k<kend; ++k ) {
1061 const IntrinsicType a1(
set( A(i,k) ) );
1062 xmm1 = xmm1 + a1 * B.load(k,j );
1063 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1064 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1065 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1066 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
1067 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
1068 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
1069 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
1072 (~C).
store( i, j , xmm1 );
// Section: two rows (i, i+1) x four packed column groups.
1087 for( ; (i+2UL) <= M; i+=2UL )
1089 const size_t kbegin( ( IsUpper<MT4>::value )
1090 ?( ( IsLower<MT5>::value )
1091 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1092 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1093 :( IsLower<MT5>::value ? j : 0UL ) );
1094 const size_t kend( ( IsLower<MT4>::value )
1095 ?( ( IsUpper<MT5>::value )
1096 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
1097 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1098 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
1100 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1102 for(
size_t k=kbegin; k<kend; ++k ) {
1103 const IntrinsicType a1(
set( A(i ,k) ) );
1104 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1105 const IntrinsicType b1( B.load(k,j ) );
1106 const IntrinsicType b2( B.load(k,j+
IT::size ) );
1107 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
1108 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
1109 xmm1 = xmm1 + a1 * b1;
1110 xmm2 = xmm2 + a1 * b2;
1111 xmm3 = xmm3 + a1 * b3;
1112 xmm4 = xmm4 + a1 * b4;
1113 xmm5 = xmm5 + a2 * b1;
1114 xmm6 = xmm6 + a2 * b2;
1115 xmm7 = xmm7 + a2 * b3;
1116 xmm8 = xmm8 + a2 * b4;
1119 (~C).
store( i , j , xmm1 );
1123 (~C).
store( i+1UL, j , xmm5 );
// Section: single row x four packed column groups (enclosing statement not shown).
1131 const size_t kbegin( ( IsUpper<MT4>::value )
1132 ?( ( IsLower<MT5>::value )
1133 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1134 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1135 :( IsLower<MT5>::value ? j : 0UL ) );
1136 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
1138 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1140 for(
size_t k=kbegin; k<kend; ++k ) {
1141 const IntrinsicType a1(
set( A(i,k) ) );
1142 xmm1 = xmm1 + a1 * B.load(k,j );
1143 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1144 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1145 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1148 (~C).
store( i, j , xmm1 );
// Section: two rows (i, i+1) x two packed column groups.
1159 for( ; (i+2UL) <= M; i+=2UL )
1161 const size_t kbegin( ( IsUpper<MT4>::value )
1162 ?( ( IsLower<MT5>::value )
1163 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1164 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1165 :( IsLower<MT5>::value ? j : 0UL ) );
1166 const size_t kend( ( IsLower<MT4>::value )
1167 ?( ( IsUpper<MT5>::value )
1168 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
1169 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1170 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
1172 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1174 for(
size_t k=kbegin; k<kend; ++k ) {
1175 const IntrinsicType a1(
set( A(i ,k) ) );
1176 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1177 const IntrinsicType b1( B.load(k,j ) );
1178 const IntrinsicType b2( B.load(k,j+
IT::size) );
1179 xmm1 = xmm1 + a1 * b1;
1180 xmm2 = xmm2 + a1 * b2;
1181 xmm3 = xmm3 + a2 * b1;
1182 xmm4 = xmm4 + a2 * b2;
1185 (~C).
store( i , j , xmm1 );
1187 (~C).
store( i+1UL, j , xmm3 );
// Section: single row x two packed column groups (enclosing statement not shown).
1193 const size_t kbegin( ( IsUpper<MT4>::value )
1194 ?( ( IsLower<MT5>::value )
1195 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1196 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1197 :( IsLower<MT5>::value ? j : 0UL ) );
1198 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
1200 IntrinsicType xmm1, xmm2;
1202 for(
size_t k=kbegin; k<kend; ++k ) {
1203 const IntrinsicType a1(
set( A(i,k) ) );
1204 xmm1 = xmm1 + a1 * B.load(k,j );
1205 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
1208 (~C).
store( i, j , xmm1 );
// Section: two rows (i, i+1) x one packed column group.
1217 for( ; (i+2UL) <= M; i+=2UL )
1219 const size_t kbegin( ( IsUpper<MT4>::value )
1220 ?( ( IsLower<MT5>::value )
1221 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1222 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1223 :( IsLower<MT5>::value ? j : 0UL ) );
1224 const size_t kend( ( IsLower<MT4>::value )
1225 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1228 IntrinsicType xmm1, xmm2;
1230 for(
size_t k=kbegin; k<kend; ++k ) {
1231 const IntrinsicType b1( B.load(k,j) );
1232 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1233 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1236 (~C).
store( i , j, xmm1 );
1237 (~C).
store( i+1UL, j, xmm2 );
// Section: single row x one packed column group; k runs up to K here.
1242 const size_t kbegin( ( IsUpper<MT4>::value )
1243 ?( ( IsLower<MT5>::value )
1244 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1245 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1246 :( IsLower<MT5>::value ? j : 0UL ) );
1250 for(
size_t k=kbegin; k<K; ++k ) {
1251 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1254 (~C).
store( i, j, xmm1 );
// Vectorized small-matrix kernel for C = A * B, enabled (via EnableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target passed as
// DenseMatrix<MT3,true>. Visible pattern in every section: packed loads
// A.load(i + n*IT::size, k) are multiplied with a scalar B(k,j) turned into an
// IntrinsicType by set() (presumably a broadcast to all lanes - confirm),
// summed into xmm accumulators over k in [kbegin,kend) and written back with
// (~C).store(). The k bounds are tightened by the upper/lower structure of A
// and B. Sections go from the widest unrolling to the narrowest.
// NOTE(review): the loops over i, the declarations of i and j, most store()
// calls, some else-arms and all closing braces are missing from this excerpt.
1276 template<
typename MT3
1279 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1280 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1282 typedef IntrinsicTrait<ElementType> IT;
1284 const size_t M( A.rows() );
1285 const size_t N( B.columns() );
1286 const size_t K( A.columns() );
// Section: one column j, eight packed row groups (offsets 0..7 * IT::size).
1291 for(
size_t j=0UL; j<N; ++j )
1293 const size_t kbegin( ( IsLower<MT5>::value )
1294 ?( ( IsUpper<MT4>::value )
1295 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1296 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1297 :( IsUpper<MT4>::value ? i : 0UL ) );
1298 const size_t kend( ( IsUpper<MT5>::value )
1299 ?( ( IsLower<MT4>::value )
1300 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1301 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1302 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
1304 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1306 for(
size_t k=kbegin; k<kend; ++k ) {
1307 const IntrinsicType b1(
set( B(k,j) ) );
1308 xmm1 = xmm1 + A.load(i ,k) * b1;
1309 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1310 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1311 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1312 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
1313 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
1314 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
1315 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
1318 (~C).
store( i , j, xmm1 );
// Section: two columns (j, j+1) x four packed row groups.
1333 for( ; (j+2UL) <= N; j+=2UL )
1335 const size_t kbegin( ( IsLower<MT5>::value )
1336 ?( ( IsUpper<MT4>::value )
1337 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1338 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1339 :( IsUpper<MT4>::value ? i : 0UL ) );
1340 const size_t kend( ( IsUpper<MT5>::value )
1341 ?( ( IsLower<MT4>::value )
1342 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1343 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1344 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
1346 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1348 for(
size_t k=kbegin; k<kend; ++k ) {
1349 const IntrinsicType a1( A.load(i ,k) );
1350 const IntrinsicType a2( A.load(i+
IT::size ,k) );
1351 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
1352 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
1353 const IntrinsicType b1(
set( B(k,j ) ) );
1354 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1355 xmm1 = xmm1 + a1 * b1;
1356 xmm2 = xmm2 + a2 * b1;
1357 xmm3 = xmm3 + a3 * b1;
1358 xmm4 = xmm4 + a4 * b1;
1359 xmm5 = xmm5 + a1 * b2;
1360 xmm6 = xmm6 + a2 * b2;
1361 xmm7 = xmm7 + a3 * b2;
1362 xmm8 = xmm8 + a4 * b2;
1365 (~C).
store( i , j , xmm1 );
1369 (~C).
store( i , j+1UL, xmm5 );
// Section: single column x four packed row groups (enclosing statement not shown).
1377 const size_t kbegin( ( IsLower<MT5>::value )
1378 ?( ( IsUpper<MT4>::value )
1379 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1380 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1381 :( IsUpper<MT4>::value ? i : 0UL ) );
1382 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
1384 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1386 for(
size_t k=kbegin; k<kend; ++k ) {
1387 const IntrinsicType b1(
set( B(k,j) ) );
1388 xmm1 = xmm1 + A.load(i ,k) * b1;
1389 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1390 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1391 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1394 (~C).
store( i , j, xmm1 );
// Section: two columns (j, j+1) x two packed row groups.
1405 for( ; (j+2UL) <= N; j+=2UL )
1407 const size_t kbegin( ( IsLower<MT5>::value )
1408 ?( ( IsUpper<MT4>::value )
1409 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1410 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1411 :( IsUpper<MT4>::value ? i : 0UL ) );
1412 const size_t kend( ( IsUpper<MT5>::value )
1413 ?( ( IsLower<MT4>::value )
1414 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1415 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1416 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
1418 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1420 for(
size_t k=kbegin; k<kend; ++k ) {
1421 const IntrinsicType a1( A.load(i ,k) );
1422 const IntrinsicType a2( A.load(i+
IT::size,k) );
1423 const IntrinsicType b1(
set( B(k,j ) ) );
1424 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1425 xmm1 = xmm1 + a1 * b1;
1426 xmm2 = xmm2 + a2 * b1;
1427 xmm3 = xmm3 + a1 * b2;
1428 xmm4 = xmm4 + a2 * b2;
1431 (~C).
store( i , j , xmm1 );
1433 (~C).
store( i , j+1UL, xmm3 );
// Section: single column x two packed row groups (enclosing statement not shown).
1439 const size_t kbegin( ( IsLower<MT5>::value )
1440 ?( ( IsUpper<MT4>::value )
1441 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1442 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1443 :( IsUpper<MT4>::value ? i : 0UL ) );
1444 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
1446 IntrinsicType xmm1, xmm2;
1448 for(
size_t k=kbegin; k<kend; ++k ) {
1449 const IntrinsicType b1(
set( B(k,j) ) );
1450 xmm1 = xmm1 + A.load(i ,k) * b1;
1451 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
1454 (~C).
store( i , j, xmm1 );
// Section: two columns (j, j+1) x one packed row group.
1463 for( ; (j+2UL) <= N; j+=2UL )
1465 const size_t kbegin( ( IsLower<MT5>::value )
1466 ?( ( IsUpper<MT4>::value )
1467 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1468 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1469 :( IsUpper<MT4>::value ? i : 0UL ) );
1470 const size_t kend( ( IsUpper<MT5>::value )
1471 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1474 IntrinsicType xmm1, xmm2;
1476 for(
size_t k=kbegin; k<kend; ++k ) {
1477 const IntrinsicType a1( A.load(i,k) );
1478 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1479 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1482 (~C).
store( i, j , xmm1 );
1483 (~C).
store( i, j+1UL, xmm2 );
// Section: single column x one packed row group; k runs up to K here.
1488 const size_t kbegin( ( IsLower<MT5>::value )
1489 ?( ( IsUpper<MT4>::value )
1490 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1491 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1492 :( IsUpper<MT4>::value ? i : 0UL ) );
1496 for(
size_t k=kbegin; k<K; ++k ) {
1497 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1500 (~C).
store( i, j, xmm1 );
// Large-matrix kernel overload chosen (via DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold: it simply forwards
// C, A and B unchanged to selectDefaultAssignKernel.
1521 template<
typename MT3
1524 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1525 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1527 selectDefaultAssignKernel( C, A, B );
// Vectorized, blocked large-matrix kernel for C = A * B, enabled (via EnableIf)
// when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds; target passed as
// DenseMatrix<MT3,false>. The iteration space is tiled with iblock = 64 rows,
// jblock = 128 columns and kblock = 128 inner indices. Unlike the small kernel,
// each section first loads the current contents of C into the xmm accumulators,
// adds the products for k in [kbegin,kend) of the current kk tile and stores
// the result back, so contributions of successive kk tiles add up.
// NOTE(review): the loops over j, the declarations of i, j, j1, j2 and j3, the
// body of the initial i/j loop and all closing braces are missing from this
// excerpt; j1..j3 are presumably j + n*IT::size - confirm against the full header.
1547 template<
typename MT3
1550 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1551 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1553 typedef IntrinsicTrait<ElementType> IT;
1555 const size_t M( A.rows() );
1556 const size_t N( B.columns() );
1557 const size_t K( A.columns() );
// Tile sizes for the row, column and inner dimensions.
1559 const size_t iblock( 64UL );
1560 const size_t jblock( 128UL );
1561 const size_t kblock( 128UL );
1563 for(
size_t jj=0UL; jj<N; jj+=jblock )
1565 const size_t jend(
min( jj+jblock, N ) );
1567 for(
size_t ii=0UL; ii<M; ii+=iblock )
1569 const size_t iend(
min( ii+iblock, M ) );
// Pass over every element of the current C tile (loop body not shown here).
1571 for(
size_t i=ii; i<iend; ++i ) {
1572 for(
size_t j=jj; j<jend; ++j ) {
// Inner-dimension tiles; ktmp is the exclusive end of the current kk tile.
1577 for(
size_t kk=0UL; kk<K; kk+=kblock )
1579 const size_t ktmp(
min( kk+kblock, K ) );
// Section: two rows (i, i+1) x four packed column groups (j, j1, j2, j3).
1591 for( ; (i+2UL) <= iend; i+=2UL )
1593 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1594 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1595 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1596 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
1598 IntrinsicType xmm1( (~C).
load(i ,j ) );
1599 IntrinsicType xmm2( (~C).
load(i ,j1) );
1600 IntrinsicType xmm3( (~C).
load(i ,j2) );
1601 IntrinsicType xmm4( (~C).
load(i ,j3) );
1602 IntrinsicType xmm5( (~C).
load(i+1UL,j ) );
1603 IntrinsicType xmm6( (~C).
load(i+1UL,j1) );
1604 IntrinsicType xmm7( (~C).
load(i+1UL,j2) );
1605 IntrinsicType xmm8( (~C).
load(i+1UL,j3) );
1607 for(
size_t k=kbegin; k<kend; ++k ) {
1608 const IntrinsicType a1(
set( A(i ,k) ) );
1609 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1610 const IntrinsicType b1( B.load(k,j ) );
1611 const IntrinsicType b2( B.load(k,j1) );
1612 const IntrinsicType b3( B.load(k,j2) );
1613 const IntrinsicType b4( B.load(k,j3) );
1614 xmm1 = xmm1 + a1 * b1;
1615 xmm2 = xmm2 + a1 * b2;
1616 xmm3 = xmm3 + a1 * b3;
1617 xmm4 = xmm4 + a1 * b4;
1618 xmm5 = xmm5 + a2 * b1;
1619 xmm6 = xmm6 + a2 * b2;
1620 xmm7 = xmm7 + a2 * b3;
1621 xmm8 = xmm8 + a2 * b4;
1624 (~C).
store( i , j , xmm1 );
1625 (~C).
store( i , j1, xmm2 );
1626 (~C).
store( i , j2, xmm3 );
1627 (~C).
store( i , j3, xmm4 );
1628 (~C).
store( i+1UL, j , xmm5 );
1629 (~C).
store( i+1UL, j1, xmm6 );
1630 (~C).
store( i+1UL, j2, xmm7 );
1631 (~C).
store( i+1UL, j3, xmm8 );
// Section: single row x four packed column groups (enclosing statement not shown).
1636 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1637 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1638 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1639 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
1641 IntrinsicType xmm1( (~C).
load(i,j ) );
1642 IntrinsicType xmm2( (~C).
load(i,j1) );
1643 IntrinsicType xmm3( (~C).
load(i,j2) );
1644 IntrinsicType xmm4( (~C).
load(i,j3) );
1646 for(
size_t k=kbegin; k<kend; ++k ) {
1647 const IntrinsicType a1(
set( A(i,k) ) );
1648 xmm1 = xmm1 + a1 * B.load(k,j );
1649 xmm2 = xmm2 + a1 * B.load(k,j1);
1650 xmm3 = xmm3 + a1 * B.load(k,j2);
1651 xmm4 = xmm4 + a1 * B.load(k,j3);
1654 (~C).
store( i, j , xmm1 );
1655 (~C).
store( i, j1, xmm2 );
1656 (~C).
store( i, j2, xmm3 );
1657 (~C).
store( i, j3, xmm4 );
// Section: four rows (i .. i+3) x two packed column groups (j, j1).
1667 for( ; (i+4UL) <= iend; i+=4UL )
1669 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1670 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1671 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1672 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1674 IntrinsicType xmm1( (~C).
load(i ,j ) );
1675 IntrinsicType xmm2( (~C).
load(i ,j1) );
1676 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
1677 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
1678 IntrinsicType xmm5( (~C).
load(i+2UL,j ) );
1679 IntrinsicType xmm6( (~C).
load(i+2UL,j1) );
1680 IntrinsicType xmm7( (~C).
load(i+3UL,j ) );
1681 IntrinsicType xmm8( (~C).
load(i+3UL,j1) );
1683 for(
size_t k=kbegin; k<kend; ++k ) {
1684 const IntrinsicType a1(
set( A(i ,k) ) );
1685 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1686 const IntrinsicType a3(
set( A(i+2UL,k) ) );
1687 const IntrinsicType a4(
set( A(i+3UL,k) ) );
1688 const IntrinsicType b1( B.load(k,j ) );
1689 const IntrinsicType b2( B.load(k,j1) );
1690 xmm1 = xmm1 + a1 * b1;
1691 xmm2 = xmm2 + a1 * b2;
1692 xmm3 = xmm3 + a2 * b1;
1693 xmm4 = xmm4 + a2 * b2;
1694 xmm5 = xmm5 + a3 * b1;
1695 xmm6 = xmm6 + a3 * b2;
1696 xmm7 = xmm7 + a4 * b1;
1697 xmm8 = xmm8 + a4 * b2;
1700 (~C).
store( i , j , xmm1 );
1701 (~C).
store( i , j1, xmm2 );
1702 (~C).
store( i+1UL, j , xmm3 );
1703 (~C).
store( i+1UL, j1, xmm4 );
1704 (~C).
store( i+2UL, j , xmm5 );
1705 (~C).
store( i+2UL, j1, xmm6 );
1706 (~C).
store( i+3UL, j , xmm7 );
1707 (~C).
store( i+3UL, j1, xmm8 );
// Section: two rows (i, i+1) x two packed column groups.
1710 for( ; (i+2UL) <= iend; i+=2UL )
1712 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1713 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1714 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1715 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1717 IntrinsicType xmm1( (~C).
load(i ,j ) );
1718 IntrinsicType xmm2( (~C).
load(i ,j1) );
1719 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
1720 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
1722 for(
size_t k=kbegin; k<kend; ++k ) {
1723 const IntrinsicType a1(
set( A(i ,k) ) );
1724 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1725 const IntrinsicType b1( B.load(k,j ) );
1726 const IntrinsicType b2( B.load(k,j1) );
1727 xmm1 = xmm1 + a1 * b1;
1728 xmm2 = xmm2 + a1 * b2;
1729 xmm3 = xmm3 + a2 * b1;
1730 xmm4 = xmm4 + a2 * b2;
1733 (~C).
store( i , j , xmm1 );
1734 (~C).
store( i , j1, xmm2 );
1735 (~C).
store( i+1UL, j , xmm3 );
1736 (~C).
store( i+1UL, j1, xmm4 );
// Section: single row x two packed column groups (enclosing statement not shown).
1741 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1742 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1743 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1744 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1746 IntrinsicType xmm1( (~C).
load(i,j ) );
1747 IntrinsicType xmm2( (~C).
load(i,j1) );
1749 for(
size_t k=kbegin; k<kend; ++k ) {
1750 const IntrinsicType a1(
set( A(i,k) ) );
1751 xmm1 = xmm1 + a1 * B.load(k,j );
1752 xmm2 = xmm2 + a1 * B.load(k,j1);
1755 (~C).
store( i, j , xmm1 );
1756 (~C).
store( i, j1, xmm2 );
// Section: every row of the tile x one packed column group.
1762 for(
size_t i=ii; i<iend; ++i )
1764 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1765 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1766 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1767 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
1769 IntrinsicType xmm1( (~C).
load(i,j) );
1771 for(
size_t k=kbegin; k<kend; ++k ) {
1772 const IntrinsicType a1(
set( A(i,k) ) );
1773 xmm1 = xmm1 + a1 * B.load(k,j);
1776 (~C).
store( i, j, xmm1 );
// Blocked, vectorized assignment kernel; overload for DenseMatrix<MT3,true> targets.
// It is only a candidate when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds (EnableIf).
// C is processed in tiles of at most iblock x jblock entries and the shared dimension
// is walked in chunks of kblock.
// NOTE(review): this excerpt omits several original lines (braces, the i-loops and the
// definitions of i1/i2/i3), so the comments below describe only what is visible.
1801 template<
typename MT3
1804 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1805 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1807 typedef IntrinsicTrait<ElementType> IT;
// Problem sizes are taken from the operands: A supplies rows and the shared dimension,
// B supplies the columns.
1809 const size_t M( A.rows() );
1810 const size_t N( B.columns() );
1811 const size_t K( A.columns() );
// Tile extents for the i, j and k loops.
1813 const size_t iblock( 128UL );
1814 const size_t jblock( 64UL );
1815 const size_t kblock( 128UL );
1817 for(
size_t ii=0UL; ii<M; ii+=iblock )
1819 const size_t iend(
min( ii+iblock, M ) );
1821 for(
size_t jj=0UL; jj<N; jj+=jblock )
1823 const size_t jend(
min( jj+jblock, N ) );
// Visits every (i,j) of the current tile; the loop body is not part of this excerpt
// (presumably it resets C(i,j) before accumulation - TODO confirm).
1825 for(
size_t j=jj; j<jend; ++j ) {
1826 for(
size_t i=ii; i<iend; ++i ) {
// Walk the shared dimension in chunks [kk,ktmp).
1831 for(
size_t kk=0UL; kk<K; kk+=kblock )
1833 const size_t ktmp(
min( kk+kblock, K ) );
// Two columns (j, j+1) times four vector rows (i, i1, i2, i3) per iteration.
1845 for( ; (j+2UL) <= jend; j+=2UL )
// Narrow the k-range of this chunk using the IsUpper/IsLower flags of A (MT4) and B (MT5).
1847 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1848 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1849 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1850 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// The accumulators start from the current content of C, so successive k-chunks add up.
1852 IntrinsicType xmm1( (~C).
load(i ,j ) );
1853 IntrinsicType xmm2( (~C).
load(i1,j ) );
1854 IntrinsicType xmm3( (~C).
load(i2,j ) );
1855 IntrinsicType xmm4( (~C).
load(i3,j ) );
1856 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
1857 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
1858 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
1859 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
// a* are vector loads from A; b* are built from single elements of B via set()
// (presumably a broadcast to all lanes - confirm against the intrinsics layer).
1861 for(
size_t k=kbegin; k<kend; ++k ) {
1862 const IntrinsicType a1( A.load(i ,k) );
1863 const IntrinsicType a2( A.load(i1,k) );
1864 const IntrinsicType a3( A.load(i2,k) );
1865 const IntrinsicType a4( A.load(i3,k) );
1866 const IntrinsicType b1(
set( B(k,j ) ) );
1867 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1868 xmm1 = xmm1 + a1 * b1;
1869 xmm2 = xmm2 + a2 * b1;
1870 xmm3 = xmm3 + a3 * b1;
1871 xmm4 = xmm4 + a4 * b1;
1872 xmm5 = xmm5 + a1 * b2;
1873 xmm6 = xmm6 + a2 * b2;
1874 xmm7 = xmm7 + a3 * b2;
1875 xmm8 = xmm8 + a4 * b2;
// Write the eight accumulated vectors back to C.
1878 (~C).
store( i , j , xmm1 );
1879 (~C).
store( i1, j , xmm2 );
1880 (~C).
store( i2, j , xmm3 );
1881 (~C).
store( i3, j , xmm4 );
1882 (~C).
store( i , j+1UL, xmm5 );
1883 (~C).
store( i1, j+1UL, xmm6 );
1884 (~C).
store( i2, j+1UL, xmm7 );
1885 (~C).
store( i3, j+1UL, xmm8 );
// One column (j) times four vector rows; the guard selecting this remainder case is
// not visible in this excerpt.
1890 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1891 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1892 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1893 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1895 IntrinsicType xmm1( (~C).
load(i ,j) );
1896 IntrinsicType xmm2( (~C).
load(i1,j) );
1897 IntrinsicType xmm3( (~C).
load(i2,j) );
1898 IntrinsicType xmm4( (~C).
load(i3,j) );
1900 for(
size_t k=kbegin; k<kend; ++k ) {
1901 const IntrinsicType b1(
set( B(k,j) ) );
1902 xmm1 = xmm1 + A.load(i ,k) * b1;
1903 xmm2 = xmm2 + A.load(i1,k) * b1;
1904 xmm3 = xmm3 + A.load(i2,k) * b1;
1905 xmm4 = xmm4 + A.load(i3,k) * b1;
1908 (~C).
store( i , j, xmm1 );
1909 (~C).
store( i1, j, xmm2 );
1910 (~C).
store( i2, j, xmm3 );
1911 (~C).
store( i3, j, xmm4 );
// Four columns (j .. j+3) times two vector rows (i, i1) per iteration.
1921 for( ; (j+4UL) <= jend; j+=4UL )
1923 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1924 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1925 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1926 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1928 IntrinsicType xmm1( (~C).
load(i ,j ) );
1929 IntrinsicType xmm2( (~C).
load(i1,j ) );
1930 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
1931 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
1932 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
1933 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
1934 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
1935 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
1937 for(
size_t k=kbegin; k<kend; ++k ) {
1938 const IntrinsicType a1( A.load(i ,k) );
1939 const IntrinsicType a2( A.load(i1,k) );
1940 const IntrinsicType b1(
set( B(k,j ) ) );
1941 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1942 const IntrinsicType b3(
set( B(k,j+2UL) ) );
1943 const IntrinsicType b4(
set( B(k,j+3UL) ) );
1944 xmm1 = xmm1 + a1 * b1;
1945 xmm2 = xmm2 + a2 * b1;
1946 xmm3 = xmm3 + a1 * b2;
1947 xmm4 = xmm4 + a2 * b2;
1948 xmm5 = xmm5 + a1 * b3;
1949 xmm6 = xmm6 + a2 * b3;
1950 xmm7 = xmm7 + a1 * b4;
1951 xmm8 = xmm8 + a2 * b4;
1954 (~C).
store( i , j , xmm1 );
1955 (~C).
store( i1, j , xmm2 );
1956 (~C).
store( i , j+1UL, xmm3 );
1957 (~C).
store( i1, j+1UL, xmm4 );
1958 (~C).
store( i , j+2UL, xmm5 );
1959 (~C).
store( i1, j+2UL, xmm6 );
1960 (~C).
store( i , j+3UL, xmm7 );
1961 (~C).
store( i1, j+3UL, xmm8 );
// Two columns times two vector rows.
1964 for( ; (j+2UL) <= jend; j+=2UL )
1966 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1967 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1968 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1969 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1971 IntrinsicType xmm1( (~C).
load(i ,j ) );
1972 IntrinsicType xmm2( (~C).
load(i1,j ) );
1973 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
1974 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
1976 for(
size_t k=kbegin; k<kend; ++k ) {
1977 const IntrinsicType a1( A.load(i ,k) );
1978 const IntrinsicType a2( A.load(i1,k) );
1979 const IntrinsicType b1(
set( B(k,j ) ) );
1980 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1981 xmm1 = xmm1 + a1 * b1;
1982 xmm2 = xmm2 + a2 * b1;
1983 xmm3 = xmm3 + a1 * b2;
1984 xmm4 = xmm4 + a2 * b2;
1987 (~C).
store( i , j , xmm1 );
1988 (~C).
store( i1, j , xmm2 );
1989 (~C).
store( i , j+1UL, xmm3 );
1990 (~C).
store( i1, j+1UL, xmm4 );
// One column times two vector rows; the guard for this remainder case is not visible here.
1995 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1996 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1997 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1998 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2000 IntrinsicType xmm1( (~C).
load(i ,j) );
2001 IntrinsicType xmm2( (~C).
load(i1,j) );
2003 for(
size_t k=kbegin; k<kend; ++k ) {
2004 const IntrinsicType b1(
set( B(k,j) ) );
2005 xmm1 = xmm1 + A.load(i ,k) * b1;
2006 xmm2 = xmm2 + A.load(i1,k) * b1;
2009 (~C).
store( i , j, xmm1 );
2010 (~C).
store( i1, j, xmm2 );
// Single vector row: one column of the tile per iteration.
2016 for(
size_t j=jj; j<jend; ++j )
2018 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2019 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2020 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
2021 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2023 IntrinsicType xmm1( (~C).
load(i,j) );
2025 for(
size_t k=kbegin; k<kend; ++k ) {
2026 const IntrinsicType b1(
set( B(k,j) ) );
2027 xmm1 = xmm1 + A.load(i,k) * b1;
2030 (~C).
store( i, j, xmm1 );
// selectBlasAssignKernel overload chosen when UseDefaultKernel<MT3,MT4,MT5> holds:
// it simply forwards the three operands to selectLargeAssignKernel.
2054 template<
typename MT3
2057 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
2058 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2060 selectLargeAssignKernel( C, A, B );
// selectBlasAssignKernel overload enabled by UseSinglePrecisionKernel<MT3,MT4,MT5>.
// Dispatches on the compile-time structure of the operands:
//   - A (MT4) triangular : strmm with A applied from the left
//   - B (MT5) triangular : strmm with B applied from the right
//   - remaining call     : sgemm with scaling factors 1.0F and 0.0F
// NOTE(review): lines between the visible statements (e.g. the final else and any
// preparation of C before strmm) are missing from this excerpt.
2080 template<
typename MT3
2083 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
2084 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2086 if( IsTriangular<MT4>::value ) {
// Lower/upper flag is derived from IsLower<MT4>.
2088 strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
2090 else if( IsTriangular<MT5>::value ) {
// Lower/upper flag is derived from IsLower<MT5>.
2092 strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// Presumably the general, non-triangular branch - the enclosing else is not visible.
2095 sgemm( C, A, B, 1.0F, 0.0F );
// selectBlasAssignKernel overload enabled by UseDoublePrecisionKernel<MT3,MT4,MT5>.
// Dispatches on the compile-time structure of the operands:
//   - A (MT4) triangular : dtrmm with A applied from the left
//   - B (MT5) triangular : dtrmm with B applied from the right
//   - remaining call     : dgemm with scaling factors 1.0 and 0.0
// NOTE(review): lines between the visible statements (e.g. the final else and any
// preparation of C before dtrmm) are missing from this excerpt.
2117 template<
typename MT3
2120 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
2121 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2123 if( IsTriangular<MT4>::value ) {
// Lower/upper flag is derived from IsLower<MT4>.
2125 dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
2127 else if( IsTriangular<MT5>::value ) {
// Lower/upper flag is derived from IsLower<MT5>.
2129 dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// Presumably the general, non-triangular branch - the enclosing else is not visible.
2132 dgemm( C, A, B, 1.0, 0.0 );
// selectBlasAssignKernel overload enabled by UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Dispatches on the compile-time structure of the operands:
//   - A (MT4) triangular : ctrmm with A applied from the left
//   - B (MT5) triangular : ctrmm with B applied from the right
//   - remaining call     : cgemm with factors (1,0) and (0,0)
// All scaling factors are passed as complex<float> values.
// NOTE(review): lines between the visible statements (e.g. the final else and any
// preparation of C before ctrmm) are missing from this excerpt.
2154 template<
typename MT3
2157 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2158 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2160 if( IsTriangular<MT4>::value ) {
2162 ctrmm( C, A, CblasLeft,
2163 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2164 complex<float>( 1.0F, 0.0F ) );
2166 else if( IsTriangular<MT5>::value ) {
2168 ctrmm( C, B, CblasRight,
2169 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2170 complex<float>( 1.0F, 0.0F ) );
// Presumably the general, non-triangular branch - the enclosing else is not visible.
2173 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// selectBlasAssignKernel overload enabled by UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Dispatches on the compile-time structure of the operands:
//   - A (MT4) triangular : ztrmm with A applied from the left
//   - B (MT5) triangular : ztrmm with B applied from the right
//   - remaining call     : zgemm with factors (1,0) and (0,0)
// All scaling factors are passed as complex<double> values.
// NOTE(review): lines between the visible statements (e.g. the final else and any
// preparation of C before ztrmm) are missing from this excerpt.
2195 template<
typename MT3
2198 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
2199 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2201 if( IsTriangular<MT4>::value ) {
2203 ztrmm( C, A, CblasLeft,
2204 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
2205 complex<double>( 1.0, 0.0 ) );
2207 else if( IsTriangular<MT5>::value ) {
2209 ztrmm( C, B, CblasRight,
2210 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
2211 complex<double>( 1.0, 0.0 ) );
// Presumably the general, non-triangular branch - the enclosing else is not visible.
2214 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
// NOTE(review): the signature of this function template is missing from the excerpt
// (presumably an assign() overload taking the expression as 'rhs' - TODO confirm).
// What is visible: a temporary type is picked from ResultType/OppositeType depending
// on SO, and the expression is evaluated into it through serial().
2234 template<
typename MT
2240 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Evaluate the whole expression into a temporary of the selected type.
2252 const TmpType tmp(
serial( rhs ) );
// NOTE(review): the signature of this function template is missing from the excerpt
// (presumably the addAssign() entry point for dense targets - TODO confirm).
// What is visible: an emptiness check on the target and on the shared dimension,
// evaluation of both operands through serial(), and the hand-off to
// selectAddAssignKernel.
2271 template<
typename MT
// Guard for degenerate sizes; the statement it controls is not visible here
// (presumably an early return).
2280 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Evaluate the left and right operands of the product.
2284 LT A(
serial( rhs.lhs_ ) );
2285 RT B(
serial( rhs.rhs_ ) );
2294 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Chooses the addition-assignment kernel for C += A * B.
// If both operands are diagonal - or a second condition that is missing from this
// excerpt holds - the small kernel is called; the BLAS kernel call that follows is
// presumably the else branch (the else itself is not visible).
2310 template<
typename MT3
2313 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2315 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
2317 selectSmallAddAssignKernel( C, A, B );
2319 selectBlasAddAssignKernel( C, A, B );
// Scalar addition-assignment kernel C += A * B for the case that neither operand is
// diagonal; overload for DenseMatrix<MT3,false> targets. Loop order is i, k, j.
// NOTE(review): the fallback arms of the four range expressions (the ':' branches)
// and the guard of the final remainder update are missing from this excerpt.
2338 template<
typename MT3
2341 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2342 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2344 const size_t M( A.rows() );
2345 const size_t N( B.columns() );
2346 const size_t K( A.columns() );
2348 for(
size_t i=0UL; i<M; ++i )
// k-range for row i of A: starts at/after the diagonal for upper A, ends at/before it
// for lower A (strict variants exclude the diagonal element).
2350 const size_t kbegin( ( IsUpper<MT4>::value )
2351 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2353 const size_t kend( ( IsLower<MT4>::value )
2354 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2358 for(
size_t k=kbegin; k<kend; ++k )
// j-range for row k of B, derived the same way from the structure of B.
2360 const size_t jbegin( ( IsUpper<MT5>::value )
2361 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2363 const size_t jend( ( IsLower<MT5>::value )
2364 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos = jbegin plus jnum rounded down to an even count (mask clears the lowest bit),
// so the loop below can handle two columns per iteration.
2368 const size_t jnum( jend - jbegin );
2369 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2371 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2372 (~C)(i,j ) += A(i,k) * B(k,j );
2373 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update for the odd leftover column at jpos (its guard is not visible here).
2376 (~C)(i,jpos) += A(i,k) * B(k,jpos);
// Scalar addition-assignment kernel C += A * B for the case that neither operand is
// diagonal; overload for DenseMatrix<MT3,true> targets. Loop order is j, k, i.
// NOTE(review): the fallback arms of the four range expressions (the ':' branches)
// and the guard of the final remainder update are missing from this excerpt.
2398 template<
typename MT3
2401 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2402 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2404 const size_t M( A.rows() );
2405 const size_t N( B.columns() );
2406 const size_t K( A.columns() );
2408 for(
size_t j=0UL; j<N; ++j )
// k-range for column j of B: starts at/after the diagonal for lower B, ends at/before
// it for upper B (strict variants exclude the diagonal element).
2410 const size_t kbegin( ( IsLower<MT5>::value )
2411 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2413 const size_t kend( ( IsUpper<MT5>::value )
2414 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2418 for(
size_t k=kbegin; k<kend; ++k )
// i-range for column k of A, derived the same way from the structure of A.
2420 const size_t ibegin( ( IsLower<MT4>::value )
2421 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2423 const size_t iend( ( IsUpper<MT4>::value )
2424 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos = ibegin plus inum rounded down to an even count (mask clears the lowest bit),
// so the loop below can handle two rows per iteration.
2428 const size_t inum( iend - ibegin );
2429 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2431 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2432 (~C)(i ,j) += A(i ,k) * B(k,j);
2433 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Update for the odd leftover row at ipos (its guard is not visible here).
2436 (~C)(ipos,j) += A(ipos,k) * B(k,j);
// Scalar addition-assignment kernel for a non-diagonal A and a diagonal B; overload
// for DenseMatrix<MT3,false> targets. Only the diagonal of B is read, so each entry
// is updated as C(i,j) += A(i,j) * B(j,j). The index space is traversed in square
// tiles of 'block' x 'block' entries.
// NOTE(review): the fallback arms of jbegin/jpos (the ':' branches) are missing from
// this excerpt.
2458 template<
typename MT3
2461 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2462 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2464 const size_t M( A.rows() );
2465 const size_t N( B.columns() );
// Edge length of the square tiles.
2467 const size_t block( 16UL );
2469 for(
size_t ii=0UL; ii<M; ii+=block ) {
2470 const size_t iend(
min( M, ii+block ) );
2471 for(
size_t jj=0UL; jj<N; jj+=block ) {
2472 const size_t jend(
min( N, jj+block ) );
2473 for(
size_t i=ii; i<iend; ++i )
// Column range inside the tile, clipped by the triangular structure of A.
2475 const size_t jbegin( ( IsUpper<MT4>::value )
2476 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2478 const size_t jpos( ( IsLower<MT4>::value )
2479 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2482 for(
size_t j=jbegin; j<jpos; ++j ) {
2483 (~C)(i,j) += A(i,j) * B(j,j);
// Scalar addition-assignment kernel for a non-diagonal A and a diagonal B; overload
// for DenseMatrix<MT3,true> targets. Only the diagonal of B is read, so each entry
// is updated as C(i,j) += A(i,j) * B(j,j), two rows per loop iteration.
// NOTE(review): the fallback arms of ibegin/iend (the ':' branches) and the guard of
// the final remainder update are missing from this excerpt.
2506 template<
typename MT3
2509 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2510 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2512 const size_t M( A.rows() );
2513 const size_t N( B.columns() );
2515 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, clipped by the triangular structure of A.
2517 const size_t ibegin( ( IsLower<MT4>::value )
2518 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2520 const size_t iend( ( IsUpper<MT4>::value )
2521 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus inum rounded down to an even count (mask clears the lowest bit).
2525 const size_t inum( iend - ibegin );
2526 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2528 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2529 (~C)(i ,j) += A(i ,j) * B(j,j);
2530 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Update for the odd leftover row at ipos (its guard is not visible here).
2533 (~C)(ipos,j) += A(ipos,j) * B(j,j);
// Scalar addition-assignment kernel for a diagonal A and a non-diagonal B; overload
// for DenseMatrix<MT3,false> targets. Only the diagonal of A is read, so each entry
// is updated as C(i,j) += A(i,i) * B(i,j), two columns per loop iteration.
// NOTE(review): the fallback arms of jbegin/jend (the ':' branches) and the guard of
// the final remainder update are missing from this excerpt.
2554 template<
typename MT3
2557 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2558 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2560 const size_t M( A.rows() );
2561 const size_t N( B.columns() );
2563 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, clipped by the triangular structure of B.
2565 const size_t jbegin( ( IsUpper<MT5>::value )
2566 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2568 const size_t jend( ( IsLower<MT5>::value )
2569 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin plus jnum rounded down to an even count (mask clears the lowest bit).
2573 const size_t jnum( jend - jbegin );
2574 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2576 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2577 (~C)(i,j ) += A(i,i) * B(i,j );
2578 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Update for the odd leftover column at jpos (its guard is not visible here).
2581 (~C)(i,jpos) += A(i,i) * B(i,jpos);
// Scalar addition-assignment kernel for a diagonal A and a non-diagonal B; overload
// for DenseMatrix<MT3,true> targets. Only the diagonal of A is read, so each entry
// is updated as C(i,j) += A(i,i) * B(i,j). The index space is traversed in square
// tiles of 'block' x 'block' entries, column tiles outermost.
// NOTE(review): the fallback arms of ibegin/ipos (the ':' branches) are missing from
// this excerpt.
2602 template<
typename MT3
2605 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2606 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2608 const size_t M( A.rows() );
2609 const size_t N( B.columns() );
// Edge length of the square tiles.
2611 const size_t block( 16UL );
2613 for(
size_t jj=0UL; jj<N; jj+=block ) {
2614 const size_t jend(
min( N, jj+block ) );
2615 for(
size_t ii=0UL; ii<M; ii+=block ) {
2616 const size_t iend(
min( M, ii+block ) );
2617 for(
size_t j=jj; j<jend; ++j )
// Row range inside the tile, clipped by the triangular structure of B.
2619 const size_t ibegin( ( IsLower<MT5>::value )
2620 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2622 const size_t ipos( ( IsUpper<MT5>::value )
2623 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2626 for(
size_t i=ibegin; i<ipos; ++i ) {
2627 (~C)(i,j) += A(i,i) * B(i,j);
// Scalar addition-assignment kernel for two diagonal operands: only the diagonal
// entries of C are touched, C(i,i) += A(i,i) * B(i,i) for every row index of A.
2650 template<
typename MT3
2653 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2654 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2656 for(
size_t i=0UL; i<A.rows(); ++i ) {
2657 C(i,i) += A(i,i) * B(i,i);
// selectSmallAddAssignKernel overload used when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does NOT hold (DisableIf): forwards to the scalar selectDefaultAddAssignKernel.
2677 template<
typename MT3
2680 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2681 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2683 selectDefaultAddAssignKernel( C, A, B );
// Vectorized "small" addition-assignment kernel C += A * B; overload for
// DenseMatrix<MT3,false> targets, enabled by UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// Column panels of decreasing width (8, 4, 2 and 1 vectors of IT::size elements) are
// processed; within a panel, vectors are loaded from B and single elements of A are
// expanded with set() (presumably a broadcast - confirm against the intrinsics layer).
// NOTE(review): this excerpt omits many original lines: the j-loops, the declarations
// and stores of most xmm accumulators, closing braces and some ':' fallback arms.
// The comments below describe only what is visible.
2703 template<
typename MT3
2706 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2707 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2709 typedef IntrinsicTrait<ElementType> IT;
2711 const size_t M( A.rows() );
2712 const size_t N( B.columns() );
2713 const size_t K( A.columns() );
// Panel of eight vectors (offsets j .. j+IT::size*7), one row of C at a time.
2718 for(
size_t i=0UL; i<M; ++i )
// k-range restricted by the (strictly) upper/lower structure of A and B.
2720 const size_t kbegin( ( IsUpper<MT4>::value )
2721 ?( ( IsLower<MT5>::value )
2722 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2723 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2724 :( IsLower<MT5>::value ? j : 0UL ) );
2725 const size_t kend( ( IsLower<MT4>::value )
2726 ?( ( IsUpper<MT5>::value )
2727 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
2728 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2729 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// Accumulation starts from the current content of C (add-assign semantics).
2731 IntrinsicType xmm1( (~C).
load(i,j ) );
2740 for(
size_t k=kbegin; k<kend; ++k ) {
2741 const IntrinsicType a1(
set( A(i,k) ) );
2742 xmm1 = xmm1 + a1 * B.load(k,j );
2743 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
2744 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
2745 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
2746 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
2747 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
2748 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
2749 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
2752 (~C).
store( i, j , xmm1 );
// Panel of four vectors, two rows (i, i+1) per iteration.
2767 for( ; (i+2UL) <= M; i+=2UL )
2769 const size_t kbegin( ( IsUpper<MT4>::value )
2770 ?( ( IsLower<MT5>::value )
2771 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2772 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2773 :( IsLower<MT5>::value ? j : 0UL ) );
2774 const size_t kend( ( IsLower<MT4>::value )
2775 ?( ( IsUpper<MT5>::value )
2776 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
2777 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2778 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
2780 IntrinsicType xmm1( (~C).
load(i ,j ) );
2784 IntrinsicType xmm5( (~C).
load(i+1UL,j ) );
2789 for(
size_t k=kbegin; k<kend; ++k ) {
2790 const IntrinsicType a1(
set( A(i ,k) ) );
2791 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2792 const IntrinsicType b1( B.load(k,j ) );
2793 const IntrinsicType b2( B.load(k,j+
IT::size ) );
2794 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
2795 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
2796 xmm1 = xmm1 + a1 * b1;
2797 xmm2 = xmm2 + a1 * b2;
2798 xmm3 = xmm3 + a1 * b3;
2799 xmm4 = xmm4 + a1 * b4;
2800 xmm5 = xmm5 + a2 * b1;
2801 xmm6 = xmm6 + a2 * b2;
2802 xmm7 = xmm7 + a2 * b3;
2803 xmm8 = xmm8 + a2 * b4;
2806 (~C).
store( i , j , xmm1 );
2810 (~C).
store( i+1UL, j , xmm5 );
// Single leftover row for the four-vector panel (its guard is not visible here).
2818 const size_t kbegin( ( IsUpper<MT4>::value )
2819 ?( ( IsLower<MT5>::value )
2820 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2821 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2822 :( IsLower<MT5>::value ? j : 0UL ) );
2823 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
2825 IntrinsicType xmm1( (~C).
load(i,j ) );
2830 for(
size_t k=kbegin; k<kend; ++k ) {
2831 const IntrinsicType a1(
set( A(i,k) ) );
2832 xmm1 = xmm1 + a1 * B.load(k,j );
2833 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
2834 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
2835 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
2838 (~C).
store( i, j , xmm1 );
// Panel of two vectors, two rows (i, i+1) per iteration.
2849 for( ; (i+2UL) <= M; i+=2UL )
2851 const size_t kbegin( ( IsUpper<MT4>::value )
2852 ?( ( IsLower<MT5>::value )
2853 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2854 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2855 :( IsLower<MT5>::value ? j : 0UL ) );
2856 const size_t kend( ( IsLower<MT4>::value )
2857 ?( ( IsUpper<MT5>::value )
2858 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
2859 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2860 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
2862 IntrinsicType xmm1( (~C).
load(i ,j ) );
2864 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
2867 for(
size_t k=kbegin; k<kend; ++k ) {
2868 const IntrinsicType a1(
set( A(i ,k) ) );
2869 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2870 const IntrinsicType b1( B.load(k,j ) );
2871 const IntrinsicType b2( B.load(k,j+
IT::size) );
2872 xmm1 = xmm1 + a1 * b1;
2873 xmm2 = xmm2 + a1 * b2;
2874 xmm3 = xmm3 + a2 * b1;
2875 xmm4 = xmm4 + a2 * b2;
2878 (~C).
store( i , j , xmm1 );
2880 (~C).
store( i+1UL, j , xmm3 );
// Single leftover row for the two-vector panel (its guard is not visible here).
2886 const size_t kbegin( ( IsUpper<MT4>::value )
2887 ?( ( IsLower<MT5>::value )
2888 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2889 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2890 :( IsLower<MT5>::value ? j : 0UL ) );
2891 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
2893 IntrinsicType xmm1( (~C).
load(i,j ) );
2896 for(
size_t k=kbegin; k<kend; ++k ) {
2897 const IntrinsicType a1(
set( A(i,k) ) );
2898 xmm1 = xmm1 + a1 * B.load(k,j );
2899 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
2902 (~C).
store( i, j , xmm1 );
// Panel of a single vector, two rows (i, i+1) per iteration.
2911 for( ; (i+2UL) <= M; i+=2UL )
2913 const size_t kbegin( ( IsUpper<MT4>::value )
2914 ?( ( IsLower<MT5>::value )
2915 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2916 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2917 :( IsLower<MT5>::value ? j : 0UL ) );
// The ':' fallback arm of kend is missing from this excerpt.
2918 const size_t kend( ( IsLower<MT4>::value )
2919 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2922 IntrinsicType xmm1( (~C).
load(i ,j) );
2923 IntrinsicType xmm2( (~C).
load(i+1UL,j) );
2925 for(
size_t k=kbegin; k<kend; ++k ) {
2926 const IntrinsicType b1( B.load(k,j) );
2927 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2928 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2931 (~C).
store( i , j, xmm1 );
2932 (~C).
store( i+1UL, j, xmm2 );
// Single leftover row for the one-vector panel; k runs up to K without an upper clamp.
2937 const size_t kbegin( ( IsUpper<MT4>::value )
2938 ?( ( IsLower<MT5>::value )
2939 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2940 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2941 :( IsLower<MT5>::value ? j : 0UL ) );
2943 IntrinsicType xmm1( (~C).
load(i,j) );
2945 for(
size_t k=kbegin; k<K; ++k ) {
2946 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2949 (~C).
store( i, j, xmm1 );
// Vectorized "small" addition-assignment kernel C += A * B; overload for
// DenseMatrix<MT3,true> targets, enabled by UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// Row panels of decreasing height (8, 4, 2 and 1 vectors of IT::size elements) are
// processed; within a panel, vectors are loaded from A and single elements of B are
// expanded with set() (presumably a broadcast - confirm against the intrinsics layer).
// NOTE(review): this excerpt omits many original lines: the i-loops, the declarations
// and stores of most xmm accumulators, closing braces and some ':' fallback arms.
// The comments below describe only what is visible.
2971 template<
typename MT3
2974 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2975 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2977 typedef IntrinsicTrait<ElementType> IT;
2979 const size_t M( A.rows() );
2980 const size_t N( B.columns() );
2981 const size_t K( A.columns() );
// Panel of eight vectors (offsets i .. i+IT::size*7), one column of C at a time.
2986 for(
size_t j=0UL; j<N; ++j )
// k-range restricted by the (strictly) upper/lower structure of A and B.
2988 const size_t kbegin( ( IsLower<MT5>::value )
2989 ?( ( IsUpper<MT4>::value )
2990 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2991 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2992 :( IsUpper<MT4>::value ? i : 0UL ) );
2993 const size_t kend( ( IsUpper<MT5>::value )
2994 ?( ( IsLower<MT4>::value )
2995 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
2996 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
2997 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Accumulation starts from the current content of C (add-assign semantics).
2999 IntrinsicType xmm1( (~C).
load(i ,j) );
3008 for(
size_t k=kbegin; k<kend; ++k ) {
3009 const IntrinsicType b1(
set( B(k,j) ) );
3010 xmm1 = xmm1 + A.load(i ,k) * b1;
3011 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
3012 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
3013 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
3014 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
3015 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
3016 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
3017 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
3020 (~C).
store( i , j, xmm1 );
// Panel of four vectors, two columns (j, j+1) per iteration.
3035 for( ; (j+2UL) <= N; j+=2UL )
3037 const size_t kbegin( ( IsLower<MT5>::value )
3038 ?( ( IsUpper<MT4>::value )
3039 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3040 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3041 :( IsUpper<MT4>::value ? i : 0UL ) );
3042 const size_t kend( ( IsUpper<MT5>::value )
3043 ?( ( IsLower<MT4>::value )
3044 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3045 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3046 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
3048 IntrinsicType xmm1( (~C).
load(i ,j ) );
3052 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
3057 for(
size_t k=kbegin; k<kend; ++k ) {
3058 const IntrinsicType a1( A.load(i ,k) );
3059 const IntrinsicType a2( A.load(i+
IT::size ,k) );
3060 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
3061 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
3062 const IntrinsicType b1(
set( B(k,j ) ) );
3063 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3064 xmm1 = xmm1 + a1 * b1;
3065 xmm2 = xmm2 + a2 * b1;
3066 xmm3 = xmm3 + a3 * b1;
3067 xmm4 = xmm4 + a4 * b1;
3068 xmm5 = xmm5 + a1 * b2;
3069 xmm6 = xmm6 + a2 * b2;
3070 xmm7 = xmm7 + a3 * b2;
3071 xmm8 = xmm8 + a4 * b2;
3074 (~C).
store( i , j , xmm1 );
3078 (~C).
store( i , j+1UL, xmm5 );
// Single leftover column for the four-vector panel (its guard is not visible here).
3086 const size_t kbegin( ( IsLower<MT5>::value )
3087 ?( ( IsUpper<MT4>::value )
3088 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3089 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3090 :( IsUpper<MT4>::value ? i : 0UL ) );
3091 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
3093 IntrinsicType xmm1( (~C).
load(i ,j) );
3098 for(
size_t k=kbegin; k<kend; ++k ) {
3099 const IntrinsicType b1(
set( B(k,j) ) );
3100 xmm1 = xmm1 + A.load(i ,k) * b1;
3101 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
3102 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
3103 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
3106 (~C).
store( i , j, xmm1 );
// Panel of two vectors, two columns (j, j+1) per iteration.
3117 for( ; (j+2UL) <= N; j+=2UL )
3119 const size_t kbegin( ( IsLower<MT5>::value )
3120 ?( ( IsUpper<MT4>::value )
3121 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3122 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3123 :( IsUpper<MT4>::value ? i : 0UL ) );
3124 const size_t kend( ( IsUpper<MT5>::value )
3125 ?( ( IsLower<MT4>::value )
3126 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3127 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3128 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
3130 IntrinsicType xmm1( (~C).
load(i ,j ) );
3132 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
3135 for(
size_t k=kbegin; k<kend; ++k ) {
3136 const IntrinsicType a1( A.load(i ,k) );
3137 const IntrinsicType a2( A.load(i+
IT::size,k) );
3138 const IntrinsicType b1(
set( B(k,j ) ) );
3139 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3140 xmm1 = xmm1 + a1 * b1;
3141 xmm2 = xmm2 + a2 * b1;
3142 xmm3 = xmm3 + a1 * b2;
3143 xmm4 = xmm4 + a2 * b2;
3146 (~C).
store( i , j , xmm1 );
3148 (~C).
store( i , j+1UL, xmm3 );
// Single leftover column for the two-vector panel (its guard is not visible here).
3154 const size_t kbegin( ( IsLower<MT5>::value )
3155 ?( ( IsUpper<MT4>::value )
3156 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3157 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3158 :( IsUpper<MT4>::value ? i : 0UL ) );
3159 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
3161 IntrinsicType xmm1( (~C).
load(i ,j) );
3164 for(
size_t k=kbegin; k<kend; ++k ) {
3165 const IntrinsicType b1(
set( B(k,j) ) );
3166 xmm1 = xmm1 + A.load(i ,k) * b1;
3167 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
3170 (~C).
store( i , j, xmm1 );
// Panel of a single vector, two columns (j, j+1) per iteration.
3179 for( ; (j+2UL) <= N; j+=2UL )
3181 const size_t kbegin( ( IsLower<MT5>::value )
3182 ?( ( IsUpper<MT4>::value )
3183 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3184 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3185 :( IsUpper<MT4>::value ? i : 0UL ) );
// The ':' fallback arm of kend is missing from this excerpt.
3186 const size_t kend( ( IsUpper<MT5>::value )
3187 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3190 IntrinsicType xmm1( (~C).
load(i,j ) );
3191 IntrinsicType xmm2( (~C).
load(i,j+1UL) );
3193 for(
size_t k=kbegin; k<kend; ++k ) {
3194 const IntrinsicType a1( A.load(i,k) );
3195 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3196 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3199 (~C).
store( i, j , xmm1 );
3200 (~C).
store( i, j+1UL, xmm2 );
// Single leftover column for the one-vector panel; k runs up to K without an upper clamp.
3205 const size_t kbegin( ( IsLower<MT5>::value )
3206 ?( ( IsUpper<MT4>::value )
3207 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3208 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3209 :( IsUpper<MT4>::value ? i : 0UL ) );
3211 IntrinsicType xmm1( (~C).
load(i,j) );
3213 for(
size_t k=kbegin; k<K; ++k ) {
3214 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3217 (~C).
store( i, j, xmm1 );
// selectLargeAddAssignKernel overload used when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// does NOT hold (DisableIf): forwards to the scalar selectDefaultAddAssignKernel.
3238 template<
typename MT3
3241 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3242 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3244 selectDefaultAddAssignKernel( C, A, B );
// Blocked, vectorized addition-assignment kernel C += A * B; overload for
// DenseMatrix<MT3,false> targets, enabled by UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// C is processed in tiles of at most iblock x jblock entries (column tiles outermost)
// and the shared dimension is walked in chunks of kblock. Inside a tile, vectors are
// loaded from B and single elements of A are expanded with set() (presumably a
// broadcast - confirm against the intrinsics layer).
// NOTE(review): this excerpt omits several original lines (braces, the j-loops and the
// definitions of j1/j2/j3), so the comments below describe only what is visible.
3264 template<
typename MT3
3267 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3268 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3270 typedef IntrinsicTrait<ElementType> IT;
3272 const size_t M( A.rows() );
3273 const size_t N( B.columns() );
3274 const size_t K( A.columns() );
// Tile extents for the i, j and k loops.
3276 const size_t iblock( 64UL );
3277 const size_t jblock( 128UL );
3278 const size_t kblock( 128UL );
3280 for(
size_t jj=0UL; jj<N; jj+=jblock )
3282 const size_t jend(
min( jj+jblock, N ) );
3284 for(
size_t ii=0UL; ii<M; ii+=iblock )
3286 const size_t iend(
min( ii+iblock, M ) );
// Walk the shared dimension in chunks [kk,ktmp).
3288 for(
size_t kk=0UL; kk<K; kk+=kblock )
3290 const size_t ktmp(
min( kk+kblock, K ) );
// Two rows (i, i+1) times four vector columns (j, j1, j2, j3) per iteration.
3302 for( ; (i+2UL) <= iend; i+=2UL )
// Narrow the k-range of this chunk using the IsUpper/IsLower flags of A (MT4) and B (MT5).
3304 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3305 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3306 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3307 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// The accumulators start from the current content of C (add-assign semantics).
3309 IntrinsicType xmm1( (~C).
load(i ,j ) );
3310 IntrinsicType xmm2( (~C).
load(i ,j1) );
3311 IntrinsicType xmm3( (~C).
load(i ,j2) );
3312 IntrinsicType xmm4( (~C).
load(i ,j3) );
3313 IntrinsicType xmm5( (~C).
load(i+1UL,j ) );
3314 IntrinsicType xmm6( (~C).
load(i+1UL,j1) );
3315 IntrinsicType xmm7( (~C).
load(i+1UL,j2) );
3316 IntrinsicType xmm8( (~C).
load(i+1UL,j3) );
3318 for(
size_t k=kbegin; k<kend; ++k ) {
3319 const IntrinsicType a1(
set( A(i ,k) ) );
3320 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3321 const IntrinsicType b1( B.load(k,j ) );
3322 const IntrinsicType b2( B.load(k,j1) );
3323 const IntrinsicType b3( B.load(k,j2) );
3324 const IntrinsicType b4( B.load(k,j3) );
3325 xmm1 = xmm1 + a1 * b1;
3326 xmm2 = xmm2 + a1 * b2;
3327 xmm3 = xmm3 + a1 * b3;
3328 xmm4 = xmm4 + a1 * b4;
3329 xmm5 = xmm5 + a2 * b1;
3330 xmm6 = xmm6 + a2 * b2;
3331 xmm7 = xmm7 + a2 * b3;
3332 xmm8 = xmm8 + a2 * b4;
// Write the eight accumulated vectors back to C.
3335 (~C).
store( i , j , xmm1 );
3336 (~C).
store( i , j1, xmm2 );
3337 (~C).
store( i , j2, xmm3 );
3338 (~C).
store( i , j3, xmm4 );
3339 (~C).
store( i+1UL, j , xmm5 );
3340 (~C).
store( i+1UL, j1, xmm6 );
3341 (~C).
store( i+1UL, j2, xmm7 );
3342 (~C).
store( i+1UL, j3, xmm8 );
// One row (i) times four vector columns; the guard selecting this remainder case is
// not visible in this excerpt.
3347 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3348 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3349 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3350 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
3352 IntrinsicType xmm1( (~C).
load(i,j ) );
3353 IntrinsicType xmm2( (~C).
load(i,j1) );
3354 IntrinsicType xmm3( (~C).
load(i,j2) );
3355 IntrinsicType xmm4( (~C).
load(i,j3) );
3357 for(
size_t k=kbegin; k<kend; ++k ) {
3358 const IntrinsicType a1(
set( A(i,k) ) );
3359 xmm1 = xmm1 + a1 * B.load(k,j );
3360 xmm2 = xmm2 + a1 * B.load(k,j1);
3361 xmm3 = xmm3 + a1 * B.load(k,j2);
3362 xmm4 = xmm4 + a1 * B.load(k,j3);
3365 (~C).
store( i, j , xmm1 );
3366 (~C).
store( i, j1, xmm2 );
3367 (~C).
store( i, j2, xmm3 );
3368 (~C).
store( i, j3, xmm4 );
// Four rows (i .. i+3) times two vector columns (j, j1) per iteration.
3378 for( ; (i+4UL) <= iend; i+=4UL )
3380 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3381 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3382 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3383 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3385 IntrinsicType xmm1( (~C).
load(i ,j ) );
3386 IntrinsicType xmm2( (~C).
load(i ,j1) );
3387 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
3388 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
3389 IntrinsicType xmm5( (~C).
load(i+2UL,j ) );
3390 IntrinsicType xmm6( (~C).
load(i+2UL,j1) );
3391 IntrinsicType xmm7( (~C).
load(i+3UL,j ) );
3392 IntrinsicType xmm8( (~C).
load(i+3UL,j1) );
3394 for(
size_t k=kbegin; k<kend; ++k ) {
3395 const IntrinsicType a1(
set( A(i ,k) ) );
3396 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3397 const IntrinsicType a3(
set( A(i+2UL,k) ) );
3398 const IntrinsicType a4(
set( A(i+3UL,k) ) );
3399 const IntrinsicType b1( B.load(k,j ) );
3400 const IntrinsicType b2( B.load(k,j1) );
3401 xmm1 = xmm1 + a1 * b1;
3402 xmm2 = xmm2 + a1 * b2;
3403 xmm3 = xmm3 + a2 * b1;
3404 xmm4 = xmm4 + a2 * b2;
3405 xmm5 = xmm5 + a3 * b1;
3406 xmm6 = xmm6 + a3 * b2;
3407 xmm7 = xmm7 + a4 * b1;
3408 xmm8 = xmm8 + a4 * b2;
3411 (~C).
store( i , j , xmm1 );
3412 (~C).
store( i , j1, xmm2 );
3413 (~C).
store( i+1UL, j , xmm3 );
3414 (~C).
store( i+1UL, j1, xmm4 );
3415 (~C).
store( i+2UL, j , xmm5 );
3416 (~C).
store( i+2UL, j1, xmm6 );
3417 (~C).
store( i+3UL, j , xmm7 );
3418 (~C).
store( i+3UL, j1, xmm8 );
// Two rows times two vector columns.
3421 for( ; (i+2UL) <= iend; i+=2UL )
3423 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3424 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3425 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3426 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3428 IntrinsicType xmm1( (~C).
load(i ,j ) );
3429 IntrinsicType xmm2( (~C).
load(i ,j1) );
3430 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
3431 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
3433 for(
size_t k=kbegin; k<kend; ++k ) {
3434 const IntrinsicType a1(
set( A(i ,k) ) );
3435 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3436 const IntrinsicType b1( B.load(k,j ) );
3437 const IntrinsicType b2( B.load(k,j1) );
3438 xmm1 = xmm1 + a1 * b1;
3439 xmm2 = xmm2 + a1 * b2;
3440 xmm3 = xmm3 + a2 * b1;
3441 xmm4 = xmm4 + a2 * b2;
3444 (~C).
store( i , j , xmm1 );
3445 (~C).
store( i , j1, xmm2 );
3446 (~C).
store( i+1UL, j , xmm3 );
3447 (~C).
store( i+1UL, j1, xmm4 );
// One row times two vector columns; the guard for this remainder case is not visible here.
3452 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3453 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3454 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3455 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3457 IntrinsicType xmm1( (~C).
load(i,j ) );
3458 IntrinsicType xmm2( (~C).
load(i,j1) );
3460 for(
size_t k=kbegin; k<kend; ++k ) {
3461 const IntrinsicType a1(
set( A(i,k) ) );
3462 xmm1 = xmm1 + a1 * B.load(k,j );
3463 xmm2 = xmm2 + a1 * B.load(k,j1);
3466 (~C).
store( i, j , xmm1 );
3467 (~C).
store( i, j1, xmm2 );
// Single vector column: one row of the tile per iteration.
3473 for(
size_t i=ii; i<iend; ++i )
3475 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3476 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3477 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3478 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
3480 IntrinsicType xmm1( (~C).
load(i,j) );
3482 for(
size_t k=kbegin; k<kend; ++k ) {
3483 const IntrinsicType a1(
set( A(i,k) ) );
3484 xmm1 = xmm1 + a1 * B.load(k,j);
3487 (~C).
store( i, j, xmm1 );
// Large-matrix addition-assignment kernel, overload taking the target as
// DenseMatrix<MT3,true>; enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// The visible statements load the current values of C into IntrinsicType
// accumulators, add products of A.load(...) vectors with broadcast B(k,j)
// scalars (via set()) over a k range, and store the accumulators back into C.
// NOTE(review): this listing omits several original lines (the remaining template
// parameters, braces, the i/j loop headers and the definitions of i1, i2, i3), so
// the notes below describe only what the visible statements establish.
3512 template<
typename MT3
3515 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3516 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3518 typedef IntrinsicTrait<ElementType> IT;
3520 const size_t M( A.rows() );
3521 const size_t N( B.columns() );
3522 const size_t K( A.columns() );
// Tile extents for the row (ii), column (jj) and inner (kk) loops below.
3524 const size_t iblock( 128UL );
3525 const size_t jblock( 64UL );
3526 const size_t kblock( 128UL );
3528 for(
size_t ii=0UL; ii<M; ii+=iblock )
3530 const size_t iend(
min( ii+iblock, M ) );
3532 for(
size_t jj=0UL; jj<N; jj+=jblock )
3534 const size_t jend(
min( jj+jblock, N ) );
3536 for(
size_t kk=0UL; kk<K; kk+=kblock )
3538 const size_t ktmp(
min( kk+kblock, K ) );
// Four row positions (i, i1, i2, i3) times two columns (j, j+1): eight accumulators.
// presumably i1..i3 are i plus multiples of IT::size -- their definitions are not visible here.
3550 for( ; (j+2UL) <= jend; j+=2UL )
// kbegin/kend clamp the k range to the current k tile [kk,ktmp) and, when
// IsUpper/IsLower hold for the operand types, to the bounds chosen by the ternaries.
3552 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3553 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3554 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3555 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3557 IntrinsicType xmm1( (~C).
load(i ,j ) );
3558 IntrinsicType xmm2( (~C).
load(i1,j ) );
3559 IntrinsicType xmm3( (~C).
load(i2,j ) );
3560 IntrinsicType xmm4( (~C).
load(i3,j ) );
3561 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
3562 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
3563 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
3564 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
3566 for(
size_t k=kbegin; k<kend; ++k ) {
3567 const IntrinsicType a1( A.load(i ,k) );
3568 const IntrinsicType a2( A.load(i1,k) );
3569 const IntrinsicType a3( A.load(i2,k) );
3570 const IntrinsicType a4( A.load(i3,k) );
3571 const IntrinsicType b1(
set( B(k,j ) ) );
3572 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3573 xmm1 = xmm1 + a1 * b1;
3574 xmm2 = xmm2 + a2 * b1;
3575 xmm3 = xmm3 + a3 * b1;
3576 xmm4 = xmm4 + a4 * b1;
3577 xmm5 = xmm5 + a1 * b2;
3578 xmm6 = xmm6 + a2 * b2;
3579 xmm7 = xmm7 + a3 * b2;
3580 xmm8 = xmm8 + a4 * b2;
3583 (~C).
store( i , j , xmm1 );
3584 (~C).
store( i1, j , xmm2 );
3585 (~C).
store( i2, j , xmm3 );
3586 (~C).
store( i3, j , xmm4 );
3587 (~C).
store( i , j+1UL, xmm5 );
3588 (~C).
store( i1, j+1UL, xmm6 );
3589 (~C).
store( i2, j+1UL, xmm7 );
3590 (~C).
store( i3, j+1UL, xmm8 );
// Four row positions times a single column j (the guard for this case is not visible here).
3595 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3596 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3597 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3598 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3600 IntrinsicType xmm1( (~C).
load(i ,j) );
3601 IntrinsicType xmm2( (~C).
load(i1,j) );
3602 IntrinsicType xmm3( (~C).
load(i2,j) );
3603 IntrinsicType xmm4( (~C).
load(i3,j) );
3605 for(
size_t k=kbegin; k<kend; ++k ) {
3606 const IntrinsicType b1(
set( B(k,j) ) );
3607 xmm1 = xmm1 + A.load(i ,k) * b1;
3608 xmm2 = xmm2 + A.load(i1,k) * b1;
3609 xmm3 = xmm3 + A.load(i2,k) * b1;
3610 xmm4 = xmm4 + A.load(i3,k) * b1;
3613 (~C).
store( i , j, xmm1 );
3614 (~C).
store( i1, j, xmm2 );
3615 (~C).
store( i2, j, xmm3 );
3616 (~C).
store( i3, j, xmm4 );
// Two row positions (i, i1) times four columns (j .. j+3): eight accumulators.
3626 for( ; (j+4UL) <= jend; j+=4UL )
3628 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3629 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3630 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3631 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3633 IntrinsicType xmm1( (~C).
load(i ,j ) );
3634 IntrinsicType xmm2( (~C).
load(i1,j ) );
3635 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
3636 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
3637 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
3638 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
3639 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
3640 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
3642 for(
size_t k=kbegin; k<kend; ++k ) {
3643 const IntrinsicType a1( A.load(i ,k) );
3644 const IntrinsicType a2( A.load(i1,k) );
3645 const IntrinsicType b1(
set( B(k,j ) ) );
3646 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3647 const IntrinsicType b3(
set( B(k,j+2UL) ) );
3648 const IntrinsicType b4(
set( B(k,j+3UL) ) );
3649 xmm1 = xmm1 + a1 * b1;
3650 xmm2 = xmm2 + a2 * b1;
3651 xmm3 = xmm3 + a1 * b2;
3652 xmm4 = xmm4 + a2 * b2;
3653 xmm5 = xmm5 + a1 * b3;
3654 xmm6 = xmm6 + a2 * b3;
3655 xmm7 = xmm7 + a1 * b4;
3656 xmm8 = xmm8 + a2 * b4;
3659 (~C).
store( i , j , xmm1 );
3660 (~C).
store( i1, j , xmm2 );
3661 (~C).
store( i , j+1UL, xmm3 );
3662 (~C).
store( i1, j+1UL, xmm4 );
3663 (~C).
store( i , j+2UL, xmm5 );
3664 (~C).
store( i1, j+2UL, xmm6 );
3665 (~C).
store( i , j+3UL, xmm7 );
3666 (~C).
store( i1, j+3UL, xmm8 );
// Two row positions times two columns: four accumulators.
3669 for( ; (j+2UL) <= jend; j+=2UL )
3671 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3672 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3673 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3674 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3676 IntrinsicType xmm1( (~C).
load(i ,j ) );
3677 IntrinsicType xmm2( (~C).
load(i1,j ) );
3678 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
3679 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
3681 for(
size_t k=kbegin; k<kend; ++k ) {
3682 const IntrinsicType a1( A.load(i ,k) );
3683 const IntrinsicType a2( A.load(i1,k) );
3684 const IntrinsicType b1(
set( B(k,j ) ) );
3685 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3686 xmm1 = xmm1 + a1 * b1;
3687 xmm2 = xmm2 + a2 * b1;
3688 xmm3 = xmm3 + a1 * b2;
3689 xmm4 = xmm4 + a2 * b2;
3692 (~C).
store( i , j , xmm1 );
3693 (~C).
store( i1, j , xmm2 );
3694 (~C).
store( i , j+1UL, xmm3 );
3695 (~C).
store( i1, j+1UL, xmm4 );
// Two row positions times a single column j (guard not visible in this listing).
3700 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3701 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3702 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3703 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3705 IntrinsicType xmm1( (~C).
load(i ,j) );
3706 IntrinsicType xmm2( (~C).
load(i1,j) );
3708 for(
size_t k=kbegin; k<kend; ++k ) {
3709 const IntrinsicType b1(
set( B(k,j) ) );
3710 xmm1 = xmm1 + A.load(i ,k) * b1;
3711 xmm2 = xmm2 + A.load(i1,k) * b1;
3714 (~C).
store( i , j, xmm1 );
3715 (~C).
store( i1, j, xmm2 );
// One row position per column: a single accumulator for every j in [jj,jend).
3721 for(
size_t j=jj; j<jend; ++j )
3723 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3724 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3725 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
3726 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3728 IntrinsicType xmm1( (~C).
load(i,j) );
3730 for(
size_t k=kbegin; k<kend; ++k ) {
3731 const IntrinsicType b1(
set( B(k,j) ) );
3732 xmm1 = xmm1 + A.load(i,k) * b1;
3735 (~C).
store( i, j, xmm1 );
// Addition-assignment kernel overload enabled for UseDefaultKernel<MT3,MT4,MT5>:
// it simply forwards all three arguments to selectLargeAddAssignKernel.
// NOTE(review): the remaining template parameters and the braces are not visible in this listing.
3759 template<
typename MT3
3762 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
3763 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3765 selectLargeAddAssignKernel( C, A, B );
// Addition-assignment kernel overload enabled for UseSinglePrecisionKernel<MT3,MT4,MT5>.
// Three visible cases:
//  - MT4 triangular: strmm on `tmp` with A from the left (CblasLeft), lower/upper chosen by IsLower<MT4>.
//  - MT5 triangular: strmm on `tmp` with B from the right (CblasRight), lower/upper chosen by IsLower<MT5>.
//  - otherwise:      sgemm( C, A, B, 1.0F, 1.0F ).
// NOTE(review): the declaration of `tmp` and whatever combines `tmp` with C are not
// visible in this listing; the two 1.0F arguments of sgemm are presumably the
// alpha/beta scaling factors -- confirm against the sgemm wrapper.
3785 template<
typename MT3
3788 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
3789 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3791 if( IsTriangular<MT4>::value ) {
3793 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
3796 else if( IsTriangular<MT5>::value ) {
3798 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
3802 sgemm( C, A, B, 1.0F, 1.0F );
// Addition-assignment kernel overload enabled for UseDoublePrecisionKernel<MT3,MT4,MT5>.
// Three visible cases:
//  - MT4 triangular: dtrmm on `tmp` with A from the left (CblasLeft), lower/upper chosen by IsLower<MT4>.
//  - MT5 triangular: dtrmm on `tmp` with B from the right (CblasRight), lower/upper chosen by IsLower<MT5>.
//  - otherwise:      dgemm( C, A, B, 1.0, 1.0 ).
// NOTE(review): the declaration of `tmp` and whatever combines `tmp` with C are not
// visible in this listing; the two 1.0 arguments of dgemm are presumably the
// alpha/beta scaling factors -- confirm against the dgemm wrapper.
3824 template<
typename MT3
3827 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
3828 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3830 if( IsTriangular<MT4>::value ) {
3832 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
3835 else if( IsTriangular<MT5>::value ) {
3837 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
3841 dgemm( C, A, B, 1.0, 1.0 );
// Addition-assignment kernel overload enabled for UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// Three visible cases:
//  - MT4 triangular: ctrmm on `tmp` with A from the left (CblasLeft), lower/upper chosen by IsLower<MT4>.
//  - MT5 triangular: ctrmm on `tmp` with B from the right (CblasRight), lower/upper chosen by IsLower<MT5>.
//  - otherwise:      cgemm on C, A, B with two complex<float>( 1.0F, 0.0F ) scalars.
// NOTE(review): the declaration of `tmp` and whatever combines `tmp` with C are not
// visible in this listing; the two scalars passed to cgemm are presumably the
// alpha/beta scaling factors -- confirm against the cgemm wrapper.
3863 template<
typename MT3
3866 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3867 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3869 if( IsTriangular<MT4>::value ) {
3871 ctrmm( tmp, A, CblasLeft,
3872 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
3873 complex<float>( 1.0F, 0.0F ) );
3876 else if( IsTriangular<MT5>::value ) {
3878 ctrmm( tmp, B, CblasRight,
3879 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
3880 complex<float>( 1.0F, 0.0F ) );
3884 cgemm( C, A, B, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// Addition-assignment kernel overload enabled for UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// Three visible cases:
//  - MT4 triangular: ztrmm on `tmp` with A from the left (CblasLeft), lower/upper chosen by IsLower<MT4>.
//  - MT5 triangular: ztrmm on `tmp` with B from the right (CblasRight), lower/upper chosen by IsLower<MT5>.
//  - otherwise:      zgemm on C, A, B with two complex<double>( 1.0, 0.0 ) scalars.
// NOTE(review): the declaration of `tmp` and whatever combines `tmp` with C are not
// visible in this listing; the two scalars passed to zgemm are presumably the
// alpha/beta scaling factors -- confirm against the zgemm wrapper.
3906 template<
typename MT3
3909 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
3910 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3912 if( IsTriangular<MT4>::value ) {
3914 ztrmm( tmp, A, CblasLeft,
3915 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
3916 complex<double>( 1.0, 0.0 ) );
3919 else if( IsTriangular<MT5>::value ) {
3921 ztrmm( tmp, B, CblasRight,
3922 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
3923 complex<double>( 1.0, 0.0 ) );
3927 zgemm( C, A, B, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Entry point that hands a product expression `rhs` and a target `lhs` to
// selectSubAssignKernel.
// NOTE(review): the function header is not visible in this listing; presumably this
// is the subtraction-assignment (subAssign) overload for this expression -- confirm.
3951 template<
typename MT
// Empty-dimension check; the statement it guards is not visible here
// (presumably an early return -- confirm).
3960 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Both operands are wrapped in serial() before being bound to A and B.
3964 LT A(
serial( rhs.lhs_ ) );
3965 RT B(
serial( rhs.rhs_ ) );
3974 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Dispatcher for the subtraction assignment: chooses between
// selectSmallSubAssignKernel and selectBlasSubAssignKernel.
// NOTE(review): only the first operand of the `||` condition (both operand types
// diagonal) is visible; the second operand and the `else` are missing from this listing.
3990 template<
typename MT3
3993 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3995 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
3997 selectSmallSubAssignKernel( C, A, B );
3999 selectBlasSubAssignKernel( C, A, B );
// Scalar subtraction-assignment kernel for two non-diagonal operands, overload
// taking the target as DenseMatrix<MT3,false>. Loop nest i -> k -> j performing
// C(i,j) -= A(i,k) * B(k,j), with the j loop unrolled by two.
// NOTE(review): the `else` branches of the bound ternaries, the braces and the guard
// in front of the final leftover statement are not visible in this listing.
4018 template<
typename MT3
4021 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4022 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4024 const size_t M( A.rows() );
4025 const size_t N( B.columns() );
4026 const size_t K( A.columns() );
4028 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed when MT4 is (strictly) upper or lower.
4030 const size_t kbegin( ( IsUpper<MT4>::value )
4031 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4033 const size_t kend( ( IsLower<MT4>::value )
4034 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4038 for(
size_t k=kbegin; k<kend; ++k )
// j range for row k of B, narrowed when MT5 is (strictly) upper or lower.
4040 const size_t jbegin( ( IsUpper<MT5>::value )
4041 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4043 const size_t jend( ( IsLower<MT5>::value )
4044 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// `jnum & size_t(-2)` clears the lowest bit, so jpos ends the largest even-length prefix.
4048 const size_t jnum( jend - jbegin );
4049 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4051 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4052 (~C)(i,j ) -= A(i,k) * B(k,j );
4053 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover element at jpos (presumably guarded by a jpos < jend check -- not visible here).
4056 (~C)(i,jpos) -= A(i,k) * B(k,jpos);
// Scalar subtraction-assignment kernel for two non-diagonal operands, overload
// taking the target as DenseMatrix<MT3,true>. Loop nest j -> k -> i performing
// C(i,j) -= A(i,k) * B(k,j), with the i loop unrolled by two.
// NOTE(review): the `else` branches of the bound ternaries, the braces and the guard
// in front of the final leftover statement are not visible in this listing.
4078 template<
typename MT3
4081 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4082 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4084 const size_t M( A.rows() );
4085 const size_t N( B.columns() );
4086 const size_t K( A.columns() );
4088 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed when MT5 is (strictly) lower or upper.
4090 const size_t kbegin( ( IsLower<MT5>::value )
4091 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4093 const size_t kend( ( IsUpper<MT5>::value )
4094 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4098 for(
size_t k=kbegin; k<kend; ++k )
// i range for column k of A, narrowed when MT4 is (strictly) lower or upper.
4100 const size_t ibegin( ( IsLower<MT4>::value )
4101 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4103 const size_t iend( ( IsUpper<MT4>::value )
4104 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// `inum & size_t(-2)` clears the lowest bit, so ipos ends the largest even-length prefix.
4108 const size_t inum( iend - ibegin );
4109 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4111 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4112 (~C)(i ,j) -= A(i ,k) * B(k,j);
4113 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover element at ipos (presumably guarded by an ipos < iend check -- not visible here).
4116 (~C)(ipos,j) -= A(ipos,k) * B(k,j);
// Scalar subtraction-assignment kernel for a non-diagonal A and a diagonal B,
// overload taking the target as DenseMatrix<MT3,false>. Only the diagonal entry
// B(j,j) is read: C(i,j) -= A(i,j) * B(j,j). The index space is traversed in
// square tiles of `block` x `block` elements.
// NOTE(review): the `else` branches of the bound ternaries and the braces are not
// visible in this listing.
4138 template<
typename MT3
4141 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4142 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4144 const size_t M( A.rows() );
4145 const size_t N( B.columns() );
4147 const size_t block( 16UL );
4149 for(
size_t ii=0UL; ii<M; ii+=block ) {
4150 const size_t iend(
min( M, ii+block ) );
4151 for(
size_t jj=0UL; jj<N; jj+=block ) {
4152 const size_t jend(
min( N, jj+block ) );
4153 for(
size_t i=ii; i<iend; ++i )
// Column range of the tile for row i, narrowed when MT4 is (strictly) upper or lower.
4155 const size_t jbegin( ( IsUpper<MT4>::value )
4156 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4158 const size_t jpos( ( IsLower<MT4>::value )
4159 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4162 for(
size_t j=jbegin; j<jpos; ++j ) {
4163 (~C)(i,j) -= A(i,j) * B(j,j);
// Scalar subtraction-assignment kernel for a non-diagonal A and a diagonal B,
// overload taking the target as DenseMatrix<MT3,true>. For every column j only
// B(j,j) is read: C(i,j) -= A(i,j) * B(j,j), with the i loop unrolled by two.
// NOTE(review): the `else` branches of the bound ternaries, the braces and the guard
// in front of the final leftover statement are not visible in this listing.
4186 template<
typename MT3
4189 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4190 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4192 const size_t M( A.rows() );
4193 const size_t N( B.columns() );
4195 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when MT4 is (strictly) lower or upper.
4197 const size_t ibegin( ( IsLower<MT4>::value )
4198 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4200 const size_t iend( ( IsUpper<MT4>::value )
4201 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// `inum & size_t(-2)` clears the lowest bit, so ipos ends the largest even-length prefix.
4205 const size_t inum( iend - ibegin );
4206 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4208 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4209 (~C)(i ,j) -= A(i ,j) * B(j,j);
4210 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Leftover element at ipos (presumably guarded by an ipos < iend check -- not visible here).
4213 (~C)(ipos,j) -= A(ipos,j) * B(j,j);
// Scalar subtraction-assignment kernel for a diagonal A and a non-diagonal B,
// overload taking the target as DenseMatrix<MT3,false>. For every row i only
// A(i,i) is read: C(i,j) -= A(i,i) * B(i,j), with the j loop unrolled by two.
// NOTE(review): the `else` branches of the bound ternaries, the braces and the guard
// in front of the final leftover statement are not visible in this listing.
4234 template<
typename MT3
4237 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4238 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4240 const size_t M( A.rows() );
4241 const size_t N( B.columns() );
4243 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed when MT5 is (strictly) upper or lower.
4245 const size_t jbegin( ( IsUpper<MT5>::value )
4246 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4248 const size_t jend( ( IsLower<MT5>::value )
4249 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// `jnum & size_t(-2)` clears the lowest bit, so jpos ends the largest even-length prefix.
4253 const size_t jnum( jend - jbegin );
4254 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4256 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4257 (~C)(i,j ) -= A(i,i) * B(i,j );
4258 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Leftover element at jpos (presumably guarded by a jpos < jend check -- not visible here).
4261 (~C)(i,jpos) -= A(i,i) * B(i,jpos);
// Scalar subtraction-assignment kernel for a diagonal A and a non-diagonal B,
// overload taking the target as DenseMatrix<MT3,true>. Only the diagonal entry
// A(i,i) is read: C(i,j) -= A(i,i) * B(i,j). The index space is traversed in
// square tiles of `block` x `block` elements, columns outermost.
// NOTE(review): the `else` branches of the bound ternaries and the braces are not
// visible in this listing.
4282 template<
typename MT3
4285 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4286 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4288 const size_t M( A.rows() );
4289 const size_t N( B.columns() );
4291 const size_t block( 16UL );
4293 for(
size_t jj=0UL; jj<N; jj+=block ) {
4294 const size_t jend(
min( N, jj+block ) );
4295 for(
size_t ii=0UL; ii<M; ii+=block ) {
4296 const size_t iend(
min( M, ii+block ) );
4297 for(
size_t j=jj; j<jend; ++j )
// Row range of the tile for column j, narrowed when MT5 is (strictly) lower or upper.
4299 const size_t ibegin( ( IsLower<MT5>::value )
4300 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4302 const size_t ipos( ( IsUpper<MT5>::value )
4303 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4306 for(
size_t i=ibegin; i<ipos; ++i ) {
4307 (~C)(i,j) -= A(i,i) * B(i,j);
// Scalar subtraction-assignment kernel for two diagonal operands: only the
// diagonal of C is touched, C(i,i) -= A(i,i) * B(i,i) for i in [0, A.rows()).
// NOTE(review): the remaining template parameters and the braces are not visible in this listing.
4330 template<
typename MT3
4333 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4334 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4336 for(
size_t i=0UL; i<A.rows(); ++i ) {
4337 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, fallback overload: when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf), it forwards
// all three arguments to selectDefaultSubAssignKernel.
// NOTE(review): the remaining template parameters and the braces are not visible in this listing.
4357 template<
typename MT3
4360 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4361 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4363 selectDefaultSubAssignKernel( C, A, B );
// Small-matrix vectorized subtraction-assignment kernel, overload taking the
// target as DenseMatrix<MT3,false>; enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// Pattern of every visible section: load accumulators from C, subtract
// set( A(i,k) ) * B.load(k, j + m*IT::size) over a k range, store back to C.
// NOTE(review): this listing omits many original lines (remaining template
// parameters, braces, the outer j loops, and most accumulator loads/stores such as
// xmm2..xmm8), so the notes below describe only what the visible statements establish.
4383 template<
typename MT3
4386 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4387 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4389 typedef IntrinsicTrait<ElementType> IT;
4391 const size_t M( A.rows() );
4392 const size_t N( B.columns() );
4393 const size_t K( A.columns() );
// One row per iteration, eight column vectors (j .. j+IT::size*7).
4398 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend narrow the k range according to the IsUpper/IsLower (and strict
// variants) properties of MT4 and MT5; without them the range is [0,K).
4400 const size_t kbegin( ( IsUpper<MT4>::value )
4401 ?( ( IsLower<MT5>::value )
4402 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4403 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4404 :( IsLower<MT5>::value ? j : 0UL ) );
4405 const size_t kend( ( IsLower<MT4>::value )
4406 ?( ( IsUpper<MT5>::value )
4407 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
4408 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4409 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
4411 IntrinsicType xmm1( (~C).
load(i,j ) );
4420 for(
size_t k=kbegin; k<kend; ++k ) {
4421 const IntrinsicType a1(
set( A(i,k) ) );
4422 xmm1 = xmm1 - a1 * B.load(k,j );
4423 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
4424 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
4425 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
4426 xmm5 = xmm5 - a1 * B.load(k,j+
IT::size*4UL);
4427 xmm6 = xmm6 - a1 * B.load(k,j+
IT::size*5UL);
4428 xmm7 = xmm7 - a1 * B.load(k,j+
IT::size*6UL);
4429 xmm8 = xmm8 - a1 * B.load(k,j+
IT::size*7UL);
4432 (~C).
store( i, j , xmm1 );
// Two rows (i, i+1) per iteration, four column vectors each.
4447 for( ; (i+2UL) <= M; i+=2UL )
4449 const size_t kbegin( ( IsUpper<MT4>::value )
4450 ?( ( IsLower<MT5>::value )
4451 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4452 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4453 :( IsLower<MT5>::value ? j : 0UL ) );
4454 const size_t kend( ( IsLower<MT4>::value )
4455 ?( ( IsUpper<MT5>::value )
4456 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
4457 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4458 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
4460 IntrinsicType xmm1( (~C).
load(i ,j ) );
4464 IntrinsicType xmm5( (~C).
load(i+1UL,j ) );
4469 for(
size_t k=kbegin; k<kend; ++k ) {
4470 const IntrinsicType a1(
set( A(i ,k) ) );
4471 const IntrinsicType a2(
set( A(i+1UL,k) ) );
4472 const IntrinsicType b1( B.load(k,j ) );
4473 const IntrinsicType b2( B.load(k,j+
IT::size ) );
4474 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
4475 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
4476 xmm1 = xmm1 - a1 * b1;
4477 xmm2 = xmm2 - a1 * b2;
4478 xmm3 = xmm3 - a1 * b3;
4479 xmm4 = xmm4 - a1 * b4;
4480 xmm5 = xmm5 - a2 * b1;
4481 xmm6 = xmm6 - a2 * b2;
4482 xmm7 = xmm7 - a2 * b3;
4483 xmm8 = xmm8 - a2 * b4;
4486 (~C).
store( i , j , xmm1 );
4490 (~C).
store( i+1UL, j , xmm5 );
// Single leftover row, four column vectors (guard not visible in this listing).
// Here kend depends only on IsUpper<MT5>.
4498 const size_t kbegin( ( IsUpper<MT4>::value )
4499 ?( ( IsLower<MT5>::value )
4500 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4501 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4502 :( IsLower<MT5>::value ? j : 0UL ) );
4503 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
4505 IntrinsicType xmm1( (~C).
load(i,j ) );
4510 for(
size_t k=kbegin; k<kend; ++k ) {
4511 const IntrinsicType a1(
set( A(i,k) ) );
4512 xmm1 = xmm1 - a1 * B.load(k,j );
4513 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
4514 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
4515 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
4518 (~C).
store( i, j , xmm1 );
// Two rows per iteration, two column vectors each.
4529 for( ; (i+2UL) <= M; i+=2UL )
4531 const size_t kbegin( ( IsUpper<MT4>::value )
4532 ?( ( IsLower<MT5>::value )
4533 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4534 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4535 :( IsLower<MT5>::value ? j : 0UL ) );
4536 const size_t kend( ( IsLower<MT4>::value )
4537 ?( ( IsUpper<MT5>::value )
4538 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
4539 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4540 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
4542 IntrinsicType xmm1( (~C).
load(i ,j ) );
4544 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
4547 for(
size_t k=kbegin; k<kend; ++k ) {
4548 const IntrinsicType a1(
set( A(i ,k) ) );
4549 const IntrinsicType a2(
set( A(i+1UL,k) ) );
4550 const IntrinsicType b1( B.load(k,j ) );
4551 const IntrinsicType b2( B.load(k,j+
IT::size) );
4552 xmm1 = xmm1 - a1 * b1;
4553 xmm2 = xmm2 - a1 * b2;
4554 xmm3 = xmm3 - a2 * b1;
4555 xmm4 = xmm4 - a2 * b2;
4558 (~C).
store( i , j , xmm1 );
4560 (~C).
store( i+1UL, j , xmm3 );
// Single leftover row, two column vectors (guard not visible in this listing).
4566 const size_t kbegin( ( IsUpper<MT4>::value )
4567 ?( ( IsLower<MT5>::value )
4568 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4569 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4570 :( IsLower<MT5>::value ? j : 0UL ) );
4571 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
4573 IntrinsicType xmm1( (~C).
load(i,j ) );
4576 for(
size_t k=kbegin; k<kend; ++k ) {
4577 const IntrinsicType a1(
set( A(i,k) ) );
4578 xmm1 = xmm1 - a1 * B.load(k,j );
4579 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size);
4582 (~C).
store( i, j , xmm1 );
// Two rows per iteration, one column vector each.
4591 for( ; (i+2UL) <= M; i+=2UL )
4593 const size_t kbegin( ( IsUpper<MT4>::value )
4594 ?( ( IsLower<MT5>::value )
4595 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4596 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4597 :( IsLower<MT5>::value ? j : 0UL ) );
4598 const size_t kend( ( IsLower<MT4>::value )
4599 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4602 IntrinsicType xmm1( (~C).
load(i ,j) );
4603 IntrinsicType xmm2( (~C).
load(i+1UL,j) );
4605 for(
size_t k=kbegin; k<kend; ++k ) {
4606 const IntrinsicType b1( B.load(k,j) );
4607 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
4608 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
4611 (~C).
store( i , j, xmm1 );
4612 (~C).
store( i+1UL, j, xmm2 );
// Single leftover row, one column vector; the k loop runs up to K without an upper clamp.
4617 const size_t kbegin( ( IsUpper<MT4>::value )
4618 ?( ( IsLower<MT5>::value )
4619 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4620 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4621 :( IsLower<MT5>::value ? j : 0UL ) );
4623 IntrinsicType xmm1( (~C).
load(i,j) );
4625 for(
size_t k=kbegin; k<K; ++k ) {
4626 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
4629 (~C).
store( i, j, xmm1 );
// Small-matrix vectorized subtraction-assignment kernel, overload taking the
// target as DenseMatrix<MT3,true>; enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// Pattern of every visible section: load accumulators from C, subtract
// A.load(i + m*IT::size, k) * set( B(k,j) ) over a k range, store back to C.
// NOTE(review): this listing omits many original lines (remaining template
// parameters, braces, the outer i loops, and most accumulator loads/stores such as
// xmm2..xmm8), so the notes below describe only what the visible statements establish.
4651 template<
typename MT3
4654 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4655 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4657 typedef IntrinsicTrait<ElementType> IT;
4659 const size_t M( A.rows() );
4660 const size_t N( B.columns() );
4661 const size_t K( A.columns() );
// One column per iteration, eight row vectors (i .. i+IT::size*7).
4666 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend narrow the k range according to the IsLower/IsUpper (and strict
// variants) properties of MT5 and MT4; without them the range is [0,K).
4668 const size_t kbegin( ( IsLower<MT5>::value )
4669 ?( ( IsUpper<MT4>::value )
4670 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4671 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4672 :( IsUpper<MT4>::value ? i : 0UL ) );
4673 const size_t kend( ( IsUpper<MT5>::value )
4674 ?( ( IsLower<MT4>::value )
4675 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4676 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4677 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
4679 IntrinsicType xmm1( (~C).
load(i ,j) );
4688 for(
size_t k=kbegin; k<kend; ++k ) {
4689 const IntrinsicType b1(
set( B(k,j) ) );
4690 xmm1 = xmm1 - A.load(i ,k) * b1;
4691 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
4692 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
4693 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
4694 xmm5 = xmm5 - A.load(i+
IT::size*4UL,k) * b1;
4695 xmm6 = xmm6 - A.load(i+
IT::size*5UL,k) * b1;
4696 xmm7 = xmm7 - A.load(i+
IT::size*6UL,k) * b1;
4697 xmm8 = xmm8 - A.load(i+
IT::size*7UL,k) * b1;
4700 (~C).
store( i , j, xmm1 );
// Two columns (j, j+1) per iteration, four row vectors each.
4715 for( ; (j+2UL) <= N; j+=2UL )
4717 const size_t kbegin( ( IsLower<MT5>::value )
4718 ?( ( IsUpper<MT4>::value )
4719 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4720 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4721 :( IsUpper<MT4>::value ? i : 0UL ) );
4722 const size_t kend( ( IsUpper<MT5>::value )
4723 ?( ( IsLower<MT4>::value )
4724 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4725 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4726 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
4728 IntrinsicType xmm1( (~C).
load(i ,j ) );
4732 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
4737 for(
size_t k=kbegin; k<kend; ++k ) {
4738 const IntrinsicType a1( A.load(i ,k) );
4739 const IntrinsicType a2( A.load(i+
IT::size ,k) );
4740 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
4741 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
4742 const IntrinsicType b1(
set( B(k,j ) ) );
4743 const IntrinsicType b2(
set( B(k,j+1UL) ) );
4744 xmm1 = xmm1 - a1 * b1;
4745 xmm2 = xmm2 - a2 * b1;
4746 xmm3 = xmm3 - a3 * b1;
4747 xmm4 = xmm4 - a4 * b1;
4748 xmm5 = xmm5 - a1 * b2;
4749 xmm6 = xmm6 - a2 * b2;
4750 xmm7 = xmm7 - a3 * b2;
4751 xmm8 = xmm8 - a4 * b2;
4754 (~C).
store( i , j , xmm1 );
4758 (~C).
store( i , j+1UL, xmm5 );
// Single leftover column, four row vectors (guard not visible in this listing).
// Here kend depends only on IsLower<MT4>.
4766 const size_t kbegin( ( IsLower<MT5>::value )
4767 ?( ( IsUpper<MT4>::value )
4768 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4769 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4770 :( IsUpper<MT4>::value ? i : 0UL ) );
4771 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
4773 IntrinsicType xmm1( (~C).
load(i ,j) );
4778 for(
size_t k=kbegin; k<kend; ++k ) {
4779 const IntrinsicType b1(
set( B(k,j) ) );
4780 xmm1 = xmm1 - A.load(i ,k) * b1;
4781 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
4782 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
4783 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
4786 (~C).
store( i , j, xmm1 );
// Two columns per iteration, two row vectors each.
4797 for( ; (j+2UL) <= N; j+=2UL )
4799 const size_t kbegin( ( IsLower<MT5>::value )
4800 ?( ( IsUpper<MT4>::value )
4801 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4802 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4803 :( IsUpper<MT4>::value ? i : 0UL ) );
4804 const size_t kend( ( IsUpper<MT5>::value )
4805 ?( ( IsLower<MT4>::value )
4806 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4807 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4808 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
4810 IntrinsicType xmm1( (~C).
load(i ,j ) );
4812 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
4815 for(
size_t k=kbegin; k<kend; ++k ) {
4816 const IntrinsicType a1( A.load(i ,k) );
4817 const IntrinsicType a2( A.load(i+
IT::size,k) );
4818 const IntrinsicType b1(
set( B(k,j ) ) );
4819 const IntrinsicType b2(
set( B(k,j+1UL) ) );
4820 xmm1 = xmm1 - a1 * b1;
4821 xmm2 = xmm2 - a2 * b1;
4822 xmm3 = xmm3 - a1 * b2;
4823 xmm4 = xmm4 - a2 * b2;
4826 (~C).
store( i , j , xmm1 );
4828 (~C).
store( i , j+1UL, xmm3 );
// Single leftover column, two row vectors (guard not visible in this listing).
4834 const size_t kbegin( ( IsLower<MT5>::value )
4835 ?( ( IsUpper<MT4>::value )
4836 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4837 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4838 :( IsUpper<MT4>::value ? i : 0UL ) );
4839 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
4841 IntrinsicType xmm1( (~C).
load(i ,j) );
4844 for(
size_t k=kbegin; k<kend; ++k ) {
4845 const IntrinsicType b1(
set( B(k,j) ) );
4846 xmm1 = xmm1 - A.load(i ,k) * b1;
4847 xmm2 = xmm2 - A.load(i+
IT::size,k) * b1;
4850 (~C).
store( i , j, xmm1 );
// Two columns per iteration, one row vector each.
4859 for( ; (j+2UL) <= N; j+=2UL )
4861 const size_t kbegin( ( IsLower<MT5>::value )
4862 ?( ( IsUpper<MT4>::value )
4863 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4864 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4865 :( IsUpper<MT4>::value ? i : 0UL ) );
4866 const size_t kend( ( IsUpper<MT5>::value )
4867 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4870 IntrinsicType xmm1( (~C).
load(i,j ) );
4871 IntrinsicType xmm2( (~C).
load(i,j+1UL) );
4873 for(
size_t k=kbegin; k<kend; ++k ) {
4874 const IntrinsicType a1( A.load(i,k) );
4875 xmm1 = xmm1 - a1 *
set( B(k,j ) );
4876 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
4879 (~C).
store( i, j , xmm1 );
4880 (~C).
store( i, j+1UL, xmm2 );
// Single leftover column, one row vector; the k loop runs up to K without an upper clamp.
4885 const size_t kbegin( ( IsLower<MT5>::value )
4886 ?( ( IsUpper<MT4>::value )
4887 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4888 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4889 :( IsUpper<MT4>::value ? i : 0UL ) );
4891 IntrinsicType xmm1( (~C).
load(i,j) );
4893 for(
size_t k=kbegin; k<K; ++k ) {
4894 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
4897 (~C).
store( i, j, xmm1 );
// Large-matrix subtraction-assignment kernel, fallback overload: when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold (DisableIf), it forwards
// all three arguments to selectDefaultSubAssignKernel.
// NOTE(review): the remaining template parameters and the braces are not visible in this listing.
4918 template<
typename MT3
4921 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4922 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4924 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix vectorized subtraction-assignment kernel, overload taking the
// target as DenseMatrix<MT3,false>; enabled only if UseVectorizedDefaultKernel<MT3,MT4,MT5>.
// The visible statements load the current values of C into IntrinsicType
// accumulators, subtract products of broadcast A(i,k) scalars (via set()) with
// B.load(...) vectors over a k range, and store the accumulators back into C.
// NOTE(review): this listing omits several original lines (the remaining template
// parameters, braces, the i/j loop headers and the definitions of j1, j2, j3), so
// the notes below describe only what the visible statements establish.
4944 template<
typename MT3
4947 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
4948 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4950 typedef IntrinsicTrait<ElementType> IT;
4952 const size_t M( A.rows() );
4953 const size_t N( B.columns() );
4954 const size_t K( A.columns() );
// Tile extents for the row (ii), column (jj) and inner (kk) loops below.
4956 const size_t iblock( 64UL );
4957 const size_t jblock( 128UL );
4958 const size_t kblock( 128UL );
4960 for(
size_t jj=0UL; jj<N; jj+=jblock )
4962 const size_t jend(
min( jj+jblock, N ) );
4964 for(
size_t ii=0UL; ii<M; ii+=iblock )
4966 const size_t iend(
min( ii+iblock, M ) );
4968 for(
size_t kk=0UL; kk<K; kk+=kblock )
4970 const size_t ktmp(
min( kk+kblock, K ) );
// Two rows (i, i+1) times four column positions (j, j1, j2, j3): eight accumulators.
// presumably j1..j3 are j plus multiples of IT::size -- their definitions are not visible here.
4982 for( ; (i+2UL) <= iend; i+=2UL )
// kbegin/kend clamp the k range to the current k tile [kk,ktmp) and, when
// IsUpper/IsLower hold for the operand types, to the bounds chosen by the ternaries.
4984 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
4985 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
4986 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
4987 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
4989 IntrinsicType xmm1( (~C).
load(i ,j ) );
4990 IntrinsicType xmm2( (~C).
load(i ,j1) );
4991 IntrinsicType xmm3( (~C).
load(i ,j2) );
4992 IntrinsicType xmm4( (~C).
load(i ,j3) );
4993 IntrinsicType xmm5( (~C).
load(i+1UL,j ) );
4994 IntrinsicType xmm6( (~C).
load(i+1UL,j1) );
4995 IntrinsicType xmm7( (~C).
load(i+1UL,j2) );
4996 IntrinsicType xmm8( (~C).
load(i+1UL,j3) );
4998 for(
size_t k=kbegin; k<kend; ++k ) {
4999 const IntrinsicType a1(
set( A(i ,k) ) );
5000 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5001 const IntrinsicType b1( B.load(k,j ) );
5002 const IntrinsicType b2( B.load(k,j1) );
5003 const IntrinsicType b3( B.load(k,j2) );
5004 const IntrinsicType b4( B.load(k,j3) );
5005 xmm1 = xmm1 - a1 * b1;
5006 xmm2 = xmm2 - a1 * b2;
5007 xmm3 = xmm3 - a1 * b3;
5008 xmm4 = xmm4 - a1 * b4;
5009 xmm5 = xmm5 - a2 * b1;
5010 xmm6 = xmm6 - a2 * b2;
5011 xmm7 = xmm7 - a2 * b3;
5012 xmm8 = xmm8 - a2 * b4;
5015 (~C).
store( i , j , xmm1 );
5016 (~C).
store( i , j1, xmm2 );
5017 (~C).
store( i , j2, xmm3 );
5018 (~C).
store( i , j3, xmm4 );
5019 (~C).
store( i+1UL, j , xmm5 );
5020 (~C).
store( i+1UL, j1, xmm6 );
5021 (~C).
store( i+1UL, j2, xmm7 );
5022 (~C).
store( i+1UL, j3, xmm8 );
// Single leftover row times four column positions (guard not visible in this listing).
5027 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5028 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5029 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5030 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
5032 IntrinsicType xmm1( (~C).
load(i,j ) );
5033 IntrinsicType xmm2( (~C).
load(i,j1) );
5034 IntrinsicType xmm3( (~C).
load(i,j2) );
5035 IntrinsicType xmm4( (~C).
load(i,j3) );
5037 for(
size_t k=kbegin; k<kend; ++k ) {
5038 const IntrinsicType a1(
set( A(i,k) ) );
5039 xmm1 = xmm1 - a1 * B.load(k,j );
5040 xmm2 = xmm2 - a1 * B.load(k,j1);
5041 xmm3 = xmm3 - a1 * B.load(k,j2);
5042 xmm4 = xmm4 - a1 * B.load(k,j3);
5045 (~C).
store( i, j , xmm1 );
5046 (~C).
store( i, j1, xmm2 );
5047 (~C).
store( i, j2, xmm3 );
5048 (~C).
store( i, j3, xmm4 );
// Four rows (i .. i+3) times two column positions (j, j1): eight accumulators.
5058 for( ; (i+4UL) <= iend; i+=4UL )
5060 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5061 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5062 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5063 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5065 IntrinsicType xmm1( (~C).
load(i ,j ) );
5066 IntrinsicType xmm2( (~C).
load(i ,j1) );
5067 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
5068 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
5069 IntrinsicType xmm5( (~C).
load(i+2UL,j ) );
5070 IntrinsicType xmm6( (~C).
load(i+2UL,j1) );
5071 IntrinsicType xmm7( (~C).
load(i+3UL,j ) );
5072 IntrinsicType xmm8( (~C).
load(i+3UL,j1) );
5074 for(
size_t k=kbegin; k<kend; ++k ) {
5075 const IntrinsicType a1(
set( A(i ,k) ) );
5076 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5077 const IntrinsicType a3(
set( A(i+2UL,k) ) );
5078 const IntrinsicType a4(
set( A(i+3UL,k) ) );
5079 const IntrinsicType b1( B.load(k,j ) );
5080 const IntrinsicType b2( B.load(k,j1) );
5081 xmm1 = xmm1 - a1 * b1;
5082 xmm2 = xmm2 - a1 * b2;
5083 xmm3 = xmm3 - a2 * b1;
5084 xmm4 = xmm4 - a2 * b2;
5085 xmm5 = xmm5 - a3 * b1;
5086 xmm6 = xmm6 - a3 * b2;
5087 xmm7 = xmm7 - a4 * b1;
5088 xmm8 = xmm8 - a4 * b2;
5091 (~C).
store( i , j , xmm1 );
5092 (~C).
store( i , j1, xmm2 );
5093 (~C).
store( i+1UL, j , xmm3 );
5094 (~C).
store( i+1UL, j1, xmm4 );
5095 (~C).
store( i+2UL, j , xmm5 );
5096 (~C).
store( i+2UL, j1, xmm6 );
5097 (~C).
store( i+3UL, j , xmm7 );
5098 (~C).
store( i+3UL, j1, xmm8 );
// Two rows times two column positions: four accumulators.
5101 for( ; (i+2UL) <= iend; i+=2UL )
5103 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5104 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5105 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5106 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5108 IntrinsicType xmm1( (~C).
load(i ,j ) );
5109 IntrinsicType xmm2( (~C).
load(i ,j1) );
5110 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
5111 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
5113 for(
size_t k=kbegin; k<kend; ++k ) {
5114 const IntrinsicType a1(
set( A(i ,k) ) );
5115 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5116 const IntrinsicType b1( B.load(k,j ) );
5117 const IntrinsicType b2( B.load(k,j1) );
5118 xmm1 = xmm1 - a1 * b1;
5119 xmm2 = xmm2 - a1 * b2;
5120 xmm3 = xmm3 - a2 * b1;
5121 xmm4 = xmm4 - a2 * b2;
5124 (~C).
store( i , j , xmm1 );
5125 (~C).
store( i , j1, xmm2 );
5126 (~C).
store( i+1UL, j , xmm3 );
5127 (~C).
store( i+1UL, j1, xmm4 );
// Single leftover row times two column positions (guard not visible in this listing).
5132 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5133 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5134 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5135 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5137 IntrinsicType xmm1( (~C).
load(i,j ) );
5138 IntrinsicType xmm2( (~C).
load(i,j1) );
5140 for(
size_t k=kbegin; k<kend; ++k ) {
5141 const IntrinsicType a1(
set( A(i,k) ) );
5142 xmm1 = xmm1 - a1 * B.load(k,j );
5143 xmm2 = xmm2 - a1 * B.load(k,j1);
5146 (~C).
store( i, j , xmm1 );
5147 (~C).
store( i, j1, xmm2 );
// One column position per row: a single accumulator for every i in [ii,iend).
5153 for(
size_t i=ii; i<iend; ++i )
5155 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5156 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5157 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5158 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
5160 IntrinsicType xmm1( (~C).
load(i,j) );
5162 for(
size_t k=kbegin; k<kend; ++k ) {
5163 const IntrinsicType a1(
set( A(i,k) ) );
5164 xmm1 = xmm1 - a1 * B.load(k,j);
5167 (~C).
store( i, j, xmm1 );
// Vectorized kernel for the subtraction assignment of the matrix product A*B to a
// column-major dense target C (DenseMatrix<MT3,true>). The overload only participates when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is satisfied.
//
// The iteration space is tiled (128 rows x 64 columns x 128 inner steps). Inside a tile every
// micro-kernel loads the current values of C into registers, subtracts the products of a
// vector loaded from A and a broadcast element of B over the k-range, and stores the result
// back to C.
// NOTE(review): the loops over i and the definitions of j, i1, i2 and i3 are on lines that are
// not part of this excerpt.
5192 template<
typename MT3
5195 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
5196 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
5198 typedef IntrinsicTrait<ElementType> IT;
// Extents: rows of A, columns of B and the shared inner dimension (columns of A).
5200 const size_t M( A.rows() );
5201 const size_t N( B.columns() );
5202 const size_t K( A.columns() );
// Tile sizes for the three blocked loops below.
5204 const size_t iblock( 128UL );
5205 const size_t jblock( 64UL );
5206 const size_t kblock( 128UL );
5208 for(
size_t ii=0UL; ii<M; ii+=iblock )
// End of the current row tile, clipped to M.
5210 const size_t iend(
min( ii+iblock, M ) );
5212 for(
size_t jj=0UL; jj<N; jj+=jblock )
// End of the current column tile, clipped to N.
5214 const size_t jend(
min( jj+jblock, N ) );
5216 for(
size_t kk=0UL; kk<K; kk+=kblock )
// End of the current inner tile, clipped to K.
5218 const size_t ktmp(
min( kk+kblock, K ) );
// Micro-kernel: four row vectors (i, i1, i2, i3) times two columns (j, j+1).
5230 for( ; (j+2UL) <= jend; j+=2UL )
// The k-range is the current inner tile [kk,ktmp), narrowed further when A is upper/lower
// or B is lower/upper according to the IsUpper/IsLower traits.
5232 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5233 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5234 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5235 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Start from the values currently stored in C.
5237 IntrinsicType xmm1( (~C).
load(i ,j ) );
5238 IntrinsicType xmm2( (~C).
load(i1,j ) );
5239 IntrinsicType xmm3( (~C).
load(i2,j ) );
5240 IntrinsicType xmm4( (~C).
load(i3,j ) );
5241 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
5242 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
5243 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
5244 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
5246 for(
size_t k=kbegin; k<kend; ++k ) {
5247 const IntrinsicType a1( A.load(i ,k) );
5248 const IntrinsicType a2( A.load(i1,k) );
5249 const IntrinsicType a3( A.load(i2,k) );
5250 const IntrinsicType a4( A.load(i3,k) );
// Broadcast the two scalar elements of B used by this k-step.
5251 const IntrinsicType b1(
set( B(k,j ) ) );
5252 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5253 xmm1 = xmm1 - a1 * b1;
5254 xmm2 = xmm2 - a2 * b1;
5255 xmm3 = xmm3 - a3 * b1;
5256 xmm4 = xmm4 - a4 * b1;
5257 xmm5 = xmm5 - a1 * b2;
5258 xmm6 = xmm6 - a2 * b2;
5259 xmm7 = xmm7 - a3 * b2;
5260 xmm8 = xmm8 - a4 * b2;
// Write the updated micro-tile back to C.
5263 (~C).
store( i , j , xmm1 );
5264 (~C).
store( i1, j , xmm2 );
5265 (~C).
store( i2, j , xmm3 );
5266 (~C).
store( i3, j , xmm4 );
5267 (~C).
store( i , j+1UL, xmm5 );
5268 (~C).
store( i1, j+1UL, xmm6 );
5269 (~C).
store( i2, j+1UL, xmm7 );
5270 (~C).
store( i3, j+1UL, xmm8 );
// Micro-kernel: four row vectors times a single column j.
5275 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5276 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5277 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5278 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5280 IntrinsicType xmm1( (~C).
load(i ,j) );
5281 IntrinsicType xmm2( (~C).
load(i1,j) );
5282 IntrinsicType xmm3( (~C).
load(i2,j) );
5283 IntrinsicType xmm4( (~C).
load(i3,j) );
5285 for(
size_t k=kbegin; k<kend; ++k ) {
5286 const IntrinsicType b1(
set( B(k,j) ) );
5287 xmm1 = xmm1 - A.load(i ,k) * b1;
5288 xmm2 = xmm2 - A.load(i1,k) * b1;
5289 xmm3 = xmm3 - A.load(i2,k) * b1;
5290 xmm4 = xmm4 - A.load(i3,k) * b1;
5293 (~C).
store( i , j, xmm1 );
5294 (~C).
store( i1, j, xmm2 );
5295 (~C).
store( i2, j, xmm3 );
5296 (~C).
store( i3, j, xmm4 );
// Micro-kernel: two row vectors (i, i1) times four columns (j .. j+3).
5306 for( ; (j+4UL) <= jend; j+=4UL )
5308 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5309 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5310 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5311 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5313 IntrinsicType xmm1( (~C).
load(i ,j ) );
5314 IntrinsicType xmm2( (~C).
load(i1,j ) );
5315 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
5316 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
5317 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
5318 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
5319 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
5320 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
5322 for(
size_t k=kbegin; k<kend; ++k ) {
5323 const IntrinsicType a1( A.load(i ,k) );
5324 const IntrinsicType a2( A.load(i1,k) );
5325 const IntrinsicType b1(
set( B(k,j ) ) );
5326 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5327 const IntrinsicType b3(
set( B(k,j+2UL) ) );
5328 const IntrinsicType b4(
set( B(k,j+3UL) ) );
5329 xmm1 = xmm1 - a1 * b1;
5330 xmm2 = xmm2 - a2 * b1;
5331 xmm3 = xmm3 - a1 * b2;
5332 xmm4 = xmm4 - a2 * b2;
5333 xmm5 = xmm5 - a1 * b3;
5334 xmm6 = xmm6 - a2 * b3;
5335 xmm7 = xmm7 - a1 * b4;
5336 xmm8 = xmm8 - a2 * b4;
5339 (~C).
store( i , j , xmm1 );
5340 (~C).
store( i1, j , xmm2 );
5341 (~C).
store( i , j+1UL, xmm3 );
5342 (~C).
store( i1, j+1UL, xmm4 );
5343 (~C).
store( i , j+2UL, xmm5 );
5344 (~C).
store( i1, j+2UL, xmm6 );
5345 (~C).
store( i , j+3UL, xmm7 );
5346 (~C).
store( i1, j+3UL, xmm8 );
// Micro-kernel: two row vectors times two columns (j, j+1).
5349 for( ; (j+2UL) <= jend; j+=2UL )
5351 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5352 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5353 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5354 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5356 IntrinsicType xmm1( (~C).
load(i ,j ) );
5357 IntrinsicType xmm2( (~C).
load(i1,j ) );
5358 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
5359 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
5361 for(
size_t k=kbegin; k<kend; ++k ) {
5362 const IntrinsicType a1( A.load(i ,k) );
5363 const IntrinsicType a2( A.load(i1,k) );
5364 const IntrinsicType b1(
set( B(k,j ) ) );
5365 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5366 xmm1 = xmm1 - a1 * b1;
5367 xmm2 = xmm2 - a2 * b1;
5368 xmm3 = xmm3 - a1 * b2;
5369 xmm4 = xmm4 - a2 * b2;
5372 (~C).
store( i , j , xmm1 );
5373 (~C).
store( i1, j , xmm2 );
5374 (~C).
store( i , j+1UL, xmm3 );
5375 (~C).
store( i1, j+1UL, xmm4 );
// Micro-kernel: two row vectors times a single column j.
5380 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5381 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5382 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5383 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5385 IntrinsicType xmm1( (~C).
load(i ,j) );
5386 IntrinsicType xmm2( (~C).
load(i1,j) );
5388 for(
size_t k=kbegin; k<kend; ++k ) {
5389 const IntrinsicType b1(
set( B(k,j) ) );
5390 xmm1 = xmm1 - A.load(i ,k) * b1;
5391 xmm2 = xmm2 - A.load(i1,k) * b1;
5394 (~C).
store( i , j, xmm1 );
5395 (~C).
store( i1, j, xmm2 );
// Micro-kernel: a single row vector, iterating over every column of the column tile.
5401 for(
size_t j=jj; j<jend; ++j )
5403 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5404 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5405 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
5406 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5408 IntrinsicType xmm1( (~C).
load(i,j) );
5410 for(
size_t k=kbegin; k<kend; ++k ) {
5411 const IntrinsicType b1(
set( B(k,j) ) );
5412 xmm1 = xmm1 - A.load(i,k) * b1;
5415 (~C).
store( i, j, xmm1 );
// Overload of the BLAS-based subtraction assignment kernel that is selected when
// UseDefaultKernel<MT3,MT4,MT5> holds: no BLAS routine is used, the call is simply
// forwarded to selectLargeSubAssignKernel().
5439 template<
typename MT3
5442 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5> >::Type
5443 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5445 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction assignment kernel for single precision operands
// (enabled via UseSinglePrecisionKernel<MT3,MT4,MT5>).
// NOTE(review): the lines that create 'tmp' and apply it to C are not part of this excerpt.
5465 template<
typename MT3
5468 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5> >::Type
5469 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A is triangular: multiply 'tmp' by A from the left via strmm (scaling factor 1).
5471 if( IsTriangular<MT4>::value ) {
5473 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// B is triangular: multiply 'tmp' by B from the right via strmm (scaling factor 1).
5476 else if( IsTriangular<MT5>::value ) {
5478 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0F );
// General case: sgemm with the factors -1 and 1.
5482 sgemm( C, A, B, -1.0F, 1.0F );
// BLAS-based subtraction assignment kernel for double precision operands
// (enabled via UseDoublePrecisionKernel<MT3,MT4,MT5>).
// NOTE(review): the lines that create 'tmp' and apply it to C are not part of this excerpt.
5504 template<
typename MT3
5507 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5> >::Type
5508 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A is triangular: multiply 'tmp' by A from the left via dtrmm (scaling factor 1).
5510 if( IsTriangular<MT4>::value ) {
5512 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// B is triangular: multiply 'tmp' by B from the right via dtrmm (scaling factor 1).
5515 else if( IsTriangular<MT5>::value ) {
5517 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), 1.0 );
// General case: dgemm with the factors -1 and 1.
5521 dgemm( C, A, B, -1.0, 1.0 );
// BLAS-based subtraction assignment kernel for single precision complex operands
// (enabled via UseSinglePrecisionComplexKernel<MT3,MT4,MT5>).
// NOTE(review): the lines that create 'tmp' and apply it to C are not part of this excerpt.
5543 template<
typename MT3
5546 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5547 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A is triangular: multiply 'tmp' by A from the left via ctrmm (scaling factor 1+0i).
5549 if( IsTriangular<MT4>::value ) {
5551 ctrmm( tmp, A, CblasLeft,
5552 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5553 complex<float>( 1.0F, 0.0F ) );
// B is triangular: multiply 'tmp' by B from the right via ctrmm (scaling factor 1+0i).
5556 else if( IsTriangular<MT5>::value ) {
5558 ctrmm( tmp, B, CblasRight,
5559 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5560 complex<float>( 1.0F, 0.0F ) );
// General case: cgemm with the factors -1+0i and 1+0i.
5564 cgemm( C, A, B, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based subtraction assignment kernel for double precision complex operands
// (enabled via UseDoublePrecisionComplexKernel<MT3,MT4,MT5>).
// The scaling factors passed to ztrmm are built as complex<double> so that they match the
// element type of this kernel (and the factors already used for zgemm below).
// NOTE(review): the lines that create 'tmp' and apply it to C are not part of this excerpt.
5586 template<
typename MT3
5589 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
5590 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// A is triangular: multiply 'tmp' by A from the left via ztrmm (scaling factor 1+0i).
5592 if( IsTriangular<MT4>::value ) {
5594 ztrmm( tmp, A, CblasLeft,
5595 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
5596 complex<double>( 1.0, 0.0 ) );
// B is triangular: multiply 'tmp' by B from the right via ztrmm (scaling factor 1+0i).
5599 else if( IsTriangular<MT5>::value ) {
5601 ztrmm( tmp, B, CblasRight,
5602 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
5603 complex<double>( 1.0, 0.0 ) );
// General case: zgemm with the factors -1+0i and 1+0i.
5607 zgemm( C, A, B, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Friend function that only participates when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines not shown here;
// presumably this is the SMP assignment to a dense matrix - confirm against the full file.
5642 template<
typename MT
5644 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Special case: the target has no rows or no columns.
5652 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the left operand of the product has no columns (empty inner dimension).
5655 else if( rhs.lhs_.columns() == 0UL ) {
// Friend function that only participates when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines not shown here;
// presumably this is the SMP assignment to a sparse matrix - confirm against the full file.
5691 template<
typename MT
5693 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Temporary type: ResultType if SO is true, OppositeType otherwise.
5698 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// Evaluate the expression into a temporary of that type.
5710 const TmpType tmp( rhs );
// Friend function that only participates when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines not shown here;
// presumably this is the SMP addition assignment - confirm against the full file.
5732 template<
typename MT
5734 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Special case: the target is empty or the left operand of the product has no columns.
5742 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function that only participates when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines not shown here;
// presumably this is the SMP subtraction assignment - confirm against the full file.
5781 template<
typename MT
5783 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Special case: the target is empty or the left operand of the product has no columns.
5791 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Head of the DMatScalarMultExpr specialization whose matrix operand is a
// TDMatDMatMultExpr<MT1,MT2> and whose scalar type is ST. The class derives from DenseMatrix
// and, privately, from the MatScalarMultExpr and Computation tag types.
5851 template<
typename MT1
5855 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
5856 ,
private MatScalarMultExpr
5857 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
5861 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
// The left operand needs an evaluation if it is a computation or requires evaluation.
5873 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// The right operand needs an evaluation if it is a computation or requires evaluation.
5878 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time predicate: 'value' is set if either operand has to be evaluated
// (evaluateLeft or evaluateRight). The template arguments do not enter the result.
5886 template<
typename T1,
typename T2,
typename T3 >
5887 struct IsEvaluationRequired {
5888 enum { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate for the single precision kernel. All of the following must hold:
// T1 offers mutable and T2/T3 constant data access, neither T2 nor T3 is diagonal, all three
// types are vectorizable, all three element types are float, and the scalar type T4 is not
// complex.
5897 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5898 struct UseSinglePrecisionKernel {
5900 HasMutableDataAccess<T1>::value &&
5901 HasConstDataAccess<T2>::value &&
5902 HasConstDataAccess<T3>::value &&
5903 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5904 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5905 IsFloat<typename T1::ElementType>::value &&
5906 IsFloat<typename T2::ElementType>::value &&
5907 IsFloat<typename T3::ElementType>::value &&
5908 !IsComplex<T4>::value };
// Compile-time predicate for the double precision kernel. All of the following must hold:
// T1 offers mutable and T2/T3 constant data access, neither T2 nor T3 is diagonal, all three
// types are vectorizable, all three element types are double, and the scalar type T4 is not
// complex.
5917 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5918 struct UseDoublePrecisionKernel {
5920 HasMutableDataAccess<T1>::value &&
5921 HasConstDataAccess<T2>::value &&
5922 HasConstDataAccess<T3>::value &&
5923 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5924 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5925 IsDouble<typename T1::ElementType>::value &&
5926 IsDouble<typename T2::ElementType>::value &&
5927 IsDouble<typename T3::ElementType>::value &&
5928 !IsComplex<T4>::value };
// Compile-time predicate for the single precision complex kernel. All of the following must
// hold: T1 offers mutable and T2/T3 constant data access, neither T2 nor T3 is diagonal, all
// three types are vectorizable, and all three element types are exactly complex<float>.
5937 template<
typename T1,
typename T2,
typename T3 >
5938 struct UseSinglePrecisionComplexKernel {
// Element type required of all three matrix types.
5939 typedef complex<float> Type;
5941 HasMutableDataAccess<T1>::value &&
5942 HasConstDataAccess<T2>::value &&
5943 HasConstDataAccess<T3>::value &&
5944 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5945 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5946 IsSame<typename T1::ElementType,Type>::value &&
5947 IsSame<typename T2::ElementType,Type>::value &&
5948 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the double precision complex kernel. All of the following must
// hold: T1 offers mutable and T2/T3 constant data access, neither T2 nor T3 is diagonal, all
// three types are vectorizable, and all three element types are exactly complex<double>.
5957 template<
typename T1,
typename T2,
typename T3 >
5958 struct UseDoublePrecisionComplexKernel {
// Element type required of all three matrix types.
5959 typedef complex<double> Type;
5961 HasMutableDataAccess<T1>::value &&
5962 HasConstDataAccess<T2>::value &&
5963 HasConstDataAccess<T3>::value &&
5964 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5965 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5966 IsSame<typename T1::ElementType,Type>::value &&
5967 IsSame<typename T2::ElementType,Type>::value &&
5968 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the default (non-BLAS) kernel: 'value' is set if
// BLAZE_BLAS_MODE is disabled or if none of the four precision-specific kernel predicates
// (single, double, single complex, double complex) applies.
5976 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5977 struct UseDefaultKernel {
5978 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
5979 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
5980 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
5981 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate for the vectorized default kernel. It is rejected if both operands
// are diagonal, if T2 is diagonal and the target T1 is column-major, or if T3 is diagonal and
// the target T1 is row-major. Otherwise all three types must be vectorizable, the element
// types of T1, T2, T3 and the scalar type T4 must be identical, and IntrinsicTrait must
// report addition, subtraction and multiplication for that element type.
5989 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5990 struct UseVectorizedDefaultKernel {
5991 enum { value = !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
5992 !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
5993 !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
5994 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
5995 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
5996 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
5997 IsSame<typename T1::ElementType,T4>::value &&
5998 IntrinsicTrait<typename T1::ElementType>::addition &&
5999 IntrinsicTrait<typename T1::ElementType>::subtraction &&
6000 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type definitions and compile-time flags of the expression.
// NOTE(review): RES, RT1, RT2, CT1, CT2, ET1, ET2 and ElementType are declared on lines that
// are not part of this excerpt.
// Type of this expression instance.
6006 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// Result type, obtained from MultTrait for RES and the scalar type ST.
6007 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
6011 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the matrix operand: the wrapped matrix/matrix multiplication expression.
6016 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Type used for the left matrix operand: const RT1 if it must be evaluated, CT1 otherwise.
6022 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
// Type used for the right matrix operand: const RT2 if it must be evaluated, CT2 otherwise.
6025 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: not both operands diagonal, both operands vectorizable, identical
// element types (including the scalar type), and intrinsic addition/multiplication available.
6030 enum { vectorizable = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
6031 MT1::vectorizable && MT2::vectorizable &&
6032 IsSame<ET1,ET2>::value &&
6033 IsSame<ET1,ST>::value &&
6034 IntrinsicTrait<ET1>::addition &&
6035 IntrinsicTrait<ET1>::multiplication };
// SMP flag: neither operand requires evaluation and both operand types are SMP-assignable.
6038 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
6039 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix multiplication expression and the scalar factor.
// NOTE(review): the member initializer list and body are not part of this excerpt.
6048 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns element (i,j) of the wrapped matrix expression multiplied by the
// scalar.
6061 inline ResultType
operator()(
size_t i,
size_t j )
const {
6064 return matrix_(i,j) * scalar_;
// Returns the number of rows of the wrapped matrix expression.
6073 inline size_t rows()
const {
6074 return matrix_.rows();
// Returns the number of columns of the wrapped matrix expression.
6083 inline size_t columns()
const {
6084 return matrix_.columns();
// Forwards the aliasing query for the given address to the wrapped matrix expression.
6114 template<
typename T >
6115 inline bool canAlias(
const T* alias )
const {
6116 return matrix_.canAlias( alias );
// Forwards the is-aliased query for the given address to the wrapped matrix expression.
6126 template<
typename T >
6127 inline bool isAliased(
const T* alias )
const {
6128 return matrix_.isAliased( alias );
// NOTE(review): the next two statements belong to member functions whose signatures are not
// part of this excerpt.
// Alignment query, forwarded to the wrapped matrix expression.
6138 return matrix_.isAligned();
// Local handle on the right operand of the wrapped matrix multiplication.
6148 typename MMM::RightOperand B( matrix_.rightOperand() );
// Data members: the wrapped matrix expression and the scalar factor.
6157 LeftOperand matrix_;
6158 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix. After handling the empty
// cases, the work is delegated to selectAssignKernel() together with the scalar.
// NOTE(review): the creation of the evaluated operands A and B is on lines not shown here.
6173 template<
typename MT
6175 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Operands of the wrapped matrix multiplication.
6182 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6183 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns.
6185 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the left operand has no columns (empty inner dimension).
6188 else if( left.columns() == 0UL ) {
6203 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses between the small-matrix kernel and the BLAS-based kernel for the assignment.
// NOTE(review): only the first part of the condition (both operands diagonal) is visible
// here; the remaining part of the condition and the else-branch header are on lines not shown.
6218 template<
typename MT3
6222 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6224 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
6226 selectSmallAssignKernel( C, A, B, scalar );
6228 selectBlasAssignKernel( C, A, B, scalar );
// Element-wise default kernel for the assignment of the scaled product to a row-major
// target, enabled when neither A nor B is diagonal. C is processed row by row: the products
// A(i,k)*B(k,j) are written and accumulated first, and the row is multiplied by 'scalar' in
// a final pass.
// NOTE(review): the alternative branches of the index expressions and the bodies of the
// reset loops are on lines not shown in this excerpt.
6246 template<
typename MT3
6250 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6251 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6253 const size_t M( A.rows() );
6254 const size_t N( B.columns() );
6255 const size_t K( A.columns() );
6257 for(
size_t i=0UL; i<M; ++i )
// k-range for row i, derived from the (strictly) upper/lower traits of A.
6259 const size_t kbegin( ( IsUpper<MT4>::value )
6260 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6262 const size_t kend( ( IsLower<MT4>::value )
6263 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k-range: handled by a loop over the whole row.
6267 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
6268 for(
size_t j=0UL; j<N; ++j ) {
// j-range for the first k-step (k == kbegin), derived from the traits of B.
6275 const size_t jbegin( ( IsUpper<MT5>::value )
6276 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
6278 const size_t jend( ( IsLower<MT5>::value )
6279 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
6283 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6284 for(
size_t j=0UL; j<jbegin; ++j ) {
6288 else if( IsStrictlyUpper<MT5>::value ) {
6289 reset( (~C)(i,0UL) );
// First k-step: plain assignment of the product.
6291 for(
size_t j=jbegin; j<jend; ++j ) {
6292 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6294 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6295 for(
size_t j=jend; j<N; ++j ) {
6299 else if( IsStrictlyLower<MT5>::value ) {
6300 reset( (~C)(i,N-1UL) );
// Remaining k-steps: accumulate into the row.
6304 for(
size_t k=kbegin+1UL; k<kend; ++k )
6306 const size_t jbegin( ( IsUpper<MT5>::value )
6307 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
6309 const size_t jend( ( IsLower<MT5>::value )
6310 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
6314 for(
size_t j=jbegin; j<jend; ++j ) {
6315 (~C)(i,j) += A(i,k) * B(k,j);
// For lower B the element at column jend is assigned instead of accumulated.
6317 if( IsLower<MT5>::value ) {
6318 (~C)(i,jend) = A(i,k) * B(k,jend);
// Final pass: scale the computed part of row i.
6323 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6324 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
6326 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
6327 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
6331 for(
size_t j=jbegin; j<jend; ++j ) {
6332 (~C)(i,j) *= scalar;
// Element-wise default kernel for the assignment of the scaled product to a column-major
// target, enabled when neither A nor B is diagonal. C is processed column by column: the
// products A(i,k)*B(k,j) are written and accumulated first, and the column is multiplied by
// 'scalar' in a final pass.
// NOTE(review): the alternative branches of the index expressions and the bodies of the
// reset loops are on lines not shown in this excerpt.
6353 template<
typename MT3
6357 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6358 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6360 const size_t M( A.rows() );
6361 const size_t N( B.columns() );
6362 const size_t K( A.columns() );
6364 for(
size_t j=0UL; j<N; ++j )
// k-range for column j, derived from the (strictly) lower/upper traits of B.
6366 const size_t kbegin( ( IsLower<MT5>::value )
6367 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6369 const size_t kend( ( IsUpper<MT5>::value )
6370 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Strictly triangular B with an empty k-range: handled by a loop over the whole column.
6374 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6375 for(
size_t i=0UL; i<M; ++i ) {
// i-range for the first k-step (k == kbegin), derived from the traits of A.
6382 const size_t ibegin( ( IsLower<MT4>::value )
6383 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
6385 const size_t iend( ( IsUpper<MT4>::value )
6386 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
6390 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6391 for(
size_t i=0UL; i<ibegin; ++i ) {
6395 else if( IsStrictlyLower<MT4>::value ) {
6396 reset( (~C)(0UL,j) );
// First k-step: plain assignment of the product.
6398 for(
size_t i=ibegin; i<iend; ++i ) {
6399 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6401 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6402 for(
size_t i=iend; i<M; ++i ) {
6406 else if( IsStrictlyUpper<MT4>::value ) {
6407 reset( (~C)(M-1UL,j) );
// Remaining k-steps: accumulate into the column.
6411 for(
size_t k=kbegin+1UL; k<kend; ++k )
6413 const size_t ibegin( ( IsLower<MT4>::value )
6414 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
6416 const size_t iend( ( IsUpper<MT4>::value )
6417 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
6421 for(
size_t i=ibegin; i<iend; ++i ) {
6422 (~C)(i,j) += A(i,k) * B(k,j);
// For upper A the element at row iend is assigned instead of accumulated.
6424 if( IsUpper<MT4>::value ) {
6425 (~C)(iend,j) = A(iend,k) * B(k,j);
// Final pass: scale the computed part of column j.
6430 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
6431 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
6433 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6434 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
6438 for(
size_t i=ibegin; i<iend; ++i ) {
6439 (~C)(i,j) *= scalar;
// Default kernel for a row-major target when A is not diagonal but B is diagonal. Only the
// diagonal of B is read: C(i,j) = A(i,j) * B(j,j) * scalar. The traversal is blocked in
// 16 x 16 tiles.
// NOTE(review): the alternative branches of the index expressions and the bodies of the two
// loops before and after the main loop are on lines not shown in this excerpt.
6460 template<
typename MT3
6464 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6465 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6467 const size_t M( A.rows() );
6468 const size_t N( B.columns() );
// Tile edge length for the blocked traversal.
6470 const size_t block( 16UL );
6472 for(
size_t ii=0UL; ii<M; ii+=block ) {
6473 const size_t iend(
min( M, ii+block ) );
6474 for(
size_t jj=0UL; jj<N; jj+=block ) {
6475 const size_t jend(
min( N, jj+block ) );
6476 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the tile, derived from the upper/lower traits of A.
6478 const size_t jbegin( ( IsUpper<MT4>::value )
6479 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6481 const size_t jpos( ( IsLower<MT4>::value )
6482 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
6485 if( IsUpper<MT4>::value ) {
6486 for(
size_t j=jj; j<jbegin; ++j ) {
6490 for(
size_t j=jbegin; j<jpos; ++j ) {
6491 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6493 if( IsLower<MT4>::value ) {
6494 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for a column-major target when A is not diagonal but B is diagonal. Only
// the diagonal of B is read: C(i,j) = A(i,j) * B(j,j) * scalar, column by column.
// NOTE(review): the alternative branches of the index expressions and the bodies of the two
// loops before and after the main loop are on lines not shown in this excerpt.
6518 template<
typename MT3
6522 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6523 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6525 const size_t M( A.rows() );
6526 const size_t N( B.columns() );
6528 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, derived from the lower/upper traits of A.
6530 const size_t ibegin( ( IsLower<MT4>::value )
6531 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6533 const size_t iend( ( IsUpper<MT4>::value )
6534 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
6538 if( IsLower<MT4>::value ) {
6539 for(
size_t i=0UL; i<ibegin; ++i ) {
6543 for(
size_t i=ibegin; i<iend; ++i ) {
6544 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
6546 if( IsUpper<MT4>::value ) {
6547 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for a row-major target when A is diagonal but B is not. Only the diagonal
// of A is read: C(i,j) = A(i,i) * B(i,j) * scalar, row by row.
// NOTE(review): the alternative branches of the index expressions and the bodies of the two
// loops before and after the main loop are on lines not shown in this excerpt.
6569 template<
typename MT3
6573 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6574 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6576 const size_t M( A.rows() );
6577 const size_t N( B.columns() );
6579 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the upper/lower traits of B.
6581 const size_t jbegin( ( IsUpper<MT5>::value )
6582 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6584 const size_t jend( ( IsLower<MT5>::value )
6585 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
6589 if( IsUpper<MT5>::value ) {
6590 for(
size_t j=0UL; j<jbegin; ++j ) {
6594 for(
size_t j=jbegin; j<jend; ++j ) {
6595 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6597 if( IsLower<MT5>::value ) {
6598 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for a column-major target when A is diagonal but B is not. Only the
// diagonal of A is read: C(i,j) = A(i,i) * B(i,j) * scalar. The traversal is blocked in
// 16 x 16 tiles.
// NOTE(review): the alternative branches of the index expressions and the bodies of the two
// loops before and after the main loop are on lines not shown in this excerpt.
6620 template<
typename MT3
6624 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6625 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6627 const size_t M( A.rows() );
6628 const size_t N( B.columns() );
// Tile edge length for the blocked traversal.
6630 const size_t block( 16UL );
6632 for(
size_t jj=0UL; jj<N; jj+=block ) {
6633 const size_t jend(
min( N, jj+block ) );
6634 for(
size_t ii=0UL; ii<M; ii+=block ) {
6635 const size_t iend(
min( M, ii+block ) );
6636 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the tile, derived from the lower/upper traits of B.
6638 const size_t ibegin( ( IsLower<MT5>::value )
6639 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6641 const size_t ipos( ( IsUpper<MT5>::value )
6642 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
6645 if( IsLower<MT5>::value ) {
6646 for(
size_t i=ii; i<ibegin; ++i ) {
6650 for(
size_t i=ibegin; i<ipos; ++i ) {
6651 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
6653 if( IsUpper<MT5>::value ) {
6654 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for two diagonal operands: only the diagonal of C is written,
// C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): any statement between the signature and the loop is not part of this excerpt.
6678 template<
typename MT3
6682 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6683 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6687 for(
size_t i=0UL; i<A.rows(); ++i ) {
6688 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does
// NOT hold (DisableIf): the call is forwarded to selectDefaultAssignKernel().
6707 template<
typename MT3
6711 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6712 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6714 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for the assignment of the scaled product to a row-major
// target (DenseMatrix<MT3,false>), enabled via UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>.
//
// Each micro-kernel accumulates the products of a broadcast element of A and vectors loaded
// from B in register accumulators, multiplies the accumulators by the broadcast scalar
// ('factor') and stores them to C.
// NOTE(review): the loops over j, the declaration of i and most of the store statements are
// on lines that are not part of this excerpt; the accumulators are default-constructed
// (presumably zero-initialized - confirm against the IntrinsicType definition).
6733 template<
typename MT3
6737 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6738 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6740 typedef IntrinsicTrait<ElementType> IT;
// Extents: rows of A, columns of B and the shared inner dimension (columns of A).
6742 const size_t M( A.rows() );
6743 const size_t N( B.columns() );
6744 const size_t K( A.columns() );
// Scalar broadcast into an intrinsic vector.
6746 const IntrinsicType factor(
set( scalar ) );
// Micro-kernel: one row times eight vectors of B (columns j .. j+IT::size*8).
6751 for(
size_t i=0UL; i<M; ++i )
// k-range, narrowed according to the upper/lower traits of A and the lower/upper traits of B.
6753 const size_t kbegin( ( IsUpper<MT4>::value )
6754 ?( ( IsLower<MT5>::value )
6755 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6756 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6757 :( IsLower<MT5>::value ? j : 0UL ) );
6758 const size_t kend( ( IsLower<MT4>::value )
6759 ?( ( IsUpper<MT5>::value )
6760 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
6761 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6762 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
6764 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6766 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast A(i,k) and combine it with eight consecutive vectors of row k of B.
6767 const IntrinsicType a1(
set( A(i,k) ) );
6768 xmm1 = xmm1 + a1 * B.load(k,j );
6769 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
6770 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
6771 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
6772 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
6773 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
6774 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
6775 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Scale and store.
6778 (~C).
store( i, j , xmm1 * factor );
// Micro-kernel: two rows (i, i+1) times four vectors of B.
6793 for( ; (i+2UL) <= M; i+=2UL )
6795 const size_t kbegin( ( IsUpper<MT4>::value )
6796 ?( ( IsLower<MT5>::value )
6797 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6798 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6799 :( IsLower<MT5>::value ? j : 0UL ) );
6800 const size_t kend( ( IsLower<MT4>::value )
6801 ?( ( IsUpper<MT5>::value )
6802 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
6803 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6804 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
6806 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6808 for(
size_t k=kbegin; k<kend; ++k ) {
6809 const IntrinsicType a1(
set( A(i ,k) ) );
6810 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6811 const IntrinsicType b1( B.load(k,j ) );
6812 const IntrinsicType b2( B.load(k,j+
IT::size ) );
6813 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
6814 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
6815 xmm1 = xmm1 + a1 * b1;
6816 xmm2 = xmm2 + a1 * b2;
6817 xmm3 = xmm3 + a1 * b3;
6818 xmm4 = xmm4 + a1 * b4;
6819 xmm5 = xmm5 + a2 * b1;
6820 xmm6 = xmm6 + a2 * b2;
6821 xmm7 = xmm7 + a2 * b3;
6822 xmm8 = xmm8 + a2 * b4;
6825 (~C).
store( i , j , xmm1 * factor );
6829 (~C).
store( i+1UL, j , xmm5 * factor );
// Micro-kernel: a single row times four vectors of B.
6837 const size_t kbegin( ( IsUpper<MT4>::value )
6838 ?( ( IsLower<MT5>::value )
6839 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6840 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6841 :( IsLower<MT5>::value ? j : 0UL ) );
6842 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
6844 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6846 for(
size_t k=kbegin; k<kend; ++k ) {
6847 const IntrinsicType a1(
set( A(i,k) ) );
6848 xmm1 = xmm1 + a1 * B.load(k,j );
6849 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
6850 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
6851 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
6854 (~C).
store( i, j , xmm1 * factor );
// Micro-kernel: two rows times two vectors of B.
6865 for( ; (i+2UL) <= M; i+=2UL )
6867 const size_t kbegin( ( IsUpper<MT4>::value )
6868 ?( ( IsLower<MT5>::value )
6869 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6870 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6871 :( IsLower<MT5>::value ? j : 0UL ) );
6872 const size_t kend( ( IsLower<MT4>::value )
6873 ?( ( IsUpper<MT5>::value )
6874 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
6875 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6876 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
6878 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6880 for(
size_t k=kbegin; k<kend; ++k ) {
6881 const IntrinsicType a1(
set( A(i ,k) ) );
6882 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6883 const IntrinsicType b1( B.load(k,j ) );
6884 const IntrinsicType b2( B.load(k,j+
IT::size) );
6885 xmm1 = xmm1 + a1 * b1;
6886 xmm2 = xmm2 + a1 * b2;
6887 xmm3 = xmm3 + a2 * b1;
6888 xmm4 = xmm4 + a2 * b2;
6891 (~C).
store( i , j , xmm1 * factor );
6893 (~C).
store( i+1UL, j , xmm3 * factor );
// Micro-kernel: a single row times two vectors of B.
6899 const size_t kbegin( ( IsUpper<MT4>::value )
6900 ?( ( IsLower<MT5>::value )
6901 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6902 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6903 :( IsLower<MT5>::value ? j : 0UL ) );
6904 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
6906 IntrinsicType xmm1, xmm2;
6908 for(
size_t k=kbegin; k<kend; ++k ) {
6909 const IntrinsicType a1(
set( A(i,k) ) );
6910 xmm1 = xmm1 + a1 * B.load(k,j );
6911 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
6914 (~C).
store( i, j , xmm1 * factor );
// Micro-kernel: two rows times a single vector of B.
6923 for( ; (i+2UL) <= M; i+=2UL )
6925 const size_t kbegin( ( IsUpper<MT4>::value )
6926 ?( ( IsLower<MT5>::value )
6927 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6928 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6929 :( IsLower<MT5>::value ? j : 0UL ) );
6930 const size_t kend( ( IsLower<MT4>::value )
6931 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6934 IntrinsicType xmm1, xmm2;
6936 for(
size_t k=kbegin; k<kend; ++k ) {
6937 const IntrinsicType b1( B.load(k,j) );
6938 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
6939 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
6942 (~C).
store( i , j, xmm1 * factor );
6943 (~C).
store( i+1UL, j, xmm2 * factor );
// Micro-kernel: a single row times a single vector of B; k runs up to K.
6948 const size_t kbegin( ( IsUpper<MT4>::value )
6949 ?( ( IsLower<MT5>::value )
6950 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6951 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6952 :( IsLower<MT5>::value ? j : 0UL ) );
6956 for(
size_t k=kbegin; k<K; ++k ) {
6957 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
6960 (~C).
store( i, j, xmm1 * factor );
// Vectorized "small" assignment kernel for a target of type DenseMatrix<MT3,true>.
// Selected only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//   C      : target matrix, written through (~C).store()
//   A, B   : left and right operands of the product
//   scalar : scaling value; it is expanded into the intrinsic 'factor' and every
//            stored accumulator is multiplied by it
// Products are accumulated in IntrinsicType registers: packed values A.load(i+n*IT::size,k)
// are multiplied by set( B(k,j) ) (presumably a broadcast of the scalar element - confirm).
// The k range [kbegin,kend) of every section depends on the IsLower/IsUpper/IsStrictly*
// traits of MT4 and MT5 (presumably to skip structurally zero elements - confirm).
// NOTE(review): only part of the body is visible in this listing (the loops over i, most
// stores and all braces are not shown); the comments describe the visible statements only.
6981 template<
typename MT3
6985 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6986 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6988 typedef IntrinsicTrait<ElementType> IT;
6990 const size_t M( A.rows() );
6991 const size_t N( B.columns() );
6992 const size_t K( A.columns() );
6994 const IntrinsicType factor(
set( scalar ) );
// Section: one column j at a time, eight packed row groups (i .. i+IT::size*7UL).
6999 for(
size_t j=0UL; j<N; ++j )
7001 const size_t kbegin( ( IsLower<MT5>::value )
7002 ?( ( IsUpper<MT4>::value )
7003 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7004 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7005 :( IsUpper<MT4>::value ? i : 0UL ) );
7006 const size_t kend( ( IsUpper<MT5>::value )
7007 ?( ( IsLower<MT4>::value )
7008 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
7009 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
7010 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Accumulators are default-constructed (presumably zero-initialized - confirm).
7012 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7014 for(
size_t k=kbegin; k<kend; ++k ) {
7015 const IntrinsicType b1(
set( B(k,j) ) );
7016 xmm1 = xmm1 + A.load(i ,k) * b1;
7017 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
7018 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
7019 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
7020 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
7021 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
7022 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
7023 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
7026 (~C).
store( i , j, xmm1 * factor );
// Section: two columns (j, j+1) at a time, four packed row groups.
7041 for( ; (j+2UL) <= N; j+=2UL )
7043 const size_t kbegin( ( IsLower<MT5>::value )
7044 ?( ( IsUpper<MT4>::value )
7045 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7046 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7047 :( IsUpper<MT4>::value ? i : 0UL ) );
7048 const size_t kend( ( IsUpper<MT5>::value )
7049 ?( ( IsLower<MT4>::value )
7050 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7051 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7052 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
7054 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7056 for(
size_t k=kbegin; k<kend; ++k ) {
7057 const IntrinsicType a1( A.load(i ,k) );
7058 const IntrinsicType a2( A.load(i+
IT::size ,k) );
7059 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
7060 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
7061 const IntrinsicType b1(
set( B(k,j ) ) );
7062 const IntrinsicType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
7063 xmm1 = xmm1 + a1 * b1;
7064 xmm2 = xmm2 + a2 * b1;
7065 xmm3 = xmm3 + a3 * b1;
7066 xmm4 = xmm4 + a4 * b1;
7067 xmm5 = xmm5 + a1 * b2;
7068 xmm6 = xmm6 + a2 * b2;
7069 xmm7 = xmm7 + a3 * b2;
7070 xmm8 = xmm8 + a4 * b2;
7073 (~C).
store( i , j , xmm1 * factor );
7077 (~C).
store( i , j+1UL, xmm5 * factor );
// Section: single remaining column, four packed row groups.
7085 const size_t kbegin( ( IsLower<MT5>::value )
7086 ?( ( IsUpper<MT4>::value )
7087 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7088 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7089 :( IsUpper<MT4>::value ? i : 0UL ) );
7090 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
7092 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7094 for(
size_t k=kbegin; k<kend; ++k ) {
7095 const IntrinsicType b1(
set( B(k,j) ) );
7096 xmm1 = xmm1 + A.load(i ,k) * b1;
7097 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
7098 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
7099 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
7102 (~C).
store( i , j, xmm1 * factor );
// Section: two columns at a time, two packed row groups.
7113 for( ; (j+2UL) <= N; j+=2UL )
7115 const size_t kbegin( ( IsLower<MT5>::value )
7116 ?( ( IsUpper<MT4>::value )
7117 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7118 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7119 :( IsUpper<MT4>::value ? i : 0UL ) );
7120 const size_t kend( ( IsUpper<MT5>::value )
7121 ?( ( IsLower<MT4>::value )
7122 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7123 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7124 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
7126 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7128 for(
size_t k=kbegin; k<kend; ++k ) {
7129 const IntrinsicType a1( A.load(i ,k) );
7130 const IntrinsicType a2( A.load(i+
IT::size,k) );
7131 const IntrinsicType b1(
set( B(k,j ) ) );
7132 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7133 xmm1 = xmm1 + a1 * b1;
7134 xmm2 = xmm2 + a2 * b1;
7135 xmm3 = xmm3 + a1 * b2;
7136 xmm4 = xmm4 + a2 * b2;
7139 (~C).
store( i , j , xmm1 * factor );
7141 (~C).
store( i , j+1UL, xmm3 * factor );
// Section: single remaining column, two packed row groups.
7147 const size_t kbegin( ( IsLower<MT5>::value )
7148 ?( ( IsUpper<MT4>::value )
7149 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7150 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7151 :( IsUpper<MT4>::value ? i : 0UL ) );
7152 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
7154 IntrinsicType xmm1, xmm2;
7156 for(
size_t k=kbegin; k<kend; ++k ) {
7157 const IntrinsicType b1(
set( B(k,j) ) );
7158 xmm1 = xmm1 + A.load(i ,k) * b1;
7159 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
7162 (~C).
store( i , j, xmm1 * factor );
// Section: two columns at a time, one packed row group.
7171 for( ; (j+2UL) <= N; j+=2UL )
7173 const size_t kbegin( ( IsLower<MT5>::value )
7174 ?( ( IsUpper<MT4>::value )
7175 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7176 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7177 :( IsUpper<MT4>::value ? i : 0UL ) );
7178 const size_t kend( ( IsUpper<MT5>::value )
7179 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7182 IntrinsicType xmm1, xmm2;
7184 for(
size_t k=kbegin; k<kend; ++k ) {
7185 const IntrinsicType a1( A.load(i,k) );
7186 xmm1 = xmm1 + a1 *
set( B(k,j ) );
7187 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
7190 (~C).
store( i, j , xmm1 * factor );
7191 (~C).
store( i, j+1UL, xmm2 * factor );
// Section: single remaining column, one packed row group; k runs up to K here.
7196 const size_t kbegin( ( IsLower<MT5>::value )
7197 ?( ( IsUpper<MT4>::value )
7198 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7199 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7200 :( IsUpper<MT4>::value ? i : 0UL ) );
7204 for(
size_t k=kbegin; k<K; ++k ) {
7205 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
7208 (~C).
store( i, j, xmm1 * factor );
// Fallback "large" assignment kernel: selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// is NOT satisfied (DisableIf). It simply forwards C, A, B and scalar unchanged to
// selectDefaultAssignKernel().
7228 template<
typename MT3
7232 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7233 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7235 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized, blocked "large" assignment kernel for a target of type DenseMatrix<MT3,false>.
// Selected only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//   C      : target matrix, read via (~C).load() and written via (~C).store()
//   A, B   : left and right operands of the product
//   scalar : scaling value, expanded into the intrinsic 'factor'
// The iteration space is tiled with iblock=64 rows, jblock=128 columns and kblock=128
// inner-dimension steps. Inside a tile, set( A(i,k) ) (presumably a broadcast - confirm)
// is multiplied with packed values B.load(k,j..j3) and accumulated into registers.
// j1, j2, j3 are defined on lines not visible here (presumably j + n*IT::size - confirm).
// NOTE(review): every visible section seeds its accumulators from (~C).load() and stores
// them back as xmm * factor. If these sections execute once per kk block (the kk loop header
// is at 7287; braces are not visible in this listing), values accumulated during earlier
// k-blocks get multiplied by 'factor' again on each later block - verify the result for
// K > kblock with scalar != 1.
7254 template<
typename MT3
7258 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7259 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7261 typedef IntrinsicTrait<ElementType> IT;
7263 const size_t M( A.rows() );
7264 const size_t N( B.columns() );
7265 const size_t K( A.columns() );
// Tile extents for the i, j and k dimensions.
7267 const size_t iblock( 64UL );
7268 const size_t jblock( 128UL );
7269 const size_t kblock( 128UL );
7271 const IntrinsicType factor(
set( scalar ) );
7273 for(
size_t jj=0UL; jj<N; jj+=jblock )
7275 const size_t jend(
min( jj+jblock, N ) );
7277 for(
size_t ii=0UL; ii<M; ii+=iblock )
7279 const size_t iend(
min( ii+iblock, M ) );
// Pass over every element of the current tile; the loop body is not visible here
// (presumably it resets C(i,j) before accumulation - TODO confirm).
7281 for(
size_t i=ii; i<iend; ++i ) {
7282 for(
size_t j=jj; j<jend; ++j ) {
7287 for(
size_t kk=0UL; kk<K; kk+=kblock )
7289 const size_t ktmp(
min( kk+kblock, K ) );
// Section: two rows (i, i+1) x four packed column groups (j, j1, j2, j3).
7301 for( ; (i+2UL) <= iend; i+=2UL )
// k range: clipped to the current k-block [kk,ktmp) and further narrowed by the
// IsUpper/IsLower traits of the operands.
7303 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7304 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7305 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7306 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
7308 IntrinsicType xmm1( (~C).
load(i ,j ) );
7309 IntrinsicType xmm2( (~C).
load(i ,j1) );
7310 IntrinsicType xmm3( (~C).
load(i ,j2) );
7311 IntrinsicType xmm4( (~C).
load(i ,j3) );
7312 IntrinsicType xmm5( (~C).
load(i+1UL,j ) );
7313 IntrinsicType xmm6( (~C).
load(i+1UL,j1) );
7314 IntrinsicType xmm7( (~C).
load(i+1UL,j2) );
7315 IntrinsicType xmm8( (~C).
load(i+1UL,j3) );
7317 for(
size_t k=kbegin; k<kend; ++k ) {
7318 const IntrinsicType a1(
set( A(i ,k) ) );
7319 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7320 const IntrinsicType b1( B.load(k,j ) );
7321 const IntrinsicType b2( B.load(k,j1) );
7322 const IntrinsicType b3( B.load(k,j2) );
7323 const IntrinsicType b4( B.load(k,j3) );
7324 xmm1 = xmm1 + a1 * b1;
7325 xmm2 = xmm2 + a1 * b2;
7326 xmm3 = xmm3 + a1 * b3;
7327 xmm4 = xmm4 + a1 * b4;
7328 xmm5 = xmm5 + a2 * b1;
7329 xmm6 = xmm6 + a2 * b2;
7330 xmm7 = xmm7 + a2 * b3;
7331 xmm8 = xmm8 + a2 * b4;
7334 (~C).
store( i , j , xmm1 * factor );
7335 (~C).
store( i , j1, xmm2 * factor );
7336 (~C).
store( i , j2, xmm3 * factor );
7337 (~C).
store( i , j3, xmm4 * factor );
7338 (~C).
store( i+1UL, j , xmm5 * factor );
7339 (~C).
store( i+1UL, j1, xmm6 * factor );
7340 (~C).
store( i+1UL, j2, xmm7 * factor );
7341 (~C).
store( i+1UL, j3, xmm8 * factor );
// Section: single remaining row x four packed column groups.
7346 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7347 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7348 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7349 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
7351 IntrinsicType xmm1( (~C).
load(i,j ) );
7352 IntrinsicType xmm2( (~C).
load(i,j1) );
7353 IntrinsicType xmm3( (~C).
load(i,j2) );
7354 IntrinsicType xmm4( (~C).
load(i,j3) );
7356 for(
size_t k=kbegin; k<kend; ++k ) {
7357 const IntrinsicType a1(
set( A(i,k) ) );
7358 xmm1 = xmm1 + a1 * B.load(k,j );
7359 xmm2 = xmm2 + a1 * B.load(k,j1);
7360 xmm3 = xmm3 + a1 * B.load(k,j2);
7361 xmm4 = xmm4 + a1 * B.load(k,j3);
7364 (~C).
store( i, j , xmm1 * factor );
7365 (~C).
store( i, j1, xmm2 * factor );
7366 (~C).
store( i, j2, xmm3 * factor );
7367 (~C).
store( i, j3, xmm4 * factor );
// Section: four rows (i .. i+3) x two packed column groups (j, j1).
7377 for( ; (i+4UL) <= iend; i+=4UL )
7379 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7380 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7381 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7382 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7384 IntrinsicType xmm1( (~C).
load(i ,j ) );
7385 IntrinsicType xmm2( (~C).
load(i ,j1) );
7386 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
7387 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
7388 IntrinsicType xmm5( (~C).
load(i+2UL,j ) );
7389 IntrinsicType xmm6( (~C).
load(i+2UL,j1) );
7390 IntrinsicType xmm7( (~C).
load(i+3UL,j ) );
7391 IntrinsicType xmm8( (~C).
load(i+3UL,j1) );
7393 for(
size_t k=kbegin; k<kend; ++k ) {
7394 const IntrinsicType a1(
set( A(i ,k) ) );
7395 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7396 const IntrinsicType a3(
set( A(i+2UL,k) ) );
7397 const IntrinsicType a4(
set( A(i+3UL,k) ) );
7398 const IntrinsicType b1( B.load(k,j ) );
7399 const IntrinsicType b2( B.load(k,j1) );
7400 xmm1 = xmm1 + a1 * b1;
7401 xmm2 = xmm2 + a1 * b2;
7402 xmm3 = xmm3 + a2 * b1;
7403 xmm4 = xmm4 + a2 * b2;
7404 xmm5 = xmm5 + a3 * b1;
7405 xmm6 = xmm6 + a3 * b2;
7406 xmm7 = xmm7 + a4 * b1;
7407 xmm8 = xmm8 + a4 * b2;
7410 (~C).
store( i , j , xmm1 * factor );
7411 (~C).
store( i , j1, xmm2 * factor );
7412 (~C).
store( i+1UL, j , xmm3 * factor );
7413 (~C).
store( i+1UL, j1, xmm4 * factor );
7414 (~C).
store( i+2UL, j , xmm5 * factor );
7415 (~C).
store( i+2UL, j1, xmm6 * factor );
7416 (~C).
store( i+3UL, j , xmm7 * factor );
7417 (~C).
store( i+3UL, j1, xmm8 * factor );
// Section: two rows x two packed column groups.
7420 for( ; (i+2UL) <= iend; i+=2UL )
7422 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7423 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7424 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7425 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7427 IntrinsicType xmm1( (~C).
load(i ,j ) );
7428 IntrinsicType xmm2( (~C).
load(i ,j1) );
7429 IntrinsicType xmm3( (~C).
load(i+1UL,j ) );
7430 IntrinsicType xmm4( (~C).
load(i+1UL,j1) );
7432 for(
size_t k=kbegin; k<kend; ++k ) {
7433 const IntrinsicType a1(
set( A(i ,k) ) );
7434 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7435 const IntrinsicType b1( B.load(k,j ) );
7436 const IntrinsicType b2( B.load(k,j1) );
7437 xmm1 = xmm1 + a1 * b1;
7438 xmm2 = xmm2 + a1 * b2;
7439 xmm3 = xmm3 + a2 * b1;
7440 xmm4 = xmm4 + a2 * b2;
7443 (~C).
store( i , j , xmm1 * factor );
7444 (~C).
store( i , j1, xmm2 * factor );
7445 (~C).
store( i+1UL, j , xmm3 * factor );
7446 (~C).
store( i+1UL, j1, xmm4 * factor );
// Section: single remaining row x two packed column groups.
7451 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7452 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7453 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7454 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7456 IntrinsicType xmm1( (~C).
load(i,j ) );
7457 IntrinsicType xmm2( (~C).
load(i,j1) );
7459 for(
size_t k=kbegin; k<kend; ++k ) {
7460 const IntrinsicType a1(
set( A(i,k) ) );
7461 xmm1 = xmm1 + a1 * B.load(k,j );
7462 xmm2 = xmm2 + a1 * B.load(k,j1);
7465 (~C).
store( i, j , xmm1 * factor );
7466 (~C).
store( i, j1, xmm2 * factor );
// Section: one packed column group, processed row by row.
7472 for(
size_t i=ii; i<iend; ++i )
7474 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7475 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7476 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7477 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
7479 IntrinsicType xmm1( (~C).
load(i,j) );
7481 for(
size_t k=kbegin; k<kend; ++k ) {
7482 const IntrinsicType a1(
set( A(i,k) ) );
7483 xmm1 = xmm1 + a1 * B.load(k,j);
7486 (~C).
store( i, j, xmm1 * factor );
// Vectorized, blocked "large" assignment kernel for a target of type DenseMatrix<MT3,true>.
// Selected only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//   C      : target matrix, read via (~C).load() and written via (~C).store()
//   A, B   : left and right operands of the product
//   scalar : scaling value, expanded into the intrinsic 'factor'
// The iteration space is tiled with iblock=128 rows, jblock=64 columns and kblock=128
// inner-dimension steps. Inside a tile, packed values A.load(i..i3,k) are multiplied with
// set( B(k,j) ) (presumably a broadcast - confirm) and accumulated into registers.
// i1, i2, i3 are defined on lines not visible here (presumably i + n*IT::size - confirm).
// NOTE(review): every visible section seeds its accumulators from (~C).load() and stores
// them back as xmm * factor. If these sections execute once per kk block (the kk loop header
// is at 7543; braces are not visible in this listing), values accumulated during earlier
// k-blocks get multiplied by 'factor' again on each later block - verify the result for
// K > kblock with scalar != 1.
7510 template<
typename MT3
7514 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7515 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7517 typedef IntrinsicTrait<ElementType> IT;
7519 const size_t M( A.rows() );
7520 const size_t N( B.columns() );
7521 const size_t K( A.columns() );
// Tile extents for the i, j and k dimensions.
7523 const size_t iblock( 128UL );
7524 const size_t jblock( 64UL );
7525 const size_t kblock( 128UL );
7527 const IntrinsicType factor(
set( scalar ) );
7529 for(
size_t ii=0UL; ii<M; ii+=iblock )
7531 const size_t iend(
min( ii+iblock, M ) );
7533 for(
size_t jj=0UL; jj<N; jj+=jblock )
7535 const size_t jend(
min( jj+jblock, N ) );
// Pass over every element of the current tile; the loop body is not visible here
// (presumably it resets C(i,j) before accumulation - TODO confirm).
7537 for(
size_t j=jj; j<jend; ++j ) {
7538 for(
size_t i=ii; i<iend; ++i ) {
7543 for(
size_t kk=0UL; kk<K; kk+=kblock )
7545 const size_t ktmp(
min( kk+kblock, K ) );
// Section: two columns (j, j+1) x four packed row groups (i, i1, i2, i3).
7557 for( ; (j+2UL) <= jend; j+=2UL )
// k range: clipped to the current k-block [kk,ktmp) and further narrowed by the
// IsUpper/IsLower traits of the operands.
7559 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7560 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7561 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7562 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7564 IntrinsicType xmm1( (~C).
load(i ,j ) );
7565 IntrinsicType xmm2( (~C).
load(i1,j ) );
7566 IntrinsicType xmm3( (~C).
load(i2,j ) );
7567 IntrinsicType xmm4( (~C).
load(i3,j ) );
7568 IntrinsicType xmm5( (~C).
load(i ,j+1UL) );
7569 IntrinsicType xmm6( (~C).
load(i1,j+1UL) );
7570 IntrinsicType xmm7( (~C).
load(i2,j+1UL) );
7571 IntrinsicType xmm8( (~C).
load(i3,j+1UL) );
7573 for(
size_t k=kbegin; k<kend; ++k ) {
7574 const IntrinsicType a1( A.load(i ,k) );
7575 const IntrinsicType a2( A.load(i1,k) );
7576 const IntrinsicType a3( A.load(i2,k) );
7577 const IntrinsicType a4( A.load(i3,k) );
7578 const IntrinsicType b1(
set( B(k,j ) ) );
7579 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7580 xmm1 = xmm1 + a1 * b1;
7581 xmm2 = xmm2 + a2 * b1;
7582 xmm3 = xmm3 + a3 * b1;
7583 xmm4 = xmm4 + a4 * b1;
7584 xmm5 = xmm5 + a1 * b2;
7585 xmm6 = xmm6 + a2 * b2;
7586 xmm7 = xmm7 + a3 * b2;
7587 xmm8 = xmm8 + a4 * b2;
7590 (~C).
store( i , j , xmm1 * factor );
7591 (~C).
store( i1, j , xmm2 * factor );
7592 (~C).
store( i2, j , xmm3 * factor );
7593 (~C).
store( i3, j , xmm4 * factor );
7594 (~C).
store( i , j+1UL, xmm5 * factor );
7595 (~C).
store( i1, j+1UL, xmm6 * factor );
7596 (~C).
store( i2, j+1UL, xmm7 * factor );
7597 (~C).
store( i3, j+1UL, xmm8 * factor );
// Section: single remaining column x four packed row groups.
7602 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7603 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7604 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7605 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7607 IntrinsicType xmm1( (~C).
load(i ,j) );
7608 IntrinsicType xmm2( (~C).
load(i1,j) );
7609 IntrinsicType xmm3( (~C).
load(i2,j) );
7610 IntrinsicType xmm4( (~C).
load(i3,j) );
7612 for(
size_t k=kbegin; k<kend; ++k ) {
7613 const IntrinsicType b1(
set( B(k,j) ) );
7614 xmm1 = xmm1 + A.load(i ,k) * b1;
7615 xmm2 = xmm2 + A.load(i1,k) * b1;
7616 xmm3 = xmm3 + A.load(i2,k) * b1;
7617 xmm4 = xmm4 + A.load(i3,k) * b1;
7620 (~C).
store( i , j, xmm1 * factor );
7621 (~C).
store( i1, j, xmm2 * factor );
7622 (~C).
store( i2, j, xmm3 * factor );
7623 (~C).
store( i3, j, xmm4 * factor );
// Section: four columns (j .. j+3) x two packed row groups (i, i1).
7633 for( ; (j+4UL) <= jend; j+=4UL )
7635 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7636 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7637 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7638 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7640 IntrinsicType xmm1( (~C).
load(i ,j ) );
7641 IntrinsicType xmm2( (~C).
load(i1,j ) );
7642 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
7643 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
7644 IntrinsicType xmm5( (~C).
load(i ,j+2UL) );
7645 IntrinsicType xmm6( (~C).
load(i1,j+2UL) );
7646 IntrinsicType xmm7( (~C).
load(i ,j+3UL) );
7647 IntrinsicType xmm8( (~C).
load(i1,j+3UL) );
7649 for(
size_t k=kbegin; k<kend; ++k ) {
7650 const IntrinsicType a1( A.load(i ,k) );
7651 const IntrinsicType a2( A.load(i1,k) );
7652 const IntrinsicType b1(
set( B(k,j ) ) );
7653 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7654 const IntrinsicType b3(
set( B(k,j+2UL) ) );
7655 const IntrinsicType b4(
set( B(k,j+3UL) ) );
7656 xmm1 = xmm1 + a1 * b1;
7657 xmm2 = xmm2 + a2 * b1;
7658 xmm3 = xmm3 + a1 * b2;
7659 xmm4 = xmm4 + a2 * b2;
7660 xmm5 = xmm5 + a1 * b3;
7661 xmm6 = xmm6 + a2 * b3;
7662 xmm7 = xmm7 + a1 * b4;
7663 xmm8 = xmm8 + a2 * b4;
7666 (~C).
store( i , j , xmm1 * factor );
7667 (~C).
store( i1, j , xmm2 * factor );
7668 (~C).
store( i , j+1UL, xmm3 * factor );
7669 (~C).
store( i1, j+1UL, xmm4 * factor );
7670 (~C).
store( i , j+2UL, xmm5 * factor );
7671 (~C).
store( i1, j+2UL, xmm6 * factor );
7672 (~C).
store( i , j+3UL, xmm7 * factor );
7673 (~C).
store( i1, j+3UL, xmm8 * factor );
// Section: two columns x two packed row groups.
7676 for( ; (j+2UL) <= jend; j+=2UL )
7678 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7679 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7680 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7681 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7683 IntrinsicType xmm1( (~C).
load(i ,j ) );
7684 IntrinsicType xmm2( (~C).
load(i1,j ) );
7685 IntrinsicType xmm3( (~C).
load(i ,j+1UL) );
7686 IntrinsicType xmm4( (~C).
load(i1,j+1UL) );
7688 for(
size_t k=kbegin; k<kend; ++k ) {
7689 const IntrinsicType a1( A.load(i ,k) );
7690 const IntrinsicType a2( A.load(i1,k) );
7691 const IntrinsicType b1(
set( B(k,j ) ) );
7692 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7693 xmm1 = xmm1 + a1 * b1;
7694 xmm2 = xmm2 + a2 * b1;
7695 xmm3 = xmm3 + a1 * b2;
7696 xmm4 = xmm4 + a2 * b2;
7699 (~C).
store( i , j , xmm1 * factor );
7700 (~C).
store( i1, j , xmm2 * factor );
7701 (~C).
store( i , j+1UL, xmm3 * factor );
7702 (~C).
store( i1, j+1UL, xmm4 * factor );
// Section: single remaining column x two packed row groups.
7707 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7708 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7709 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7710 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7712 IntrinsicType xmm1( (~C).
load(i ,j) );
7713 IntrinsicType xmm2( (~C).
load(i1,j) );
7715 for(
size_t k=kbegin; k<kend; ++k ) {
7716 const IntrinsicType b1(
set( B(k,j) ) );
7717 xmm1 = xmm1 + A.load(i ,k) * b1;
7718 xmm2 = xmm2 + A.load(i1,k) * b1;
7721 (~C).
store( i , j, xmm1 * factor );
7722 (~C).
store( i1, j, xmm2 * factor );
// Section: one packed row group, processed column by column.
7728 for(
size_t j=jj; j<jend; ++j )
7730 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7731 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7732 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
7733 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7735 IntrinsicType xmm1( (~C).
load(i,j) );
7737 for(
size_t k=kbegin; k<kend; ++k ) {
7738 const IntrinsicType b1(
set( B(k,j) ) );
7739 xmm1 = xmm1 + A.load(i,k) * b1;
7742 (~C).
store( i, j, xmm1 * factor );
// Default variant of the BLAS assignment kernel: selected when
// UseDefaultKernel<MT3,MT4,MT5,ST2> is satisfied. No BLAS routine is called; the
// arguments are forwarded unchanged to selectLargeAssignKernel().
7765 template<
typename MT3
7769 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7770 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7772 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assignment kernel selected when UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> is satisfied.
// Dispatch visible here:
//   - A triangular      -> strmm( C, A, CblasLeft,  lower/upper chosen by IsLower<MT4>, scalar )
//   - else B triangular -> strmm( C, B, CblasRight, lower/upper chosen by IsLower<MT5>, scalar )
//   - remaining call    -> sgemm( C, A, B, scalar, 0.0F )
// NOTE(review): the statements between the conditions and the strmm calls, and the branch
// keyword in front of sgemm, are not visible in this listing (presumably C is first filled
// with the non-triangular operand and sgemm sits in the final else - confirm).
7791 template<
typename MT3
7795 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
7796 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7798 if( IsTriangular<MT4>::value ) {
7800 strmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
7802 else if( IsTriangular<MT5>::value ) {
7804 strmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
7807 sgemm( C, A, B, scalar, 0.0F );
// BLAS assignment kernel selected when UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> is satisfied.
// Dispatch visible here:
//   - A triangular      -> dtrmm( C, A, CblasLeft,  lower/upper chosen by IsLower<MT4>, scalar )
//   - else B triangular -> dtrmm( C, B, CblasRight, lower/upper chosen by IsLower<MT5>, scalar )
//   - remaining call    -> dgemm( C, A, B, scalar, 0.0 )
// NOTE(review): the statements between the conditions and the dtrmm calls, and the branch
// keyword in front of dgemm, are not visible in this listing (presumably C is first filled
// with the non-triangular operand and dgemm sits in the final else - confirm).
7828 template<
typename MT3
7832 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
7833 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7835 if( IsTriangular<MT4>::value ) {
7837 dtrmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
7839 else if( IsTriangular<MT5>::value ) {
7841 dtrmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
7844 dgemm( C, A, B, scalar, 0.0 );
// BLAS assignment kernel selected when UseSinglePrecisionComplexKernel<MT3,MT4,MT5> is satisfied.
// The scalar is passed to BLAS as complex<float>( scalar, 0.0F ), i.e. as the real part with
// a zero imaginary part. Dispatch visible here:
//   - A triangular      -> ctrmm( C, A, CblasLeft,  lower/upper chosen by IsLower<MT4>, ... )
//   - else B triangular -> ctrmm( C, B, CblasRight, lower/upper chosen by IsLower<MT5>, ... )
//   - remaining call    -> cgemm( C, A, B, complex scalar, complex zero )
// NOTE(review): the statements preceding the ctrmm calls and the branch keyword in front of
// cgemm are not visible in this listing - confirm against the full source.
7865 template<
typename MT3
7869 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
7870 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7872 if( IsTriangular<MT4>::value ) {
7874 ctrmm( C, A, CblasLeft,
7875 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
7876 complex<float>( scalar, 0.0F ) );
7878 else if( IsTriangular<MT5>::value ) {
7880 ctrmm( C, B, CblasRight,
7881 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
7882 complex<float>( scalar, 0.0F ) );
7885 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS assignment kernel selected when UseDoublePrecisionComplexKernel<MT3,MT4,MT5> is satisfied.
// The scalar is passed to BLAS as complex<double>( scalar, 0.0 ), i.e. as the real part with
// a zero imaginary part. Dispatch visible here:
//   - A triangular      -> ztrmm( C, A, CblasLeft,  lower/upper chosen by IsLower<MT4>, ... )
//   - else B triangular -> ztrmm( C, B, CblasRight, lower/upper chosen by IsLower<MT5>, ... )
//   - remaining call    -> zgemm( C, A, B, complex scalar, complex zero )
// NOTE(review): the statements preceding the ztrmm calls and the branch keyword in front of
// zgemm are not visible in this listing - confirm against the full source.
7906 template<
typename MT3
7910 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
7911 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7913 if( IsTriangular<MT4>::value ) {
7915 ztrmm( C, A, CblasLeft,
7916 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
7917 complex<double>( scalar, 0.0 ) );
7919 else if( IsTriangular<MT5>::value ) {
7921 ztrmm( C, B, CblasRight,
7922 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
7923 complex<double>( scalar, 0.0 ) );
7926 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Assignment of the scaled product expression (rhs) to a sparse matrix (lhs).
// TmpType is ResultType or OppositeType, chosen by SelectType on the target's storage
// order flag SO. The expression is evaluated through serial( rhs ) into the temporary
// 'tmp'; the statement that transfers tmp into lhs is not visible in this listing
// (presumably an assign of tmp to ~lhs - confirm).
7944 template<
typename MT
7946 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7950 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
7962 const TmpType tmp(
serial( rhs ) );
// Addition assignment of the scaled product expression (rhs) to a dense matrix (lhs).
// The left and right operands are taken from rhs.matrix_. The condition at 7991 detects an
// empty target or an empty inner dimension (its body is not visible here; presumably an
// early return - confirm). The work is then delegated to selectAddAssignKernel() with
// rhs.scalar_ as scaling factor; A and B are defined on lines not visible in this listing
// (presumably the evaluated operands - confirm).
7979 template<
typename MT
7981 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7988 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7989 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7991 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8005 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += scalar * A * B.
// If both operands are diagonal (or a second condition that is not visible in this listing
// holds) the small kernel is called; the remaining call goes to the BLAS kernel
// (presumably in the else branch - the keyword is not visible here).
8020 template<
typename MT3
8024 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8026 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
8028 selectSmallAddAssignKernel( C, A, B, scalar );
8030 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the case that neither A nor B is diagonal.
// The scaled product A * B * scalar is evaluated via serial() into a temporary of type
// ResultType; the statement that adds tmp to C is not visible in this listing
// (presumably an addAssign of tmp to C - confirm).
8048 template<
typename MT3
8052 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
8053 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8055 const ResultType tmp(
serial( A * B * scalar ) );
// Default addition assignment kernel for a DenseMatrix<MT3,false> target when A is not
// diagonal and B is diagonal. Because only B(j,j) is used, each update reduces to
//    C(i,j) += A(i,j) * B(j,j) * scalar
// The element range is traversed in tiles of 16 x 16 (block). Per row i the column range
// [jbegin,jpos) is additionally limited by the IsUpper/IsLower/IsStrictly* traits of MT4;
// the alternative branches of both conditionals are not visible in this listing.
8074 template<
typename MT3
8078 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8079 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8081 const size_t M( A.rows() );
8082 const size_t N( B.columns() );
// Tile edge length used for both dimensions.
8084 const size_t block( 16UL );
8086 for(
size_t ii=0UL; ii<M; ii+=block ) {
8087 const size_t iend(
min( M, ii+block ) );
8088 for(
size_t jj=0UL; jj<N; jj+=block ) {
8089 const size_t jend(
min( N, jj+block ) );
8090 for(
size_t i=ii; i<iend; ++i )
8092 const size_t jbegin( ( IsUpper<MT4>::value )
8093 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
8095 const size_t jpos( ( IsLower<MT4>::value )
8096 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
8099 for(
size_t j=jbegin; j<jpos; ++j ) {
8100 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition assignment kernel for a DenseMatrix<MT3,true> target when A is not
// diagonal and B is diagonal. Per column j the update is
//    C(i,j) += A(i,j) * B(j,j) * scalar
// for i in [ibegin,iend), where the bounds depend on the IsLower/IsUpper/IsStrictly* traits
// of MT4 (the alternative branches are not visible in this listing). The row loop is
// unrolled by two; the single update at 8150 handles index ipos (presumably guarded by a
// check that a remainder element exists - that line is not visible here, confirm).
8122 template<
typename MT3
8126 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
8127 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8129 const size_t M( A.rows() );
8130 const size_t N( B.columns() );
8132 for(
size_t j=0UL; j<N; ++j )
8134 const size_t ibegin( ( IsLower<MT4>::value )
8135 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
8137 const size_t iend( ( IsUpper<MT4>::value )
8138 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, i.e. rounds the element count down to an even
// number, so ipos marks the end of the part that can be processed in pairs.
8142 const size_t inum( iend - ibegin );
8143 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
8145 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
8146 (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
8147 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
8150 (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition assignment kernel for a DenseMatrix<MT3,false> target when A is
// diagonal and B is not. Per row i the update is
//    C(i,j) += A(i,i) * B(i,j) * scalar
// for j in [jbegin,jend), where the bounds depend on the IsUpper/IsLower/IsStrictly* traits
// of MT5 (the alternative branches are not visible in this listing). The column loop is
// unrolled by two; the single update at 8198 handles index jpos (presumably guarded by a
// check that a remainder element exists - that line is not visible here, confirm).
8170 template<
typename MT3
8174 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8175 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8177 const size_t M( A.rows() );
8178 const size_t N( B.columns() );
8180 for(
size_t i=0UL; i<M; ++i )
8182 const size_t jbegin( ( IsUpper<MT5>::value )
8183 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
8185 const size_t jend( ( IsLower<MT5>::value )
8186 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, i.e. rounds the element count down to an even
// number, so jpos marks the end of the part that can be processed in pairs.
8190 const size_t jnum( jend - jbegin );
8191 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
8193 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
8194 (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
8195 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
8198 (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition assignment kernel for a DenseMatrix<MT3,true> target when A is diagonal
// and B is not. Because only A(i,i) is used, each update reduces to
//    C(i,j) += A(i,i) * B(i,j) * scalar
// The element range is traversed in tiles of 16 x 16 (block). Per column j the row range
// [ibegin,ipos) is additionally limited by the IsLower/IsUpper/IsStrictly* traits of MT5;
// the alternative branches of both conditionals are not visible in this listing.
8218 template<
typename MT3
8222 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
8223 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8225 const size_t M( A.rows() );
8226 const size_t N( B.columns() );
// Tile edge length used for both dimensions.
8228 const size_t block( 16UL );
8230 for(
size_t jj=0UL; jj<N; jj+=block ) {
8231 const size_t jend(
min( N, jj+block ) );
8232 for(
size_t ii=0UL; ii<M; ii+=block ) {
8233 const size_t iend(
min( M, ii+block ) );
8234 for(
size_t j=jj; j<jend; ++j )
8236 const size_t ibegin( ( IsLower<MT5>::value )
8237 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
8239 const size_t ipos( ( IsUpper<MT5>::value )
8240 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
8243 for(
size_t i=ibegin; i<ipos; ++i ) {
8244 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition assignment kernel for the case that both A and B are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row
// index i of A.
8266 template<
typename MT3
8270 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
8271 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8273 for(
size_t i=0UL; i<A.rows(); ++i ) {
8274 C(i,i) += A(i,i) * B(i,i) * scalar;
// Fallback "small" addition assignment kernel: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is NOT satisfied (DisableIf). It forwards
// C, A, B and scalar unchanged to selectDefaultAddAssignKernel().
8293 template<
typename MT3
8297 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8298 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8300 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized "small" addition assignment kernel for a target of type DenseMatrix<MT3,false>.
// Selected only when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is satisfied.
//   C      : target matrix; each visible store writes (~C).load(...) + xmm * factor
//   A, B   : left and right operands of the product
//   scalar : scaling value, expanded into the intrinsic 'factor'
// Products are accumulated in IntrinsicType registers: set( A(i,k) ) (presumably a broadcast
// of the scalar element - confirm) is multiplied by packed values B.load(k,j+n*IT::size).
// The k range [kbegin,kend) of every section depends on the IsUpper/IsLower/IsStrictly*
// traits of MT4 and MT5 (presumably to skip structurally zero elements - confirm).
// NOTE(review): only part of the body is visible in this listing (the loops over j, most
// stores and all braces are not shown); the comments describe the visible statements only.
8319 template<
typename MT3
8323 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8324 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8326 typedef IntrinsicTrait<ElementType> IT;
8328 const size_t M( A.rows() );
8329 const size_t N( B.columns() );
8330 const size_t K( A.columns() );
8332 const IntrinsicType factor(
set( scalar ) );
// Section: one row i at a time, eight packed column groups (j .. j+IT::size*7UL).
8337 for(
size_t i=0UL; i<M; ++i )
8339 const size_t kbegin( ( IsUpper<MT4>::value )
8340 ?( ( IsLower<MT5>::value )
8341 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8342 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8343 :( IsLower<MT5>::value ? j : 0UL ) );
8344 const size_t kend( ( IsLower<MT4>::value )
8345 ?( ( IsUpper<MT5>::value )
8346 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
8347 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
8348 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// Accumulators are default-constructed (presumably zero-initialized - confirm).
8350 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8352 for(
size_t k=kbegin; k<kend; ++k ) {
8353 const IntrinsicType a1(
set( A(i,k) ) );
8354 xmm1 = xmm1 + a1 * B.load(k,j );
8355 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
8356 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
8357 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
8358 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
8359 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
8360 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
8361 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
8364 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
// Section: two rows (i, i+1) at a time, four packed column groups.
8379 for( ; (i+2UL) <= M; i+=2UL )
8381 const size_t kbegin( ( IsUpper<MT4>::value )
8382 ?( ( IsLower<MT5>::value )
8383 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8384 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8385 :( IsLower<MT5>::value ? j : 0UL ) );
8386 const size_t kend( ( IsLower<MT4>::value )
8387 ?( ( IsUpper<MT5>::value )
8388 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
8389 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8390 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
8392 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8394 for(
size_t k=kbegin; k<kend; ++k ) {
8395 const IntrinsicType a1(
set( A(i ,k) ) );
8396 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8397 const IntrinsicType b1( B.load(k,j ) );
8398 const IntrinsicType b2( B.load(k,j+
IT::size ) );
8399 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
8400 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
8401 xmm1 = xmm1 + a1 * b1;
8402 xmm2 = xmm2 + a1 * b2;
8403 xmm3 = xmm3 + a1 * b3;
8404 xmm4 = xmm4 + a1 * b4;
8405 xmm5 = xmm5 + a2 * b1;
8406 xmm6 = xmm6 + a2 * b2;
8407 xmm7 = xmm7 + a2 * b3;
8408 xmm8 = xmm8 + a2 * b4;
8411 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8415 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
// Section: single remaining row, four packed column groups.
8423 const size_t kbegin( ( IsUpper<MT4>::value )
8424 ?( ( IsLower<MT5>::value )
8425 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8426 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8427 :( IsLower<MT5>::value ? j : 0UL ) );
8428 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
8430 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8432 for(
size_t k=kbegin; k<kend; ++k ) {
8433 const IntrinsicType a1(
set( A(i,k) ) );
8434 xmm1 = xmm1 + a1 * B.load(k,j );
8435 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
8436 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
8437 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
8440 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
// Section: two rows at a time, two packed column groups.
8451 for( ; (i+2UL) <= M; i+=2UL )
8453 const size_t kbegin( ( IsUpper<MT4>::value )
8454 ?( ( IsLower<MT5>::value )
8455 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8456 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8457 :( IsLower<MT5>::value ? j : 0UL ) );
8458 const size_t kend( ( IsLower<MT4>::value )
8459 ?( ( IsUpper<MT5>::value )
8460 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
8461 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8462 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
8464 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8466 for(
size_t k=kbegin; k<kend; ++k ) {
8467 const IntrinsicType a1(
set( A(i ,k) ) );
8468 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8469 const IntrinsicType b1( B.load(k,j ) );
8470 const IntrinsicType b2( B.load(k,j+
IT::size) );
8471 xmm1 = xmm1 + a1 * b1;
8472 xmm2 = xmm2 + a1 * b2;
8473 xmm3 = xmm3 + a2 * b1;
8474 xmm4 = xmm4 + a2 * b2;
8477 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8479 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
// Section: single remaining row, two packed column groups.
8485 const size_t kbegin( ( IsUpper<MT4>::value )
8486 ?( ( IsLower<MT5>::value )
8487 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8488 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8489 :( IsLower<MT5>::value ? j : 0UL ) );
8490 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
8492 IntrinsicType xmm1, xmm2;
8494 for(
size_t k=kbegin; k<kend; ++k ) {
8495 const IntrinsicType a1(
set( A(i,k) ) );
8496 xmm1 = xmm1 + a1 * B.load(k,j );
8497 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
8500 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
// Section: two rows at a time, one packed column group.
8509 for( ; (i+2UL) <= M; i+=2UL )
8511 const size_t kbegin( ( IsUpper<MT4>::value )
8512 ?( ( IsLower<MT5>::value )
8513 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8514 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8515 :( IsLower<MT5>::value ? j : 0UL ) );
8516 const size_t kend( ( IsLower<MT4>::value )
8517 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8520 IntrinsicType xmm1, xmm2;
8522 for(
size_t k=kbegin; k<kend; ++k ) {
8523 const IntrinsicType b1( B.load(k,j) );
8524 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
8525 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
8528 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
8529 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Section: single remaining row, one packed column group; k runs up to K here.
8534 const size_t kbegin( ( IsUpper<MT4>::value )
8535 ?( ( IsLower<MT5>::value )
8536 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8537 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8538 :( IsLower<MT5>::value ? j : 0UL ) );
8542 for(
size_t k=kbegin; k<K; ++k ) {
8543 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
8546 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Vectorized "small matrix" kernel for the scaled addition assignment
// C += scalar * ( A * B ), overload for a target of type DenseMatrix<MT3,true>.
// Enabled through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// The kernel walks the target in tiles of decreasing height (8, 4, 2, 1 SIMD
// vectors along the row index i) and accumulates A.load(i..,k) * set( B(k,j) )
// over k before adding the scaled sum to the loaded C values.
//
// NOTE(review): several original lines are missing from this excerpt (braces,
// the loops that declare/advance the row index `i`, the declaration of `j` for
// the two-column loops, most stores and the last accumulator declaration), so
// the comments below only describe what is visible.
// NOTE(review): the accumulators are declared without an initializer; the code
// presumably relies on IntrinsicType default-constructing to zero - confirm.
8567 template<
typename MT3
8571 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8572 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8574 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the summation index k runs over K.
8576 const size_t M( A.rows() );
8577 const size_t N( B.columns() );
8578 const size_t K( A.columns() );
// Scalar wrapped by set() so it can be multiplied with the SIMD accumulators.
8580 const IntrinsicType factor(
set( scalar ) );
// --- Tile: 8 SIMD vectors of rows x 1 column --------------------------------
8585 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend narrow the k range according to the IsLower/IsUpper/IsStrictly*
// traits of A (MT4) and B (MT5); presumably this skips the parts of triangular
// operands that are known to be zero.
8587 const size_t kbegin( ( IsLower<MT5>::value )
8588 ?( ( IsUpper<MT4>::value )
8589 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8590 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8591 :( IsUpper<MT4>::value ? i : 0UL ) );
8592 const size_t kend( ( IsUpper<MT5>::value )
8593 ?( ( IsLower<MT4>::value )
8594 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8595 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8596 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
8598 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// One element of B is spread via set() and multiplied with eight row vectors of A.
8600 for(
size_t k=kbegin; k<kend; ++k ) {
8601 const IntrinsicType b1(
set( B(k,j) ) );
8602 xmm1 = xmm1 + A.load(i ,k) * b1;
8603 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
8604 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
8605 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
8606 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
8607 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
8608 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
8609 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Add the scaled sum to the existing target values (remaining stores not shown).
8612 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
// --- Tile: 4 SIMD vectors of rows x 2 columns -------------------------------
8627 for( ; (j+2UL) <= N; j+=2UL )
8629 const size_t kbegin( ( IsLower<MT5>::value )
8630 ?( ( IsUpper<MT4>::value )
8631 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8632 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8633 :( IsUpper<MT4>::value ? i : 0UL ) );
8634 const size_t kend( ( IsUpper<MT5>::value )
8635 ?( ( IsLower<MT4>::value )
8636 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8637 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8638 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
8640 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
8642 for(
size_t k=kbegin; k<kend; ++k ) {
8643 const IntrinsicType a1( A.load(i ,k) );
8644 const IntrinsicType a2( A.load(i+
IT::size ,k) );
8645 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
8646 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
8647 const IntrinsicType b1(
set( B(k,j ) ) );
8648 const IntrinsicType b2(
set( B(k,j+1UL) ) );
8649 xmm1 = xmm1 + a1 * b1;
8650 xmm2 = xmm2 + a2 * b1;
8651 xmm3 = xmm3 + a3 * b1;
8652 xmm4 = xmm4 + a4 * b1;
8653 xmm5 = xmm5 + a1 * b2;
8654 xmm6 = xmm6 + a2 * b2;
8655 xmm7 = xmm7 + a3 * b2;
8656 xmm8 = xmm8 + a4 * b2;
8659 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8663 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
// --- Tile: 4 SIMD vectors of rows x 1 column (leftover column) --------------
8671 const size_t kbegin( ( IsLower<MT5>::value )
8672 ?( ( IsUpper<MT4>::value )
8673 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8674 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8675 :( IsUpper<MT4>::value ? i : 0UL ) );
8676 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
8678 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8680 for(
size_t k=kbegin; k<kend; ++k ) {
8681 const IntrinsicType b1(
set( B(k,j) ) );
8682 xmm1 = xmm1 + A.load(i ,k) * b1;
8683 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
8684 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
8685 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
8688 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
// --- Tile: 2 SIMD vectors of rows x 2 columns -------------------------------
8699 for( ; (j+2UL) <= N; j+=2UL )
8701 const size_t kbegin( ( IsLower<MT5>::value )
8702 ?( ( IsUpper<MT4>::value )
8703 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8704 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8705 :( IsUpper<MT4>::value ? i : 0UL ) );
8706 const size_t kend( ( IsUpper<MT5>::value )
8707 ?( ( IsLower<MT4>::value )
8708 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8709 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8710 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
8712 IntrinsicType xmm1, xmm2, xmm3, xmm4;
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
8714 for(
size_t k=kbegin; k<kend; ++k ) {
8715 const IntrinsicType a1( A.load(i ,k) );
8716 const IntrinsicType a2( A.load(i+
IT::size,k) );
8717 const IntrinsicType b1(
set( B(k,j ) ) );
8718 const IntrinsicType b2(
set( B(k,j+1UL) ) );
8719 xmm1 = xmm1 + a1 * b1;
8720 xmm2 = xmm2 + a2 * b1;
8721 xmm3 = xmm3 + a1 * b2;
8722 xmm4 = xmm4 + a2 * b2;
8725 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8727 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
// --- Tile: 2 SIMD vectors of rows x 1 column (leftover column) --------------
8733 const size_t kbegin( ( IsLower<MT5>::value )
8734 ?( ( IsUpper<MT4>::value )
8735 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8736 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8737 :( IsUpper<MT4>::value ? i : 0UL ) );
8738 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
8740 IntrinsicType xmm1, xmm2;
8742 for(
size_t k=kbegin; k<kend; ++k ) {
8743 const IntrinsicType b1(
set( B(k,j) ) );
8744 xmm1 = xmm1 + A.load(i ,k) * b1;
8745 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
8748 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
// --- Tile: 1 SIMD vector of rows x 2 columns --------------------------------
8757 for( ; (j+2UL) <= N; j+=2UL )
8759 const size_t kbegin( ( IsLower<MT5>::value )
8760 ?( ( IsUpper<MT4>::value )
8761 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8762 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8763 :( IsUpper<MT4>::value ? i : 0UL ) );
// The alternative branch of this kend expression is on a line not shown here.
8764 const size_t kend( ( IsUpper<MT5>::value )
8765 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8768 IntrinsicType xmm1, xmm2;
8770 for(
size_t k=kbegin; k<kend; ++k ) {
8771 const IntrinsicType a1( A.load(i,k) );
8772 xmm1 = xmm1 + a1 *
set( B(k,j ) );
8773 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
8776 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
8777 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// --- Tile: 1 SIMD vector of rows x 1 column (leftover column) ---------------
8782 const size_t kbegin( ( IsLower<MT5>::value )
8783 ?( ( IsUpper<MT4>::value )
8784 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8785 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8786 :( IsUpper<MT4>::value ? i : 0UL ) );
// No upper bound is derived here: k runs all the way to K.
8790 for(
size_t k=kbegin; k<K; ++k ) {
8791 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
8794 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Fallback overload of the "large matrix" kernel for the scaled addition
// assignment. It is selected through DisableIf, i.e. whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold, and simply
// forwards all four arguments to selectDefaultAddAssignKernel.
// NOTE(review): the remaining template parameters and the braces are on lines
// that are not part of this excerpt.
8814 template<
typename MT3
8818 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8819 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8821 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized, cache-blocked "large matrix" kernel for the scaled addition
// assignment C += scalar * ( A * B ), overload for a target of type
// DenseMatrix<MT3,false>. Enabled through EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// The iteration space is cut into blocks of jblock columns, iblock rows and
// kblock summation steps. Inside a block, tiles of rows x SIMD column vectors
// are accumulated over k and then added, scaled by `factor`, to the loaded
// values of C.
//
// NOTE(review): lines declaring `j`, `j1`, `j2`, `j3`, `i` and the enclosing
// column loops, as well as all braces, are missing from this excerpt;
// presumably j1/j2/j3 are j plus multiples of IT::size - confirm.
// NOTE(review): accumulators are declared without initializer and presumably
// rely on IntrinsicType default-constructing to zero - confirm.
8840 template<
typename MT3
8844 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
8845 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8847 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the summation index k runs over K.
8849 const size_t M( A.rows() );
8850 const size_t N( B.columns() );
8851 const size_t K( A.columns() );
// Block sizes of the three blocked loops (rows, columns, summation index).
8853 const size_t iblock( 64UL );
8854 const size_t jblock( 128UL );
8855 const size_t kblock( 128UL );
8857 const IntrinsicType factor(
set( scalar ) );
// Blocked loops: columns outermost, then rows, then the summation index.
8859 for(
size_t jj=0UL; jj<N; jj+=jblock )
8861 const size_t jend(
min( jj+jblock, N ) );
8863 for(
size_t ii=0UL; ii<M; ii+=iblock )
8865 const size_t iend(
min( ii+iblock, M ) );
8867 for(
size_t kk=0UL; kk<K; kk+=kblock )
// ktmp is the exclusive upper end of the current k block.
8869 const size_t ktmp(
min( kk+kblock, K ) );
// --- Tile: 2 rows x 4 SIMD column vectors -----------------------------------
8881 for( ; (i+2UL) <= iend; i+=2UL )
// kbegin/kend clip the k block with bounds derived from the IsUpper/IsLower
// traits of A (MT4) and B (MT5).
8883 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8884 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8885 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8886 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
8888 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
8890 for(
size_t k=kbegin; k<kend; ++k ) {
8891 const IntrinsicType a1(
set( A(i ,k) ) );
8892 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8893 const IntrinsicType b1( B.load(k,j ) );
8894 const IntrinsicType b2( B.load(k,j1) );
8895 const IntrinsicType b3( B.load(k,j2) );
8896 const IntrinsicType b4( B.load(k,j3) );
8897 xmm1 = xmm1 + a1 * b1;
8898 xmm2 = xmm2 + a1 * b2;
8899 xmm3 = xmm3 + a1 * b3;
8900 xmm4 = xmm4 + a1 * b4;
8901 xmm5 = xmm5 + a2 * b1;
8902 xmm6 = xmm6 + a2 * b2;
8903 xmm7 = xmm7 + a2 * b3;
8904 xmm8 = xmm8 + a2 * b4;
// Existing C values are kept and only the partial product sum is scaled.
8907 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8908 (~C).
store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8909 (~C).
store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
8910 (~C).
store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
8911 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8912 (~C).
store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
8913 (~C).
store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
8914 (~C).
store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// --- Tile: 1 row x 4 SIMD column vectors (leftover row) ---------------------
8919 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8920 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8921 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
8922 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
8924 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8926 for(
size_t k=kbegin; k<kend; ++k ) {
8927 const IntrinsicType a1(
set( A(i,k) ) );
8928 xmm1 = xmm1 + a1 * B.load(k,j );
8929 xmm2 = xmm2 + a1 * B.load(k,j1);
8930 xmm3 = xmm3 + a1 * B.load(k,j2);
8931 xmm4 = xmm4 + a1 * B.load(k,j3);
8934 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
8935 (~C).
store( i, j1, (~C).load(i,j1) + xmm2 * factor );
8936 (~C).
store( i, j2, (~C).load(i,j2) + xmm3 * factor );
8937 (~C).
store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// --- Tile: 4 rows x 2 SIMD column vectors -----------------------------------
8947 for( ; (i+4UL) <= iend; i+=4UL )
8949 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8950 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8951 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
8952 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
8954 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Two accumulators per row: (xmm1,xmm2) row i ... (xmm7,xmm8) row i+3.
8956 for(
size_t k=kbegin; k<kend; ++k ) {
8957 const IntrinsicType a1(
set( A(i ,k) ) );
8958 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8959 const IntrinsicType a3(
set( A(i+2UL,k) ) );
8960 const IntrinsicType a4(
set( A(i+3UL,k) ) );
8961 const IntrinsicType b1( B.load(k,j ) );
8962 const IntrinsicType b2( B.load(k,j1) );
8963 xmm1 = xmm1 + a1 * b1;
8964 xmm2 = xmm2 + a1 * b2;
8965 xmm3 = xmm3 + a2 * b1;
8966 xmm4 = xmm4 + a2 * b2;
8967 xmm5 = xmm5 + a3 * b1;
8968 xmm6 = xmm6 + a3 * b2;
8969 xmm7 = xmm7 + a4 * b1;
8970 xmm8 = xmm8 + a4 * b2;
8973 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8974 (~C).
store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8975 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8976 (~C).
store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
8977 (~C).
store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
8978 (~C).
store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
8979 (~C).
store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
8980 (~C).
store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// --- Tile: 2 rows x 2 SIMD column vectors -----------------------------------
8983 for( ; (i+2UL) <= iend; i+=2UL )
8985 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8986 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8987 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8988 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
8990 IntrinsicType xmm1, xmm2, xmm3, xmm4;
8992 for(
size_t k=kbegin; k<kend; ++k ) {
8993 const IntrinsicType a1(
set( A(i ,k) ) );
8994 const IntrinsicType a2(
set( A(i+1UL,k) ) );
8995 const IntrinsicType b1( B.load(k,j ) );
8996 const IntrinsicType b2( B.load(k,j1) );
8997 xmm1 = xmm1 + a1 * b1;
8998 xmm2 = xmm2 + a1 * b2;
8999 xmm3 = xmm3 + a2 * b1;
9000 xmm4 = xmm4 + a2 * b2;
9003 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9004 (~C).
store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9005 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9006 (~C).
store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// --- Tile: 1 row x 2 SIMD column vectors (leftover row) ---------------------
9011 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9012 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9013 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9014 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
9016 IntrinsicType xmm1, xmm2;
9018 for(
size_t k=kbegin; k<kend; ++k ) {
9019 const IntrinsicType a1(
set( A(i,k) ) );
9020 xmm1 = xmm1 + a1 * B.load(k,j );
9021 xmm2 = xmm2 + a1 * B.load(k,j1);
9024 (~C).
store( i, j , (~C).load(i,j ) + xmm1 * factor );
9025 (~C).
store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// --- Tile: 1 row x 1 SIMD column vector, every row of the block -------------
9031 for(
size_t i=ii; i<iend; ++i )
9033 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9034 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9035 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9036 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
// The declaration of xmm1 for this tile is on a line not shown here.
9040 for(
size_t k=kbegin; k<kend; ++k ) {
9041 const IntrinsicType a1(
set( A(i,k) ) );
9042 xmm1 = xmm1 + a1 * B.load(k,j);
9045 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// Vectorized, cache-blocked "large matrix" kernel for the scaled addition
// assignment C += scalar * ( A * B ), overload for a target of type
// DenseMatrix<MT3,true>. Enabled through EnableIf when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// Parameters:
//   C      - target matrix that is updated in place
//   A, B   - left and right operand of the product
//   scalar - factor applied to the product A * B (not to C)
//
// The iteration space is cut into blocks of iblock rows, jblock columns and
// kblock summation steps. Inside a block, tiles of SIMD row vectors x columns
// are accumulated over k.
//
// Fix: the accumulators used to be seeded with the values of C and the whole
// accumulator was multiplied by `factor` on store, which yields
// ( C + A*B ) * scalar and re-scales C once per k block. The accumulators now
// start default-constructed and only the product sum is scaled before it is
// added to the loaded C values, exactly as the DenseMatrix<MT3,false> overload
// of this kernel does.
//
// NOTE(review): lines declaring `i`, `i1`, `i2`, `i3`, `j` and the enclosing
// row loops, as well as all braces, are missing from this excerpt;
// presumably i1/i2/i3 are i plus multiples of IT::size - confirm.
// NOTE(review): default-constructed IntrinsicType accumulators are presumed
// to be zero, as the sibling kernels in this file already assume - confirm.
9069 template<
typename MT3
9073 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9074 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9076 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the summation index k runs over K.
9078 const size_t M( A.rows() );
9079 const size_t N( B.columns() );
9080 const size_t K( A.columns() );
// Block sizes of the three blocked loops (rows, columns, summation index).
9082 const size_t iblock( 128UL );
9083 const size_t jblock( 64UL );
9084 const size_t kblock( 128UL );
9086 const IntrinsicType factor(
set( scalar ) );
// Blocked loops: rows outermost, then columns, then the summation index.
9088 for(
size_t ii=0UL; ii<M; ii+=iblock )
9090 const size_t iend(
min( ii+iblock, M ) );
9092 for(
size_t jj=0UL; jj<N; jj+=jblock )
9094 const size_t jend(
min( jj+jblock, N ) );
9096 for(
size_t kk=0UL; kk<K; kk+=kblock )
// ktmp is the exclusive upper end of the current k block.
9098 const size_t ktmp(
min( kk+kblock, K ) );
// --- Tile: 4 SIMD row vectors x 2 columns -----------------------------------
9110 for( ; (j+2UL) <= jend; j+=2UL )
// kbegin/kend clip the k block with bounds derived from the IsUpper/IsLower
// traits of A (MT4) and B (MT5).
9112 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9113 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9114 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
9115 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
9117 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9126 for(
size_t k=kbegin; k<kend; ++k ) {
9127 const IntrinsicType a1( A.load(i ,k) );
9128 const IntrinsicType a2( A.load(i1,k) );
9129 const IntrinsicType a3( A.load(i2,k) );
9130 const IntrinsicType a4( A.load(i3,k) );
9131 const IntrinsicType b1(
set( B(k,j ) ) );
9132 const IntrinsicType b2(
set( B(k,j+1UL) ) );
9133 xmm1 = xmm1 + a1 * b1;
9134 xmm2 = xmm2 + a2 * b1;
9135 xmm3 = xmm3 + a3 * b1;
9136 xmm4 = xmm4 + a4 * b1;
9137 xmm5 = xmm5 + a1 * b2;
9138 xmm6 = xmm6 + a2 * b2;
9139 xmm7 = xmm7 + a3 * b2;
9140 xmm8 = xmm8 + a4 * b2;
// Existing C values are kept unscaled; only the partial product sum is scaled.
9143 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9144 (~C).
store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9145 (~C).
store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
9146 (~C).
store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
9147 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
9148 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
9149 (~C).
store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
9150 (~C).
store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// --- Tile: 4 SIMD row vectors x 1 column (leftover column) ------------------
9155 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9156 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9157 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
9158 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9160 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9165 for(
size_t k=kbegin; k<kend; ++k ) {
9166 const IntrinsicType b1(
set( B(k,j) ) );
9167 xmm1 = xmm1 + A.load(i ,k) * b1;
9168 xmm2 = xmm2 + A.load(i1,k) * b1;
9169 xmm3 = xmm3 + A.load(i2,k) * b1;
9170 xmm4 = xmm4 + A.load(i3,k) * b1;
9173 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
9174 (~C).
store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9175 (~C).
store( i2, j, (~C).load(i2,j) + xmm3 * factor );
9176 (~C).
store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// --- Tile: 2 SIMD row vectors x 4 columns -----------------------------------
9186 for( ; (j+4UL) <= jend; j+=4UL )
9188 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9189 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9190 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
9191 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column: (xmm1,xmm2) column j ... (xmm7,xmm8) column j+3.
9193 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9202 for(
size_t k=kbegin; k<kend; ++k ) {
9203 const IntrinsicType a1( A.load(i ,k) );
9204 const IntrinsicType a2( A.load(i1,k) );
9205 const IntrinsicType b1(
set( B(k,j ) ) );
9206 const IntrinsicType b2(
set( B(k,j+1UL) ) );
9207 const IntrinsicType b3(
set( B(k,j+2UL) ) );
9208 const IntrinsicType b4(
set( B(k,j+3UL) ) );
9209 xmm1 = xmm1 + a1 * b1;
9210 xmm2 = xmm2 + a2 * b1;
9211 xmm3 = xmm3 + a1 * b2;
9212 xmm4 = xmm4 + a2 * b2;
9213 xmm5 = xmm5 + a1 * b3;
9214 xmm6 = xmm6 + a2 * b3;
9215 xmm7 = xmm7 + a1 * b4;
9216 xmm8 = xmm8 + a2 * b4;
9219 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9220 (~C).
store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9221 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9222 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9223 (~C).
store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
9224 (~C).
store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
9225 (~C).
store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
9226 (~C).
store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// --- Tile: 2 SIMD row vectors x 2 columns -----------------------------------
9229 for( ; (j+2UL) <= jend; j+=2UL )
9231 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9232 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9233 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
9234 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9236 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9241 for(
size_t k=kbegin; k<kend; ++k ) {
9242 const IntrinsicType a1( A.load(i ,k) );
9243 const IntrinsicType a2( A.load(i1,k) );
9244 const IntrinsicType b1(
set( B(k,j ) ) );
9245 const IntrinsicType b2(
set( B(k,j+1UL) ) );
9246 xmm1 = xmm1 + a1 * b1;
9247 xmm2 = xmm2 + a2 * b1;
9248 xmm3 = xmm3 + a1 * b2;
9249 xmm4 = xmm4 + a2 * b2;
9252 (~C).
store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9253 (~C).
store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9254 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9255 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// --- Tile: 2 SIMD row vectors x 1 column (leftover column) ------------------
9260 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9261 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9262 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
9263 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9265 IntrinsicType xmm1, xmm2;
9268 for(
size_t k=kbegin; k<kend; ++k ) {
9269 const IntrinsicType b1(
set( B(k,j) ) );
9270 xmm1 = xmm1 + A.load(i ,k) * b1;
9271 xmm2 = xmm2 + A.load(i1,k) * b1;
9274 (~C).
store( i , j, (~C).load(i ,j) + xmm1 * factor );
9275 (~C).
store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// --- Tile: 1 SIMD row vector x 1 column, every column of the block ----------
9281 for(
size_t j=jj; j<jend; ++j )
9283 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9284 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9285 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
9286 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9288 IntrinsicType xmm1;
9290 for(
size_t k=kbegin; k<kend; ++k ) {
9291 const IntrinsicType b1(
set( B(k,j) ) );
9292 xmm1 = xmm1 + A.load(i,k) * b1;
9295 (~C).
store( i, j, (~C).load(i,j) + xmm1 * factor );
// BLAS-path kernel of the scaled addition assignment for operand types that
// satisfy UseDefaultKernel<MT3,MT4,MT5,ST2>: no BLAS routine is called here,
// the arguments are forwarded unchanged to selectLargeAddAssignKernel.
// NOTE(review): the remaining template parameters and the braces are on lines
// that are not part of this excerpt.
9318 template<
typename MT3
9322 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9323 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9325 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-path kernel of the scaled addition assignment, selected when
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// Three cases are distinguished at compile time:
//   - A is triangular: strmm is applied to `tmp` with A on the left side,
//   - B is triangular: strmm is applied to `tmp` with B on the right side,
//   - otherwise:       sgemm is called on C directly with `scalar` and 1.0F
//                      (presumably alpha and beta, i.e. C = scalar*A*B + C).
// NOTE(review): the declaration of `tmp`, the statements that combine `tmp`
// with C, the final `else` and the braces are on lines not shown here.
9344 template<
typename MT3
9348 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
9349 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9351 if( IsTriangular<MT4>::value ) {
// Lower/upper flag follows IsLower of the triangular operand.
9353 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
9356 else if( IsTriangular<MT5>::value ) {
9358 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
9362 sgemm( C, A, B, scalar, 1.0F );
// BLAS-path kernel of the scaled addition assignment, selected when
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> holds.
// Three cases are distinguished at compile time:
//   - A is triangular: dtrmm is applied to `tmp` with A on the left side,
//   - B is triangular: dtrmm is applied to `tmp` with B on the right side,
//   - otherwise:       dgemm is called on C directly with `scalar` and 1.0
//                      (presumably alpha and beta, i.e. C = scalar*A*B + C).
// NOTE(review): the declaration of `tmp`, the statements that combine `tmp`
// with C, the final `else` and the braces are on lines not shown here.
9383 template<
typename MT3
9387 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
9388 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9390 if( IsTriangular<MT4>::value ) {
// Lower/upper flag follows IsLower of the triangular operand.
9392 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
9395 else if( IsTriangular<MT5>::value ) {
9397 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
9401 dgemm( C, A, B, scalar, 1.0 );
// BLAS-path kernel of the scaled addition assignment, selected when
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The scalar is widened to complex<float>( scalar, 0.0F ) before it is handed
// to the BLAS wrappers:
//   - A is triangular: ctrmm is applied to `tmp` with A on the left side,
//   - B is triangular: ctrmm is applied to `tmp` with B on the right side,
//   - otherwise:       cgemm is called on C directly; the last argument is the
//                      complex value (1,0) (presumably beta, keeping C).
// NOTE(review): the declaration of `tmp`, the statements that combine `tmp`
// with C, the final `else` and the braces are on lines not shown here.
9422 template<
typename MT3
9426 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
9427 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9429 if( IsTriangular<MT4>::value ) {
9431 ctrmm( tmp, A, CblasLeft,
9432 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
9433 complex<float>( scalar, 0.0F ) );
9436 else if( IsTriangular<MT5>::value ) {
9438 ctrmm( tmp, B, CblasRight,
9439 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
9440 complex<float>( scalar, 0.0F ) );
9444 cgemm( C, A, B, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-path kernel of the scaled addition assignment, selected when
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5> holds.
// The scalar is widened to complex<double>( scalar, 0.0 ) before it is handed
// to the BLAS wrappers:
//   - A is triangular: ztrmm is applied to `tmp` with A on the left side,
//   - B is triangular: ztrmm is applied to `tmp` with B on the right side,
//   - otherwise:       zgemm is called on C directly; the last argument is the
//                      complex value (1,0) (presumably beta, keeping C).
// NOTE(review): the declaration of `tmp`, the statements that combine `tmp`
// with C, the final `else` and the braces are on lines not shown here.
9465 template<
typename MT3
9469 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
9470 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9472 if( IsTriangular<MT4>::value ) {
9474 ztrmm( tmp, A, CblasLeft,
9475 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
9476 complex<double>( scalar, 0.0 ) );
9479 else if( IsTriangular<MT5>::value ) {
9481 ztrmm( tmp, B, CblasRight,
9482 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
9483 complex<double>( scalar, 0.0 ) );
9487 zgemm( C, A, B, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Subtraction assignment of a scaled matrix product to a dense matrix
// ( lhs -= scalar * ( left * right ) ), implemented as a friend function.
// The two operands are taken from rhs.matrix_ and the work is delegated to
// selectSubAssignKernel together with rhs.scalar_.
// NOTE(review): the body of the early-exit branch, the lines that create `A`
// and `B` from `left`/`right` and the braces are not part of this excerpt.
9509 template<
typename MT
9511 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
9518 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
9519 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard for empty problems: no rows, no columns or an empty summation range.
9521 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
9535 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Dispatcher for the scaled subtraction assignment: chooses between
// selectSmallSubAssignKernel and selectBlasSubAssignKernel.
// The visible part of the condition picks the small kernel when both operands
// are diagonal.
// NOTE(review): the second half of the condition (after `||`), the `else` and
// the braces are on lines not shown here.
9550 template<
typename MT3
9554 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9556 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
9558 selectSmallSubAssignKernel( C, A, B, scalar );
9560 selectBlasSubAssignKernel( C, A, B, scalar );
// Default kernel of the scaled subtraction assignment for the general case in
// which neither A nor B is diagonal (see the EnableIf condition).
// The scaled product A * B * scalar is evaluated through serial() into a
// temporary of type ResultType.
// NOTE(review): the statement that subtracts `tmp` from C and the braces are
// on lines not shown here.
9578 template<
typename MT3
9582 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
9583 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9585 const ResultType tmp(
serial( A * B * scalar ) );
// Default kernel of the scaled subtraction assignment for a non-diagonal A and
// a diagonal B, overload for a target of type DenseMatrix<MT3,false>.
// Because only B(j,j) is read from B, each target element needs a single
// product: C(i,j) -= A(i,j) * B(j,j) * scalar.
// The target is traversed in square blocks of 16 x 16 elements.
// NOTE(review): the alternative branches of the jbegin/jpos expressions and
// the closing braces are on lines not shown here.
9604 template<
typename MT3
9608 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
9609 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9611 const size_t M( A.rows() );
9612 const size_t N( B.columns() );
// Edge length of the square blocks.
9614 const size_t block( 16UL );
9616 for(
size_t ii=0UL; ii<M; ii+=block ) {
9617 const size_t iend(
min( M, ii+block ) );
9618 for(
size_t jj=0UL; jj<N; jj+=block ) {
9619 const size_t jend(
min( N, jj+block ) );
9620 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the block, narrowed by the IsUpper/IsLower
// traits of A.
9622 const size_t jbegin( ( IsUpper<MT4>::value )
9623 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9625 const size_t jpos( ( IsLower<MT4>::value )
9626 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9629 for(
size_t j=jbegin; j<jpos; ++j ) {
9630 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default kernel of the scaled subtraction assignment for a non-diagonal A and
// a diagonal B, overload for a target of type DenseMatrix<MT3,true>.
// Each column j is processed on its own: C(i,j) -= A(i,j) * B(j,j) * scalar,
// with the row loop unrolled by two.
// NOTE(review): the alternative branches of the ibegin/iend expressions, the
// condition guarding the final single-element update and the braces are on
// lines not shown here.
9652 template<
typename MT3
9656 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
9657 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9659 const size_t M( A.rows() );
9660 const size_t N( B.columns() );
9662 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the IsLower/IsUpper traits of A.
9664 const size_t ibegin( ( IsLower<MT4>::value )
9665 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9667 const size_t iend( ( IsUpper<MT4>::value )
9668 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so ipos ends the part of the range
// that can be handled two rows at a time.
9672 const size_t inum( iend - ibegin );
9673 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
9675 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9676 (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9677 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update of the row at ipos (presumably only executed for an odd row count).
9680 (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default kernel of the scaled subtraction assignment for a diagonal A and a
// non-diagonal B, overload for a target of type DenseMatrix<MT3,false>.
// Each row i is processed on its own: C(i,j) -= A(i,i) * B(i,j) * scalar,
// with the column loop unrolled by two.
// NOTE(review): the alternative branches of the jbegin/jend expressions, the
// condition guarding the final single-element update and the braces are on
// lines not shown here.
9700 template<
typename MT3
9704 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
9705 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9707 const size_t M( A.rows() );
9708 const size_t N( B.columns() );
9710 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the IsUpper/IsLower traits of B.
9712 const size_t jbegin( ( IsUpper<MT5>::value )
9713 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9715 const size_t jend( ( IsLower<MT5>::value )
9716 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, so jpos ends the part of the range
// that can be handled two columns at a time.
9720 const size_t jnum( jend - jbegin );
9721 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
9723 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9724 (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9725 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update of the column at jpos (presumably only executed for an odd column count).
9728 (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default kernel of the scaled subtraction assignment for a diagonal A and a
// non-diagonal B, overload for a target of type DenseMatrix<MT3,true>.
// Because only A(i,i) is read from A, each target element needs a single
// product: C(i,j) -= A(i,i) * B(i,j) * scalar.
// The target is traversed in square blocks of 16 x 16 elements.
// NOTE(review): the alternative branches of the ibegin/ipos expressions and
// the closing braces are on lines not shown here.
9748 template<
typename MT3
9752 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
9753 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9755 const size_t M( A.rows() );
9756 const size_t N( B.columns() );
// Edge length of the square blocks.
9758 const size_t block( 16UL );
9760 for(
size_t jj=0UL; jj<N; jj+=block ) {
9761 const size_t jend(
min( N, jj+block ) );
9762 for(
size_t ii=0UL; ii<M; ii+=block ) {
9763 const size_t iend(
min( M, ii+block ) );
9764 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the block, narrowed by the IsLower/IsUpper
// traits of B.
9766 const size_t ibegin( ( IsLower<MT5>::value )
9767 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9769 const size_t ipos( ( IsUpper<MT5>::value )
9770 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9773 for(
size_t i=ibegin; i<ipos; ++i ) {
9774 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default kernel of the scaled subtraction assignment for the case that both
// A and B are diagonal: only the diagonal of the target is touched,
// C(i,i) -= A(i,i) * B(i,i) * scalar for every row index of A.
// NOTE(review): the remaining template parameters and the braces are on lines
// that are not part of this excerpt.
9796 template<
typename MT3
9800 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
9801 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9803 for(
size_t i=0UL; i<A.rows(); ++i ) {
9804 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Fallback overload of the "small matrix" kernel for the scaled subtraction
// assignment. It is selected through DisableIf, i.e. whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold, and simply
// forwards all four arguments to selectDefaultSubAssignKernel.
// NOTE(review): the remaining template parameters and the braces are on lines
// that are not part of this excerpt.
9823 template<
typename MT3
9827 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9828 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9830 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized "small matrix" kernel for the scaled subtraction assignment
// C -= scalar * ( A * B ), overload for a target of type DenseMatrix<MT3,false>.
// Enabled through EnableIf when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// The kernel walks the target in tiles of decreasing width (8, 4, 2, 1 SIMD
// vectors along the column index j) and accumulates set( A(i,k) ) * B.load(k,j..)
// over k before subtracting the scaled sum from the loaded C values.
//
// NOTE(review): several original lines are missing from this excerpt (braces,
// the loops that declare/advance the column index `j`, the declaration of `i`
// for the two-row loops and most stores), so the comments below only describe
// what is visible.
// NOTE(review): the accumulators are declared without an initializer; the code
// presumably relies on IntrinsicType default-constructing to zero - confirm.
9849 template<
typename MT3
9853 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
9854 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9856 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the summation index k runs over K.
9858 const size_t M( A.rows() );
9859 const size_t N( B.columns() );
9860 const size_t K( A.columns() );
// Scalar wrapped by set() so it can be multiplied with the SIMD accumulators.
9862 const IntrinsicType factor(
set( scalar ) );
// --- Tile: 1 row x 8 SIMD column vectors ------------------------------------
9867 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend narrow the k range according to the IsLower/IsUpper/IsStrictly*
// traits of A (MT4) and B (MT5); presumably this skips the parts of triangular
// operands that are known to be zero.
9869 const size_t kbegin( ( IsUpper<MT4>::value )
9870 ?( ( IsLower<MT5>::value )
9871 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9872 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9873 :( IsLower<MT5>::value ? j : 0UL ) );
9874 const size_t kend( ( IsLower<MT4>::value )
9875 ?( ( IsUpper<MT5>::value )
9876 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
9877 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9878 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
9880 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// One element of A is spread via set() and multiplied with eight column vectors of B.
9882 for(
size_t k=kbegin; k<kend; ++k ) {
9883 const IntrinsicType a1(
set( A(i,k) ) );
9884 xmm1 = xmm1 + a1 * B.load(k,j );
9885 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
9886 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
9887 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
9888 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
9889 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
9890 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
9891 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Subtract the scaled sum from the existing target values (remaining stores not shown).
9894 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
// --- Tile: 2 rows x 4 SIMD column vectors -----------------------------------
9909 for( ; (i+2UL) <= M; i+=2UL )
9911 const size_t kbegin( ( IsUpper<MT4>::value )
9912 ?( ( IsLower<MT5>::value )
9913 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9914 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9915 :( IsLower<MT5>::value ? j : 0UL ) );
9916 const size_t kend( ( IsLower<MT4>::value )
9917 ?( ( IsUpper<MT5>::value )
9918 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
9919 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9920 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
9922 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
9924 for(
size_t k=kbegin; k<kend; ++k ) {
9925 const IntrinsicType a1(
set( A(i ,k) ) );
9926 const IntrinsicType a2(
set( A(i+1UL,k) ) );
9927 const IntrinsicType b1( B.load(k,j ) );
9928 const IntrinsicType b2( B.load(k,j+
IT::size ) );
9929 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
9930 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
9931 xmm1 = xmm1 + a1 * b1;
9932 xmm2 = xmm2 + a1 * b2;
9933 xmm3 = xmm3 + a1 * b3;
9934 xmm4 = xmm4 + a1 * b4;
9935 xmm5 = xmm5 + a2 * b1;
9936 xmm6 = xmm6 + a2 * b2;
9937 xmm7 = xmm7 + a2 * b3;
9938 xmm8 = xmm8 + a2 * b4;
9941 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9945 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
// --- Tile: 1 row x 4 SIMD column vectors (leftover row) ---------------------
9953 const size_t kbegin( ( IsUpper<MT4>::value )
9954 ?( ( IsLower<MT5>::value )
9955 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9956 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9957 :( IsLower<MT5>::value ? j : 0UL ) );
9958 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
9960 IntrinsicType xmm1, xmm2, xmm3, xmm4;
9962 for(
size_t k=kbegin; k<kend; ++k ) {
9963 const IntrinsicType a1(
set( A(i,k) ) );
9964 xmm1 = xmm1 + a1 * B.load(k,j );
9965 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
9966 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
9967 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
9970 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
// --- Tile: 2 rows x 2 SIMD column vectors -----------------------------------
9981 for( ; (i+2UL) <= M; i+=2UL )
9983 const size_t kbegin( ( IsUpper<MT4>::value )
9984 ?( ( IsLower<MT5>::value )
9985 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9986 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9987 :( IsLower<MT5>::value ? j : 0UL ) );
9988 const size_t kend( ( IsLower<MT4>::value )
9989 ?( ( IsUpper<MT5>::value )
9990 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
9991 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9992 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
9994 IntrinsicType xmm1, xmm2, xmm3, xmm4;
// xmm1/xmm2 belong to row i, xmm3/xmm4 to row i+1.
9996 for(
size_t k=kbegin; k<kend; ++k ) {
9997 const IntrinsicType a1(
set( A(i ,k) ) );
9998 const IntrinsicType a2(
set( A(i+1UL,k) ) );
9999 const IntrinsicType b1( B.load(k,j ) );
10000 const IntrinsicType b2( B.load(k,j+
IT::size) );
10001 xmm1 = xmm1 + a1 * b1;
10002 xmm2 = xmm2 + a1 * b2;
10003 xmm3 = xmm3 + a2 * b1;
10004 xmm4 = xmm4 + a2 * b2;
10007 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10009 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
// --- Tile: 1 row x 2 SIMD column vectors (leftover row) ---------------------
10015 const size_t kbegin( ( IsUpper<MT4>::value )
10016 ?( ( IsLower<MT5>::value )
10017 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10018 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10019 :( IsLower<MT5>::value ? j : 0UL ) );
10020 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
10022 IntrinsicType xmm1, xmm2;
10024 for(
size_t k=kbegin; k<kend; ++k ) {
10025 const IntrinsicType a1(
set( A(i,k) ) );
10026 xmm1 = xmm1 + a1 * B.load(k,j );
10027 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
10030 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
// --- Tile: 2 rows x 1 SIMD column vector ------------------------------------
10039 for( ; (i+2UL) <= M; i+=2UL )
10041 const size_t kbegin( ( IsUpper<MT4>::value )
10042 ?( ( IsLower<MT5>::value )
10043 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10044 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10045 :( IsLower<MT5>::value ? j : 0UL ) );
// The alternative branch of this kend expression is on a line not shown here.
10046 const size_t kend( ( IsLower<MT4>::value )
10047 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10050 IntrinsicType xmm1, xmm2;
10052 for(
size_t k=kbegin; k<kend; ++k ) {
10053 const IntrinsicType b1( B.load(k,j) );
10054 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
10055 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
10058 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
10059 (~C).
store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// --- Tile: 1 row x 1 SIMD column vector (leftover row) ----------------------
10064 const size_t kbegin( ( IsUpper<MT4>::value )
10065 ?( ( IsLower<MT5>::value )
10066 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10067 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10068 :( IsLower<MT5>::value ? j : 0UL ) );
10070 IntrinsicType xmm1;
// No upper bound is derived here: k runs all the way to K.
10072 for(
size_t k=kbegin; k<K; ++k ) {
10073 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
10076 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Vectorized "small matrix" kernel for the scaled subtraction assignment to a
// column-major target (DenseMatrix<MT3,true>): C -= ( A * B ) * scalar.
//
// \param C      Target matrix; read via load() and written via store().
// \param A      Left-hand side operand; read with vector loads A.load(i,k).
// \param B      Right-hand side operand; single elements B(k,j) are broadcast via set().
// \param scalar Scaling factor; broadcast once into 'factor'.
//
// Every section below follows the same scheme: pick the k-range [kbegin,kend),
// accumulate A.load(..) * set( B(k,j) ) into the xmm registers, then store
// C.load(..) - xmm * factor. The registers are default-constructed and then
// accumulated into (assumes a default-constructed IntrinsicType is zero - TODO confirm).
// The k-range is narrowed according to the IsLower/IsUpper/IsStrictly* traits of
// MT4 and MT5 (presumably to skip structurally zero products - verify).
// NOTE(review): the loop headers over 'i' and some stores are not visible in this listing.
10097 template<
typename MT3
10101 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10102 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10104 typedef IntrinsicTrait<ElementType> IT;
10106 const size_t M( A.rows() );
10107 const size_t N( B.columns() );
10108 const size_t K( A.columns() );
// Broadcast the scalar once; it is applied to the accumulated products only.
10110 const IntrinsicType factor(
set( scalar ) );
// Section: eight vector-wide row strips of A against a single column j.
10115 for(
size_t j=0UL; j<N; ++j )
10117 const size_t kbegin( ( IsLower<MT5>::value )
10118 ?( ( IsUpper<MT4>::value )
10119 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10120 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10121 :( IsUpper<MT4>::value ? i : 0UL ) );
10122 const size_t kend( ( IsUpper<MT5>::value )
10123 ?( ( IsLower<MT4>::value )
10124 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10125 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10126 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
10128 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10130 for(
size_t k=kbegin; k<kend; ++k ) {
10131 const IntrinsicType b1(
set( B(k,j) ) );
10132 xmm1 = xmm1 + A.load(i ,k) * b1;
10133 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
10134 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
10135 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
10136 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
10137 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
10138 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
10139 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
10142 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
// Section: four row strips of A against two columns (j, j+1) per iteration.
10157 for( ; (j+2UL) <= N; j+=2UL )
10159 const size_t kbegin( ( IsLower<MT5>::value )
10160 ?( ( IsUpper<MT4>::value )
10161 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10162 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10163 :( IsUpper<MT4>::value ? i : 0UL ) );
10164 const size_t kend( ( IsUpper<MT5>::value )
10165 ?( ( IsLower<MT4>::value )
10166 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10167 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10168 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
10170 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10172 for(
size_t k=kbegin; k<kend; ++k ) {
10173 const IntrinsicType a1( A.load(i ,k) );
10174 const IntrinsicType a2( A.load(i+
IT::size ,k) );
10175 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
10176 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
10177 const IntrinsicType b1(
set( B(k,j ) ) );
10178 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10179 xmm1 = xmm1 + a1 * b1;
10180 xmm2 = xmm2 + a2 * b1;
10181 xmm3 = xmm3 + a3 * b1;
10182 xmm4 = xmm4 + a4 * b1;
10183 xmm5 = xmm5 + a1 * b2;
10184 xmm6 = xmm6 + a2 * b2;
10185 xmm7 = xmm7 + a3 * b2;
10186 xmm8 = xmm8 + a4 * b2;
10189 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10193 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
// Section: four row strips of A against one remaining column j.
10201 const size_t kbegin( ( IsLower<MT5>::value )
10202 ?( ( IsUpper<MT4>::value )
10203 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10204 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10205 :( IsUpper<MT4>::value ? i : 0UL ) );
10206 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
10208 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10210 for(
size_t k=kbegin; k<kend; ++k ) {
10211 const IntrinsicType b1(
set( B(k,j) ) );
10212 xmm1 = xmm1 + A.load(i ,k) * b1;
10213 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
10214 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
10215 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
10218 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
// Section: two row strips of A against two columns (j, j+1) per iteration.
10229 for( ; (j+2UL) <= N; j+=2UL )
10231 const size_t kbegin( ( IsLower<MT5>::value )
10232 ?( ( IsUpper<MT4>::value )
10233 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10234 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10235 :( IsUpper<MT4>::value ? i : 0UL ) );
10236 const size_t kend( ( IsUpper<MT5>::value )
10237 ?( ( IsLower<MT4>::value )
10238 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10239 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10240 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
10242 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10244 for(
size_t k=kbegin; k<kend; ++k ) {
10245 const IntrinsicType a1( A.load(i ,k) );
10246 const IntrinsicType a2( A.load(i+
IT::size,k) );
10247 const IntrinsicType b1(
set( B(k,j ) ) );
10248 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10249 xmm1 = xmm1 + a1 * b1;
10250 xmm2 = xmm2 + a2 * b1;
10251 xmm3 = xmm3 + a1 * b2;
10252 xmm4 = xmm4 + a2 * b2;
10255 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10257 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
// Section: two row strips of A against one remaining column j.
10263 const size_t kbegin( ( IsLower<MT5>::value )
10264 ?( ( IsUpper<MT4>::value )
10265 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10266 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10267 :( IsUpper<MT4>::value ? i : 0UL ) );
10268 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
10270 IntrinsicType xmm1, xmm2;
10272 for(
size_t k=kbegin; k<kend; ++k ) {
10273 const IntrinsicType b1(
set( B(k,j) ) );
10274 xmm1 = xmm1 + A.load(i ,k) * b1;
10275 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
10278 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
// Section: a single row strip of A against two columns (j, j+1) per iteration.
10287 for( ; (j+2UL) <= N; j+=2UL )
10289 const size_t kbegin( ( IsLower<MT5>::value )
10290 ?( ( IsUpper<MT4>::value )
10291 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10292 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10293 :( IsUpper<MT4>::value ? i : 0UL ) );
10294 const size_t kend( ( IsUpper<MT5>::value )
10295 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10298 IntrinsicType xmm1, xmm2;
10300 for(
size_t k=kbegin; k<kend; ++k ) {
10301 const IntrinsicType a1( A.load(i,k) );
10302 xmm1 = xmm1 + a1 *
set( B(k,j ) );
10303 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
10306 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
10307 (~C).
store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// Section: a single row strip of A against one remaining column j; k runs up to K.
10312 const size_t kbegin( ( IsLower<MT5>::value )
10313 ?( ( IsUpper<MT4>::value )
10314 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10315 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10316 :( IsUpper<MT4>::value ? i : 0UL ) );
10318 IntrinsicType xmm1;
10320 for(
size_t k=kbegin; k<K; ++k ) {
10321 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
10324 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Fallback "large matrix" kernel for the scaled subtraction assignment.
// Selected when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not apply (DisableIf);
// it simply forwards all arguments to selectDefaultSubAssignKernel().
10344 template<
typename MT3
10348 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10349 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10351 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized, tiled "large matrix" kernel for the scaled subtraction assignment
// to a row-major target (DenseMatrix<MT3,false>): C -= ( A * B ) * scalar.
//
// \param C      Target matrix; read via load() and written via store().
// \param A      Left-hand side operand; single elements A(i,k) are broadcast via set().
// \param B      Right-hand side operand; read with vector loads B.load(k,j).
// \param scalar Scaling factor; broadcast once into 'factor'.
//
// The iteration space is tiled with iblock=64, jblock=128 and kblock=128. For
// every k-tile [kk,ktmp) the products are accumulated in the xmm registers and
// C.load(..) - xmm * factor is written back, so C receives one update per k-tile.
// The registers are default-constructed and then accumulated into
// (assumes a default-constructed IntrinsicType is zero - TODO confirm).
// NOTE(review): the loop headers over 'j' and the declaration of 'j1' are not
// visible in this listing.
10370 template<
typename MT3
10374 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10375 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10377 typedef IntrinsicTrait<ElementType> IT;
10379 const size_t M( A.rows() );
10380 const size_t N( B.columns() );
10381 const size_t K( A.columns() );
// Tile extents for the i (rows), j (columns) and k (inner) dimensions.
10383 const size_t iblock( 64UL );
10384 const size_t jblock( 128UL );
10385 const size_t kblock( 128UL );
10387 const IntrinsicType factor(
set( scalar ) );
10389 for(
size_t jj=0UL; jj<N; jj+=jblock )
10391 const size_t jend(
min( jj+jblock, N ) );
10393 for(
size_t ii=0UL; ii<M; ii+=iblock )
10395 const size_t iend(
min( ii+iblock, M ) );
10397 for(
size_t kk=0UL; kk<K; kk+=kblock )
10399 const size_t ktmp(
min( kk+kblock, K ) );
10406 const size_t j2( j+
IT::size*2UL );
10407 const size_t j3( j+
IT::size*3UL );
// Section: two rows of A against four vector-wide column strips of B.
10411 for( ; (i+2UL) <= iend; i+=2UL )
// The k-range is clipped to the current k-tile and, depending on the
// IsUpper/IsLower traits of MT4 and MT5, to the row/column position.
10413 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10414 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10415 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10416 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
10418 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10420 for(
size_t k=kbegin; k<kend; ++k ) {
10421 const IntrinsicType a1(
set( A(i ,k) ) );
10422 const IntrinsicType a2(
set( A(i+1UL,k) ) );
10423 const IntrinsicType b1( B.load(k,j ) );
10424 const IntrinsicType b2( B.load(k,j1) );
10425 const IntrinsicType b3( B.load(k,j2) );
10426 const IntrinsicType b4( B.load(k,j3) );
10427 xmm1 = xmm1 + a1 * b1;
10428 xmm2 = xmm2 + a1 * b2;
10429 xmm3 = xmm3 + a1 * b3;
10430 xmm4 = xmm4 + a1 * b4;
10431 xmm5 = xmm5 + a2 * b1;
10432 xmm6 = xmm6 + a2 * b2;
10433 xmm7 = xmm7 + a2 * b3;
10434 xmm8 = xmm8 + a2 * b4;
10437 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10438 (~C).
store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10439 (~C).
store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
10440 (~C).
store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
10441 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
10442 (~C).
store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
10443 (~C).
store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
10444 (~C).
store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
// Section: one remaining row of A against four column strips of B.
10449 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10450 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10451 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10452 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
10454 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10456 for(
size_t k=kbegin; k<kend; ++k ) {
10457 const IntrinsicType a1(
set( A(i,k) ) );
10458 xmm1 = xmm1 + a1 * B.load(k,j );
10459 xmm2 = xmm2 + a1 * B.load(k,j1);
10460 xmm3 = xmm3 + a1 * B.load(k,j2);
10461 xmm4 = xmm4 + a1 * B.load(k,j3);
10464 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
10465 (~C).
store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10466 (~C).
store( i, j2, (~C).load(i,j2) - xmm3 * factor );
10467 (~C).
store( i, j3, (~C).load(i,j3) - xmm4 * factor );
// Section: four rows of A against two column strips of B.
10477 for( ; (i+4UL) <= iend; i+=4UL )
10479 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10480 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10481 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
10482 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
10484 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10486 for(
size_t k=kbegin; k<kend; ++k ) {
10487 const IntrinsicType a1(
set( A(i ,k) ) );
10488 const IntrinsicType a2(
set( A(i+1UL,k) ) );
10489 const IntrinsicType a3(
set( A(i+2UL,k) ) );
10490 const IntrinsicType a4(
set( A(i+3UL,k) ) );
10491 const IntrinsicType b1( B.load(k,j ) );
10492 const IntrinsicType b2( B.load(k,j1) );
10493 xmm1 = xmm1 + a1 * b1;
10494 xmm2 = xmm2 + a1 * b2;
10495 xmm3 = xmm3 + a2 * b1;
10496 xmm4 = xmm4 + a2 * b2;
10497 xmm5 = xmm5 + a3 * b1;
10498 xmm6 = xmm6 + a3 * b2;
10499 xmm7 = xmm7 + a4 * b1;
10500 xmm8 = xmm8 + a4 * b2;
10503 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10504 (~C).
store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10505 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10506 (~C).
store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10507 (~C).
store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
10508 (~C).
store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
10509 (~C).
store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
10510 (~C).
store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
// Section: two rows of A against two column strips of B.
10513 for( ; (i+2UL) <= iend; i+=2UL )
10515 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10516 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10517 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10518 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
10520 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10522 for(
size_t k=kbegin; k<kend; ++k ) {
10523 const IntrinsicType a1(
set( A(i ,k) ) );
10524 const IntrinsicType a2(
set( A(i+1UL,k) ) );
10525 const IntrinsicType b1( B.load(k,j ) );
10526 const IntrinsicType b2( B.load(k,j1) );
10527 xmm1 = xmm1 + a1 * b1;
10528 xmm2 = xmm2 + a1 * b2;
10529 xmm3 = xmm3 + a2 * b1;
10530 xmm4 = xmm4 + a2 * b2;
10533 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10534 (~C).
store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10535 (~C).
store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10536 (~C).
store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
// Section: one remaining row of A against two column strips of B.
10541 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10542 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10543 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10544 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
10546 IntrinsicType xmm1, xmm2;
10548 for(
size_t k=kbegin; k<kend; ++k ) {
10549 const IntrinsicType a1(
set( A(i,k) ) );
10550 xmm1 = xmm1 + a1 * B.load(k,j );
10551 xmm2 = xmm2 + a1 * B.load(k,j1);
10554 (~C).
store( i, j , (~C).load(i,j ) - xmm1 * factor );
10555 (~C).
store( i, j1, (~C).load(i,j1) - xmm2 * factor );
// Section: row-by-row processing of a single column strip of B.
10561 for(
size_t i=ii; i<iend; ++i )
10563 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10564 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10565 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10566 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
10568 IntrinsicType xmm1;
10570 for(
size_t k=kbegin; k<kend; ++k ) {
10571 const IntrinsicType a1(
set( A(i,k) ) );
10572 xmm1 = xmm1 + a1 * B.load(k,j);
10575 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Vectorized, tiled "large matrix" kernel for the scaled subtraction assignment
// to a column-major target (DenseMatrix<MT3,true>): C -= ( A * B ) * scalar.
//
// \param C      Target matrix; read via load() and written via store().
// \param A      Left-hand side operand; read with vector loads A.load(i,k).
// \param B      Right-hand side operand; single elements B(k,j) are broadcast via set().
// \param scalar Scaling factor; broadcast once into 'factor'.
//
// The iteration space is tiled with iblock=128, jblock=64 and kblock=128. For
// every k-tile [kk,ktmp) the products A*B are accumulated in registers that start
// out default-constructed (assumes a default-constructed IntrinsicType is zero,
// as the sibling kernels rely on - TODO confirm). Only the accumulated products
// are multiplied by 'factor' before being subtracted from the loaded C values.
// C itself must never be multiplied by the scalar: seeding the registers with C
// and scaling the final register would compute ( C - A*B ) * scalar and would
// rescale C once per k-tile.
// NOTE(review): the loop headers over 'i' and the declaration of 'i1' are not
// visible in this listing.
10599 template<
typename MT3
10603 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10604 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10606 typedef IntrinsicTrait<ElementType> IT;
10608 const size_t M( A.rows() );
10609 const size_t N( B.columns() );
10610 const size_t K( A.columns() );
// Tile extents for the i (rows), j (columns) and k (inner) dimensions.
10612 const size_t iblock( 128UL );
10613 const size_t jblock( 64UL );
10614 const size_t kblock( 128UL );
10616 const IntrinsicType factor(
set( scalar ) );
10618 for(
size_t ii=0UL; ii<M; ii+=iblock )
10620 const size_t iend(
min( ii+iblock, M ) );
10622 for(
size_t jj=0UL; jj<N; jj+=jblock )
10624 const size_t jend(
min( jj+jblock, N ) );
10626 for(
size_t kk=0UL; kk<K; kk+=kblock )
10628 const size_t ktmp(
min( kk+kblock, K ) );
10635 const size_t i2( i+
IT::size*2UL );
10636 const size_t i3( i+
IT::size*3UL );
// Section: four row strips of A against two columns (j, j+1) per iteration.
10640 for( ; (j+2UL) <= jend; j+=2UL )
// The k-range is clipped to the current k-tile and, depending on the
// IsUpper/IsLower traits of MT4 and MT5, to the row/column position.
10642 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10643 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10644 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
10645 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10647 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10656 for(
size_t k=kbegin; k<kend; ++k ) {
10657 const IntrinsicType a1( A.load(i ,k) );
10658 const IntrinsicType a2( A.load(i1,k) );
10659 const IntrinsicType a3( A.load(i2,k) );
10660 const IntrinsicType a4( A.load(i3,k) );
10661 const IntrinsicType b1(
set( B(k,j ) ) );
10662 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10663 xmm1 = xmm1 + a1 * b1;
10664 xmm2 = xmm2 + a2 * b1;
10665 xmm3 = xmm3 + a3 * b1;
10666 xmm4 = xmm4 + a4 * b1;
10667 xmm5 = xmm5 + a1 * b2;
10668 xmm6 = xmm6 + a2 * b2;
10669 xmm7 = xmm7 + a3 * b2;
10670 xmm8 = xmm8 + a4 * b2;
10673 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10674 (~C).
store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10675 (~C).
store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
10676 (~C).
store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
10677 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10678 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
10679 (~C).
store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
10680 (~C).
store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
// Section: four row strips of A against one remaining column j.
10685 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10686 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10687 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
10688 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10690 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10695 for(
size_t k=kbegin; k<kend; ++k ) {
10696 const IntrinsicType b1(
set( B(k,j) ) );
10697 xmm1 = xmm1 + A.load(i ,k) * b1;
10698 xmm2 = xmm2 + A.load(i1,k) * b1;
10699 xmm3 = xmm3 + A.load(i2,k) * b1;
10700 xmm4 = xmm4 + A.load(i3,k) * b1;
10703 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
10704 (~C).
store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10705 (~C).
store( i2, j, (~C).load(i2,j) - xmm3 * factor );
10706 (~C).
store( i3, j, (~C).load(i3,j) - xmm4 * factor );
// Section: two row strips of A against four columns (j .. j+3) per iteration.
10716 for( ; (j+4UL) <= jend; j+=4UL )
10718 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10719 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10720 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
10721 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
10723 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10732 for(
size_t k=kbegin; k<kend; ++k ) {
10733 const IntrinsicType a1( A.load(i ,k) );
10734 const IntrinsicType a2( A.load(i1,k) );
10735 const IntrinsicType b1(
set( B(k,j ) ) );
10736 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10737 const IntrinsicType b3(
set( B(k,j+2UL) ) );
10738 const IntrinsicType b4(
set( B(k,j+3UL) ) );
10739 xmm1 = xmm1 + a1 * b1;
10740 xmm2 = xmm2 + a2 * b1;
10741 xmm3 = xmm3 + a1 * b2;
10742 xmm4 = xmm4 + a2 * b2;
10743 xmm5 = xmm5 + a1 * b3;
10744 xmm6 = xmm6 + a2 * b3;
10745 xmm7 = xmm7 + a1 * b4;
10746 xmm8 = xmm8 + a2 * b4;
10749 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10750 (~C).
store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10751 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10752 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10753 (~C).
store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
10754 (~C).
store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
10755 (~C).
store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
10756 (~C).
store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
// Section: two row strips of A against two columns (j, j+1) per iteration.
10759 for( ; (j+2UL) <= jend; j+=2UL )
10761 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10762 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10763 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
10764 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10766 IntrinsicType xmm1, xmm2, xmm3, xmm4;
10771 for(
size_t k=kbegin; k<kend; ++k ) {
10772 const IntrinsicType a1( A.load(i ,k) );
10773 const IntrinsicType a2( A.load(i1,k) );
10774 const IntrinsicType b1(
set( B(k,j ) ) );
10775 const IntrinsicType b2(
set( B(k,j+1UL) ) );
10776 xmm1 = xmm1 + a1 * b1;
10777 xmm2 = xmm2 + a2 * b1;
10778 xmm3 = xmm3 + a1 * b2;
10779 xmm4 = xmm4 + a2 * b2;
10782 (~C).
store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10783 (~C).
store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10784 (~C).
store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10785 (~C).
store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
// Section: two row strips of A against one remaining column j.
10790 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10791 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10792 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
10793 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10795 IntrinsicType xmm1, xmm2;
10798 for(
size_t k=kbegin; k<kend; ++k ) {
10799 const IntrinsicType b1(
set( B(k,j) ) );
10800 xmm1 = xmm1 + A.load(i ,k) * b1;
10801 xmm2 = xmm2 + A.load(i1,k) * b1;
10804 (~C).
store( i , j, (~C).load(i ,j) - xmm1 * factor );
10805 (~C).
store( i1, j, (~C).load(i1,j) - xmm2 * factor );
// Section: column-by-column processing of a single row strip of A.
10811 for(
size_t j=jj; j<jend; ++j )
10813 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10814 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10815 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
10816 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10818 IntrinsicType xmm1;
10820 for(
size_t k=kbegin; k<kend; ++k ) {
10821 const IntrinsicType b1(
set( B(k,j) ) );
10822 xmm1 = xmm1 + A.load(i,k) * b1;
10825 (~C).
store( i, j, (~C).load(i,j) - xmm1 * factor );
// Default "BLAS" kernel for the scaled subtraction assignment.
// Enabled via UseDefaultKernel<MT3,MT4,MT5,ST2>; it forwards all arguments to
// selectLargeSubAssignKernel() instead of calling a BLAS routine.
10848 template<
typename MT3
10852 static inline typename EnableIf< UseDefaultKernel<MT3,MT4,MT5,ST2> >::Type
10853 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10855 selectLargeSubAssignKernel( C, A, B, scalar );
10860 #if BLAZE_BLAS_MODE
// BLAS-based kernel for the scaled subtraction assignment, enabled via
// UseSinglePrecisionKernel<MT3,MT4,MT5,ST2>.
// Three paths are visible: strmm() with A as the triangular left operand,
// strmm() with B as the triangular right operand, and sgemm() otherwise.
// NOTE(review): the declaration of 'tmp' and the statements that follow each
// strmm() call are not visible in this listing.
10874 template<
typename MT3
10878 static inline typename EnableIf< UseSinglePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
10879 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10881 if( IsTriangular<MT4>::value ) {
10883 strmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
10886 else if( IsTriangular<MT5>::value ) {
10888 strmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case: the negated scalar and 1.0F are handed to sgemm()
// (presumably alpha and beta, i.e. C = -scalar*A*B + 1*C - confirm against the wrapper).
10892 sgemm( C, A, B, -scalar, 1.0F );
10899 #if BLAZE_BLAS_MODE
// BLAS-based kernel for the scaled subtraction assignment, enabled via
// UseDoublePrecisionKernel<MT3,MT4,MT5,ST2>.
// Three paths are visible: dtrmm() with A as the triangular left operand,
// dtrmm() with B as the triangular right operand, and dgemm() otherwise.
// NOTE(review): the declaration of 'tmp' and the statements that follow each
// dtrmm() call are not visible in this listing.
10913 template<
typename MT3
10917 static inline typename EnableIf< UseDoublePrecisionKernel<MT3,MT4,MT5,ST2> >::Type
10918 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10920 if( IsTriangular<MT4>::value ) {
10922 dtrmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), scalar );
10925 else if( IsTriangular<MT5>::value ) {
10927 dtrmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), scalar );
// General case: the negated scalar and 1.0 are handed to dgemm()
// (presumably alpha and beta, i.e. C = -scalar*A*B + 1*C - confirm against the wrapper).
10931 dgemm( C, A, B, -scalar, 1.0 );
10938 #if BLAZE_BLAS_MODE
// BLAS-based kernel for the scaled subtraction assignment, enabled via
// UseSinglePrecisionComplexKernel<MT3,MT4,MT5>.
// The scalar is wrapped into complex<float>( scalar, 0.0F ) before it is passed
// to ctrmm() (triangular A or B) or, negated, to cgemm() (general case).
// NOTE(review): the declaration of 'tmp' and the statements that follow each
// ctrmm() call are not visible in this listing.
10952 template<
typename MT3
10956 static inline typename EnableIf< UseSinglePrecisionComplexKernel<MT3,MT4,MT5> >::Type
10957 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10959 if( IsTriangular<MT4>::value ) {
10961 ctrmm( tmp, A, CblasLeft,
10962 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
10963 complex<float>( scalar, 0.0F ) );
10966 else if( IsTriangular<MT5>::value ) {
10968 ctrmm( tmp, B, CblasRight,
10969 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
10970 complex<float>( scalar, 0.0F ) );
10974 cgemm( C, A, B, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
10981 #if BLAZE_BLAS_MODE
// BLAS-based kernel for the scaled subtraction assignment, enabled via
// UseDoublePrecisionComplexKernel<MT3,MT4,MT5>.
// The scalar is wrapped into complex<double>( scalar, 0.0 ) before it is passed
// to ztrmm() (triangular A or B) or, negated, to zgemm() (general case).
// All three paths use complex<double> for the factor: building it as
// complex<float> would round the scalar to single precision before the
// double precision routine sees it.
// NOTE(review): the declaration of 'tmp' and the statements that follow each
// ztrmm() call are not visible in this listing.
10995 template<
typename MT3
10999 static inline typename EnableIf< UseDoublePrecisionComplexKernel<MT3,MT4,MT5> >::Type
11000 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11002 if( IsTriangular<MT4>::value ) {
11004 ztrmm( tmp, A, CblasLeft,
11005 ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ),
11006 complex<double>( scalar, 0.0 ) );
11009 else if( IsTriangular<MT5>::value ) {
11011 ztrmm( tmp, B, CblasRight,
11012 ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ),
11013 complex<double>( scalar, 0.0 ) );
11017 zgemm( C, A, B, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// SMP assignment of a scaled matrix multiplication expression to a dense matrix.
// Enabled via IsEvaluationRequired<MT,MT1,MT2>.
// The left and right operands are taken from rhs.matrix_; an empty target and a
// zero inner dimension (left.columns() == 0) are tested before the general case,
// which forwards to smpAssign( ~lhs, A * B * rhs.scalar_ ).
// NOTE(review): the bodies of the early-exit branches and the definitions of
// 'A' and 'B' are not visible in this listing.
11050 template<
typename MT
11052 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11053 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11060 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11061 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11063 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
11066 else if( left.columns() == 0UL ) {
11081 smpAssign( ~lhs, A * B * rhs.scalar_ );
// SMP assignment of a scaled matrix multiplication expression to a sparse matrix.
// Enabled via IsEvaluationRequired<MT,MT1,MT2>.
// The expression is first evaluated into a temporary whose type is chosen by the
// target's storage order SO: ResultType or OppositeType.
// NOTE(review): the statement that assigns 'tmp' to lhs is not visible in this listing.
11100 template<
typename MT
11102 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11103 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11107 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
11119 const TmpType tmp( rhs );
// SMP addition assignment of a scaled matrix multiplication expression to a
// dense matrix. Enabled via IsEvaluationRequired<MT,MT1,MT2>.
// The operands are taken from rhs.matrix_; an empty target or a zero inner
// dimension (left.columns() == 0) is handled by a single early branch.
// NOTE(review): the body of that branch and the general case are not visible
// in this listing.
11139 template<
typename MT
11141 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11142 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11149 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11150 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11152 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled matrix multiplication expression to a
// dense matrix. Enabled via IsEvaluationRequired<MT,MT1,MT2>.
// The operands are taken from rhs.matrix_; an empty target or a zero inner
// dimension (left.columns() == 0) is handled by a single early branch.
// NOTE(review): the body of that branch and the general case are not visible
// in this listing.
11189 template<
typename MT
11191 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
11192 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11199 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
11200 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
11202 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Free function returning a TDMatDMatMultExpr<T1,T2> expression object
// (presumably the multiplication operator for a column-major and a row-major
// dense matrix - its name and parameters are not visible in this listing).
// Throws std::invalid_argument with the message below; the size check that
// guards the throw is not visible here.
11284 template<
typename T1
11286 inline const TDMatDMatMultExpr<T1,T2>
11292 throw std::invalid_argument(
"Matrix sizes do not match" );
// Type trait specializations parameterised on the two operand types MT1 and MT2.
// NOTE(review): the 'struct ... <TDMatDMatMultExpr<MT1,MT2> >' header lines are
// not visible in this listing; the trait names given below are inferred from
// the visible base classes and need to be confirmed.

// Presumably the Rows specialization (only the template header is visible).
11309 template<
typename MT1,
typename MT2 >
// Presumably the Columns specialization: derives from Columns<MT2>.
11327 template<
typename MT1,
typename MT2 >
11329 :
public Columns<MT2>
// Presumably IsLower: true when both operands are lower.
11345 template<
typename MT1,
typename MT2 >
11347 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
// Presumably IsUniLower: true when both operands are unilower.
11363 template<
typename MT1,
typename MT2 >
11365 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
// Presumably IsStrictlyLower: true when one operand is strictly lower and the
// other one is lower.
11381 template<
typename MT1,
typename MT2 >
11383 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
11384 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
// Presumably IsUpper: true when both operands are upper.
11400 template<
typename MT1,
typename MT2 >
11402 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
// Presumably IsUniUpper: true when both operands are uniupper.
11418 template<
typename MT1,
typename MT2 >
11420 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
// Presumably IsStrictlyUpper: true when one operand is strictly upper and the
// other one is upper.
11436 template<
typename MT1,
typename MT2 >
11438 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
11439 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
// Expression trait specializations for multiplying the matrix product with a
// vector. Each one selects its result 'Type' only if the operands have the
// expected categories (dense, column-/row-major, dense or sparse vector,
// column or row vector); otherwise 'Type' is INVALID_TYPE.
// NOTE(review): the 'struct' header lines naming each trait are not visible in
// this listing.

// Product times dense column vector: MT2 * VT is formed first, then MT1 is applied.
11455 template<
typename MT1,
typename MT2,
typename VT >
11460 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11461 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11462 IsDenseVector<VT>::value && IsColumnVector<VT>::value
11463 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
11464 , INVALID_TYPE >::Type Type;
// Product times sparse column vector: MT2 * VT is formed first, then MT1 is applied.
11473 template<
typename MT1,
typename MT2,
typename VT >
11478 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11479 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
11480 IsSparseVector<VT>::value && IsColumnVector<VT>::value
11481 ,
typename TDMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
11482 , INVALID_TYPE >::Type Type;
// Dense row vector times product: VT * MT1 is formed first, then MT2 is applied.
11491 template<
typename VT,
typename MT1,
typename MT2 >
11496 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
11497 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11498 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11499 ,
typename TDVecDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11500 , INVALID_TYPE >::Type Type;
// Sparse row vector times product: VT * MT1 is formed first, then MT2 is applied.
11509 template<
typename VT,
typename MT1,
typename MT2 >
11514 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
11515 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
11516 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
11517 ,
typename TDVecDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
11518 , INVALID_TYPE >::Type Type;
// Expression trait specializations for views on the matrix product.
// NOTE(review): the 'struct' header lines naming each trait are not visible in
// this listing; the names below are inferred from the nested traits used.

// Presumably SubmatrixExprTrait: product of the submatrices of both operands,
// using the alignment flag AF for both.
11527 template<
typename MT1,
typename MT2,
bool AF >
11532 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
11533 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Presumably RowExprTrait: a row of MT1 multiplied with MT2.
11542 template<
typename MT1,
typename MT2 >
11547 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Presumably ColumnExprTrait: MT1 multiplied with a column of MT2.
11556 template<
typename MT1,
typename MT2 >
11561 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1649
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:310
Constraint on the data type.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:8247
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:484
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:292
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:264
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:209
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:483
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:821
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:430
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2507
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:261
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:142
Header file for the And class template.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:259
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:699
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Header file for the IsUniLower type trait.
Header file for the IsFloat type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:454
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:400
Constraint on the data type.
const size_t TDMATDMATMULT_THRESHOLD
Column-major dense matrix/row-major dense matrix multiplication threshold. This setting specifies the ...
Definition: Thresholds.h:159
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:125
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:291
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:410
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > > >::Type store(T *address, const sse_int16_t &value)
Aligned store of a vector of 2-byte integral values.
Definition: Store.h:80
Header file for the TDMatSVecMultExprTrait class template.
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1602
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:474
Header file for the DenseMatrix base class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the Columns type trait.
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:332
const size_t SMP_TDMATDMATMULT_THRESHOLD
SMP column-major dense matrix/row-major dense matrix multiplication threshold. This threshold specifie...
Definition: Thresholds.h:880
Header file for the Not class template.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:304
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDMatMultExpr.h:296
Header file for the IsLower type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
Header file for BLAS level 3 functions.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2505
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:297
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, sse_int16_t >::Type load(const T *address)
Loads a vector of 2-byte integral values.
Definition: Load.h:79
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:293
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:144
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:150
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:442
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:139
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:420
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:307
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:298
Header file for the HasMutableDataAccess type trait.
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, sse_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:283
Header file for the IsDenseVector type trait.
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:301
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:464
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:140
Header file for the IsRowMajorMatrix type trait.
Header file for the IsComputation type trait class.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:143
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:260
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:141
Header file for the TDMatDVecMultExprTrait class template.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2502
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:347
Header file for the complex data type.
Header file for the IsUpper type trait.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:295
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:294
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:849