35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
140 template<
typename MT1
142 class TDMatTDMatMultExpr :
public DenseMatrix< TDMatTDMatMultExpr<MT1,MT2>, true >
143 ,
private MatMatMultExpr
144 ,
private Computation
174 template<
typename T1,
typename T2,
typename T3 >
175 struct CanExploitSymmetry {
176 enum { value = IsRowMajorMatrix<T1>::value &&
177 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
188 template<
typename T1,
typename T2,
typename T3 >
189 struct IsEvaluationRequired {
190 enum { value = ( evaluateLeft || evaluateRight ) &&
191 CanExploitSymmetry<T1,T2,T3>::value };
201 template<
typename T1,
typename T2,
typename T3 >
202 struct UseBlasKernel {
204 HasMutableDataAccess<T1>::value &&
205 HasConstDataAccess<T2>::value &&
206 HasConstDataAccess<T3>::value &&
207 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
208 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
209 IsBlasCompatible<typename T1::ElementType>::value &&
210 IsBlasCompatible<typename T2::ElementType>::value &&
211 IsBlasCompatible<typename T3::ElementType>::value &&
212 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
213 IsSame< typename T1::ElementType, typename T3::ElementType >::value };
223 template<
typename T1,
typename T2,
typename T3 >
224 struct UseVectorizedDefaultKernel {
226 !IsDiagonal<T2>::value &&
227 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
228 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
229 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
230 IntrinsicTrait<typename T1::ElementType>::addition &&
231 IntrinsicTrait<typename T1::ElementType>::subtraction &&
232 IntrinsicTrait<typename T1::ElementType>::multiplication };
264 MT1::vectorizable && MT2::vectorizable &&
270 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
271 !evaluateRight && MT2::smpAssignable };
314 :(
lhs_.columns() ) ) );
316 if(
lhs_.columns() == 0UL ||
326 const size_t knum( kend - kbegin );
327 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
329 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
331 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
333 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
351 inline ReturnType
at(
size_t i,
size_t j )
const {
352 if( i >=
lhs_.rows() ) {
355 if( j >=
rhs_.columns() ) {
378 return rhs_.columns();
408 template<
typename T >
410 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
420 template<
typename T >
422 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
432 return lhs_.isAligned() &&
rhs_.isAligned();
443 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
444 (
columns() > SMP_TDMATTDMATMULT_THRESHOLD );
467 template<
typename MT
477 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
480 else if( rhs.lhs_.columns() == 0UL ) {
485 LT A(
serial( rhs.lhs_ ) );
486 RT B(
serial( rhs.rhs_ ) );
495 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
511 template<
typename MT3
514 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
517 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
518 selectSmallAssignKernel( C, A, B );
520 selectBlasAssignKernel( C, A, B );
539 template<
typename MT3
542 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
543 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
545 const size_t M( A.rows() );
546 const size_t N( B.columns() );
547 const size_t K( A.columns() );
549 for(
size_t j=0UL; j<N; ++j )
551 const size_t kbegin( ( IsLower<MT5>::value )
552 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
554 const size_t kend( ( IsUpper<MT5>::value )
555 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
559 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
560 for(
size_t i=0UL; i<M; ++i ) {
567 const size_t ibegin( ( IsLower<MT4>::value )
568 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
570 const size_t iend( ( IsUpper<MT4>::value )
571 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
575 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
576 for(
size_t i=0UL; i<ibegin; ++i ) {
580 else if( IsStrictlyLower<MT4>::value ) {
583 for(
size_t i=ibegin; i<iend; ++i ) {
584 C(i,j) = A(i,kbegin) * B(kbegin,j);
586 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
587 for(
size_t i=iend; i<M; ++i ) {
591 else if( IsStrictlyUpper<MT4>::value ) {
596 for(
size_t k=kbegin+1UL; k<kend; ++k )
598 const size_t ibegin( ( IsLower<MT4>::value )
599 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
601 const size_t iend( ( IsUpper<MT4>::value )
602 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
606 for(
size_t i=ibegin; i<iend; ++i ) {
607 C(i,j) += A(i,k) * B(k,j);
609 if( IsUpper<MT4>::value ) {
610 C(iend,j) = A(iend,k) * B(k,j);
632 template<
typename MT3
635 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
636 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
640 const size_t M( A.rows() );
641 const size_t N( B.columns() );
643 for(
size_t j=0UL; j<N; ++j )
645 const size_t ibegin( ( IsLower<MT4>::value )
646 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
648 const size_t iend( ( IsUpper<MT4>::value )
649 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
653 if( IsLower<MT4>::value ) {
654 for(
size_t i=0UL; i<ibegin; ++i ) {
658 for(
size_t i=ibegin; i<iend; ++i ) {
659 C(i,j) = A(i,j) * B(j,j);
661 if( IsUpper<MT4>::value ) {
662 for(
size_t i=iend; i<M; ++i ) {
685 template<
typename MT3
688 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
689 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
693 const size_t M( A.rows() );
694 const size_t N( B.columns() );
696 for(
size_t j=0UL; j<N; ++j )
698 const size_t ibegin( ( IsLower<MT5>::value )
699 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
701 const size_t iend( ( IsUpper<MT5>::value )
702 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
706 if( IsLower<MT4>::value ) {
707 for(
size_t i=0UL; i<ibegin; ++i ) {
711 for(
size_t i=ibegin; i<iend; ++i ) {
712 C(i,j) = A(i,i) * B(i,j);
714 if( IsUpper<MT4>::value ) {
715 for(
size_t i=iend; i<M; ++i ) {
738 template<
typename MT3
741 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
742 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
748 for(
size_t i=0UL; i<A.rows(); ++i ) {
749 C(i,i) = A(i,i) * B(i,i);
769 template<
typename MT3
772 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
773 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
775 selectDefaultAssignKernel( C, A, B );
795 template<
typename MT3
798 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
799 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
806 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
808 assign( ~C, A * tmp );
810 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
812 assign( ~C, tmp * B );
814 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
816 assign( ~C, A * tmp );
820 assign( ~C, tmp * B );
841 template<
typename MT3
844 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
845 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
847 typedef IntrinsicTrait<ElementType> IT;
849 const size_t M( A.rows() );
850 const size_t N( B.columns() );
851 const size_t K( A.columns() );
853 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
855 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
861 for(
size_t j=0UL; j<N; ++j )
863 const size_t kbegin( ( IsLower<MT5>::value )
864 ?( ( IsUpper<MT4>::value )
865 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
866 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
867 :( IsUpper<MT4>::value ? i : 0UL ) );
868 const size_t kend( ( IsUpper<MT5>::value )
869 ?( ( IsLower<MT4>::value )
870 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
871 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
872 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
874 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
876 for(
size_t k=kbegin; k<kend; ++k ) {
877 const IntrinsicType b1(
set( B(k,j) ) );
878 xmm1 = xmm1 + A.load(i ,k) * b1;
879 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
880 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
881 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
882 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
883 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
884 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
885 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
888 (~C).store( i , j, xmm1 );
890 (~C).store( i+
IT::size*2UL, j, xmm3 );
891 (~C).store( i+
IT::size*3UL, j, xmm4 );
892 (~C).store( i+
IT::size*4UL, j, xmm5 );
893 (~C).store( i+
IT::size*5UL, j, xmm6 );
894 (~C).store( i+
IT::size*6UL, j, xmm7 );
895 (~C).store( i+
IT::size*7UL, j, xmm8 );
903 for( ; (j+2UL) <= N; j+=2UL )
905 const size_t kbegin( ( IsLower<MT5>::value )
906 ?( ( IsUpper<MT4>::value )
907 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
908 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
909 :( IsUpper<MT4>::value ? i : 0UL ) );
910 const size_t kend( ( IsUpper<MT5>::value )
911 ?( ( IsLower<MT4>::value )
912 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
913 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
914 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
916 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
918 for(
size_t k=kbegin; k<kend; ++k ) {
919 const IntrinsicType a1( A.load(i ,k) );
920 const IntrinsicType a2( A.load(i+
IT::size ,k) );
921 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
922 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
923 const IntrinsicType b1(
set( B(k,j ) ) );
924 const IntrinsicType b2(
set( B(k,j+1UL) ) );
925 xmm1 = xmm1 + a1 * b1;
926 xmm2 = xmm2 + a2 * b1;
927 xmm3 = xmm3 + a3 * b1;
928 xmm4 = xmm4 + a4 * b1;
929 xmm5 = xmm5 + a1 * b2;
930 xmm6 = xmm6 + a2 * b2;
931 xmm7 = xmm7 + a3 * b2;
932 xmm8 = xmm8 + a4 * b2;
935 (~C).store( i , j , xmm1 );
936 (~C).store( i+
IT::size , j , xmm2 );
937 (~C).store( i+
IT::size*2UL, j , xmm3 );
938 (~C).store( i+
IT::size*3UL, j , xmm4 );
939 (~C).store( i , j+1UL, xmm5 );
940 (~C).store( i+
IT::size , j+1UL, xmm6 );
941 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 );
942 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 );
947 const size_t kbegin( ( IsLower<MT5>::value )
948 ?( ( IsUpper<MT4>::value )
949 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
950 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
951 :( IsUpper<MT4>::value ? i : 0UL ) );
952 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
954 IntrinsicType xmm1, xmm2, xmm3, xmm4;
956 for(
size_t k=kbegin; k<kend; ++k ) {
957 const IntrinsicType b1(
set( B(k,j) ) );
958 xmm1 = xmm1 + A.load(i ,k) * b1;
959 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
960 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
961 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
964 (~C).store( i , j, xmm1 );
966 (~C).store( i+
IT::size*2UL, j, xmm3 );
967 (~C).store( i+
IT::size*3UL, j, xmm4 );
975 for( ; (j+2UL) <= N; j+=2UL )
977 const size_t kbegin( ( IsLower<MT5>::value )
978 ?( ( IsUpper<MT4>::value )
979 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
980 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
981 :( IsUpper<MT4>::value ? i : 0UL ) );
982 const size_t kend( ( IsUpper<MT5>::value )
983 ?( ( IsLower<MT4>::value )
984 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
985 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
986 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
988 IntrinsicType xmm1, xmm2, xmm3, xmm4;
990 for(
size_t k=kbegin; k<kend; ++k ) {
991 const IntrinsicType a1( A.load(i ,k) );
992 const IntrinsicType a2( A.load(i+
IT::size,k) );
993 const IntrinsicType b1(
set( B(k,j ) ) );
994 const IntrinsicType b2(
set( B(k,j+1UL) ) );
995 xmm1 = xmm1 + a1 * b1;
996 xmm2 = xmm2 + a2 * b1;
997 xmm3 = xmm3 + a1 * b2;
998 xmm4 = xmm4 + a2 * b2;
1001 (~C).store( i , j , xmm1 );
1002 (~C).store( i+
IT::size, j , xmm2 );
1003 (~C).store( i , j+1UL, xmm3 );
1004 (~C).store( i+
IT::size, j+1UL, xmm4 );
1009 const size_t kbegin( ( IsLower<MT5>::value )
1010 ?( ( IsUpper<MT4>::value )
1011 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1012 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1013 :( IsUpper<MT4>::value ? i : 0UL ) );
1014 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
1016 IntrinsicType xmm1, xmm2;
1018 for(
size_t k=kbegin; k<kend; ++k ) {
1019 const IntrinsicType b1(
set( B(k,j) ) );
1020 xmm1 = xmm1 + A.load(i ,k) * b1;
1021 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
1024 (~C).store( i , j, xmm1 );
1033 for( ; (j+2UL) <= N; j+=2UL )
1035 const size_t kbegin( ( IsLower<MT5>::value )
1036 ?( ( IsUpper<MT4>::value )
1037 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1038 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1039 :( IsUpper<MT4>::value ? i : 0UL ) );
1040 const size_t kend( ( IsUpper<MT5>::value )
1041 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1044 IntrinsicType xmm1, xmm2;
1046 for(
size_t k=kbegin; k<kend; ++k ) {
1047 const IntrinsicType a1( A.load(i,k) );
1048 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1049 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1052 (~C).store( i, j , xmm1 );
1053 (~C).store( i, j+1UL, xmm2 );
1058 const size_t kbegin( ( IsLower<MT5>::value )
1059 ?( ( IsUpper<MT4>::value )
1060 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1061 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1062 :( IsUpper<MT4>::value ? i : 0UL ) );
1066 for(
size_t k=kbegin; k<K; ++k ) {
1067 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1070 (~C).store( i, j, xmm1 );
1074 for( ; remainder && i<M; ++i )
1078 for( ; (j+2UL) <= N; j+=2UL )
1080 const size_t kbegin( ( IsLower<MT5>::value )
1081 ?( ( IsUpper<MT4>::value )
1082 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1083 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1084 :( IsUpper<MT4>::value ? i : 0UL ) );
1085 const size_t kend( ( IsUpper<MT5>::value )
1086 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1092 for(
size_t k=kbegin; k<kend; ++k ) {
1093 value1 += A(i,k) * B(k,j );
1094 value2 += A(i,k) * B(k,j+1UL);
1097 (~C)(i,j ) = value1;
1098 (~C)(i,j+1UL) = value2;
1103 const size_t kbegin( ( IsLower<MT5>::value )
1104 ?( ( IsUpper<MT4>::value )
1105 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1106 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1107 :( IsUpper<MT4>::value ? i : 0UL ) );
1111 for(
size_t k=kbegin; k<K; ++k ) {
1112 value += A(i,k) * B(k,j);
1136 template<
typename MT3
1139 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1140 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1142 selectDefaultAssignKernel( C, A, B );
1162 template<
typename MT3
1165 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1166 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1168 selectSmallAssignKernel( ~C, A, B );
1188 template<
typename MT3
1191 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1192 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1194 typedef IntrinsicTrait<ElementType> IT;
1196 const size_t M( A.rows() );
1197 const size_t N( B.columns() );
1198 const size_t K( A.columns() );
1200 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1202 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
1204 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
1206 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1209 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
1211 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
1213 for(
size_t j=jj; j<jend; ++j ) {
1214 for(
size_t i=ii; i<iend; ++i ) {
1219 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
1221 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
1233 for( ; (j+2UL) <= jend; j+=2UL )
1235 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1236 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1237 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1238 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1240 IntrinsicType xmm1( (~C).load(i ,j ) );
1241 IntrinsicType xmm2( (~C).load(i1,j ) );
1242 IntrinsicType xmm3( (~C).load(i2,j ) );
1243 IntrinsicType xmm4( (~C).load(i3,j ) );
1244 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
1245 IntrinsicType xmm6( (~C).load(i1,j+1UL) );
1246 IntrinsicType xmm7( (~C).load(i2,j+1UL) );
1247 IntrinsicType xmm8( (~C).load(i3,j+1UL) );
1249 for(
size_t k=kbegin; k<kend; ++k ) {
1250 const IntrinsicType a1( A.load(i ,k) );
1251 const IntrinsicType a2( A.load(i1,k) );
1252 const IntrinsicType a3( A.load(i2,k) );
1253 const IntrinsicType a4( A.load(i3,k) );
1254 const IntrinsicType b1(
set( B(k,j ) ) );
1255 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1256 xmm1 = xmm1 + a1 * b1;
1257 xmm2 = xmm2 + a2 * b1;
1258 xmm3 = xmm3 + a3 * b1;
1259 xmm4 = xmm4 + a4 * b1;
1260 xmm5 = xmm5 + a1 * b2;
1261 xmm6 = xmm6 + a2 * b2;
1262 xmm7 = xmm7 + a3 * b2;
1263 xmm8 = xmm8 + a4 * b2;
1266 (~C).store( i , j , xmm1 );
1267 (~C).store( i1, j , xmm2 );
1268 (~C).store( i2, j , xmm3 );
1269 (~C).store( i3, j , xmm4 );
1270 (~C).store( i , j+1UL, xmm5 );
1271 (~C).store( i1, j+1UL, xmm6 );
1272 (~C).store( i2, j+1UL, xmm7 );
1273 (~C).store( i3, j+1UL, xmm8 );
1278 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1279 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1280 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
1281 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1283 IntrinsicType xmm1( (~C).load(i ,j) );
1284 IntrinsicType xmm2( (~C).load(i1,j) );
1285 IntrinsicType xmm3( (~C).load(i2,j) );
1286 IntrinsicType xmm4( (~C).load(i3,j) );
1288 for(
size_t k=kbegin; k<kend; ++k ) {
1289 const IntrinsicType b1(
set( B(k,j) ) );
1290 xmm1 = xmm1 + A.load(i ,k) * b1;
1291 xmm2 = xmm2 + A.load(i1,k) * b1;
1292 xmm3 = xmm3 + A.load(i2,k) * b1;
1293 xmm4 = xmm4 + A.load(i3,k) * b1;
1296 (~C).store( i , j, xmm1 );
1297 (~C).store( i1, j, xmm2 );
1298 (~C).store( i2, j, xmm3 );
1299 (~C).store( i3, j, xmm4 );
1309 for( ; (j+4UL) <= jend; j+=4UL )
1311 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1312 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1313 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1314 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1316 IntrinsicType xmm1( (~C).load(i ,j ) );
1317 IntrinsicType xmm2( (~C).load(i1,j ) );
1318 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1319 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
1320 IntrinsicType xmm5( (~C).load(i ,j+2UL) );
1321 IntrinsicType xmm6( (~C).load(i1,j+2UL) );
1322 IntrinsicType xmm7( (~C).load(i ,j+3UL) );
1323 IntrinsicType xmm8( (~C).load(i1,j+3UL) );
1325 for(
size_t k=kbegin; k<kend; ++k ) {
1326 const IntrinsicType a1( A.load(i ,k) );
1327 const IntrinsicType a2( A.load(i1,k) );
1328 const IntrinsicType b1(
set( B(k,j ) ) );
1329 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1330 const IntrinsicType b3(
set( B(k,j+2UL) ) );
1331 const IntrinsicType b4(
set( B(k,j+3UL) ) );
1332 xmm1 = xmm1 + a1 * b1;
1333 xmm2 = xmm2 + a2 * b1;
1334 xmm3 = xmm3 + a1 * b2;
1335 xmm4 = xmm4 + a2 * b2;
1336 xmm5 = xmm5 + a1 * b3;
1337 xmm6 = xmm6 + a2 * b3;
1338 xmm7 = xmm7 + a1 * b4;
1339 xmm8 = xmm8 + a2 * b4;
1342 (~C).store( i , j , xmm1 );
1343 (~C).store( i1, j , xmm2 );
1344 (~C).store( i , j+1UL, xmm3 );
1345 (~C).store( i1, j+1UL, xmm4 );
1346 (~C).store( i , j+2UL, xmm5 );
1347 (~C).store( i1, j+2UL, xmm6 );
1348 (~C).store( i , j+3UL, xmm7 );
1349 (~C).store( i1, j+3UL, xmm8 );
1352 for( ; (j+2UL) <= jend; j+=2UL )
1354 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1355 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1356 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1357 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1359 IntrinsicType xmm1( (~C).load(i ,j ) );
1360 IntrinsicType xmm2( (~C).load(i1,j ) );
1361 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
1362 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
1364 for(
size_t k=kbegin; k<kend; ++k ) {
1365 const IntrinsicType a1( A.load(i ,k) );
1366 const IntrinsicType a2( A.load(i1,k) );
1367 const IntrinsicType b1(
set( B(k,j ) ) );
1368 const IntrinsicType b2(
set( B(k,j+1UL) ) );
1369 xmm1 = xmm1 + a1 * b1;
1370 xmm2 = xmm2 + a2 * b1;
1371 xmm3 = xmm3 + a1 * b2;
1372 xmm4 = xmm4 + a2 * b2;
1375 (~C).store( i , j , xmm1 );
1376 (~C).store( i1, j , xmm2 );
1377 (~C).store( i , j+1UL, xmm3 );
1378 (~C).store( i1, j+1UL, xmm4 );
1383 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1384 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1385 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
1386 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1388 IntrinsicType xmm1( (~C).load(i ,j) );
1389 IntrinsicType xmm2( (~C).load(i1,j) );
1391 for(
size_t k=kbegin; k<kend; ++k ) {
1392 const IntrinsicType b1(
set( B(k,j) ) );
1393 xmm1 = xmm1 + A.load(i ,k) * b1;
1394 xmm2 = xmm2 + A.load(i1,k) * b1;
1397 (~C).store( i , j, xmm1 );
1398 (~C).store( i1, j, xmm2 );
1404 for(
size_t j=jj; j<jend; ++j )
1406 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1407 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1408 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
1409 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1411 IntrinsicType xmm1( (~C).load(i,j) );
1413 for(
size_t k=kbegin; k<kend; ++k ) {
1414 const IntrinsicType b1(
set( B(k,j) ) );
1415 xmm1 = xmm1 + A.load(i,k) * b1;
1418 (~C).store( i, j, xmm1 );
1422 for( ; remainder && i<iend; ++i )
1424 for(
size_t j=jj; j<jend; ++j )
1426 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1427 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1428 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
1429 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1431 ElementType value( (~C)(i,j) );
1433 for(
size_t k=kbegin; k<kend; ++k ) {
1434 value += A(i,k) * B(k,j);
1461 template<
typename MT3
1464 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1465 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1467 selectLargeAssignKernel( C, A, B );
1487 template<
typename MT3
1490 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1491 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1495 if( IsTriangular<MT4>::value ) {
1497 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1499 else if( IsTriangular<MT5>::value ) {
1501 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1504 gemm( C, A, B, ET(1), ET(0) );
1524 template<
typename MT
1526 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1531 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
1543 const TmpType tmp(
serial( rhs ) );
1544 assign( ~lhs, tmp );
1564 template<
typename MT >
1565 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1575 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1576 assign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
1577 else if( IsSymmetric<MT1>::value )
1578 assign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
1580 assign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
1598 template<
typename MT
1600 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1608 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1612 LT A(
serial( rhs.lhs_ ) );
1613 RT B(
serial( rhs.rhs_ ) );
1622 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1638 template<
typename MT3
1641 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1643 if( ( IsDiagonal<MT4>::value ) ||
1644 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1645 selectSmallAddAssignKernel( C, A, B );
1647 selectBlasAddAssignKernel( C, A, B );
1666 template<
typename MT3
1669 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1670 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1672 const size_t M( A.rows() );
1673 const size_t N( B.columns() );
1674 const size_t K( A.columns() );
1676 for(
size_t j=0UL; j<N; ++j )
1678 const size_t kbegin( ( IsLower<MT5>::value )
1679 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1681 const size_t kend( ( IsUpper<MT5>::value )
1682 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1686 for(
size_t k=kbegin; k<kend; ++k )
1688 const size_t ibegin( ( IsLower<MT4>::value )
1689 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
1691 const size_t iend( ( IsUpper<MT4>::value )
1692 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
1696 const size_t inum( iend - ibegin );
1697 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1699 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1700 C(i ,j) += A(i ,k) * B(k,j);
1701 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1704 C(ipos,j) += A(ipos,k) * B(k,j);
1726 template<
typename MT3
1729 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1730 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1734 const size_t M( A.rows() );
1735 const size_t N( B.columns() );
1737 for(
size_t j=0UL; j<N; ++j )
1739 const size_t ibegin( ( IsLower<MT4>::value )
1740 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
1742 const size_t iend( ( IsUpper<MT4>::value )
1743 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
1747 const size_t inum( iend - ibegin );
1748 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1750 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1751 C(i ,j) += A(i ,j) * B(j,j);
1752 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1755 C(ipos,j) += A(ipos,j) * B(j,j);
1776 template<
typename MT3
1779 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1780 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1784 const size_t M( A.rows() );
1785 const size_t N( B.columns() );
1787 for(
size_t j=0UL; j<N; ++j )
1789 const size_t ibegin( ( IsLower<MT5>::value )
1790 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
1792 const size_t iend( ( IsUpper<MT5>::value )
1793 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
1797 const size_t inum( iend - ibegin );
1798 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1800 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1801 C(i ,j) += A(i ,i ) * B(i ,j);
1802 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
1805 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
1826 template<
typename MT3
1829 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
1830 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1834 for(
size_t i=0UL; i<A.rows(); ++i ) {
1835 C(i,i) += A(i,i) * B(i,i);
1855 template<
typename MT3
1858 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1859 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1861 selectDefaultAddAssignKernel( C, A, B );
1881 template<
typename MT3
1884 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1885 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1892 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1894 addAssign( ~C, A * tmp );
1896 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1898 addAssign( ~C, tmp * B );
1900 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
1902 addAssign( ~C, A * tmp );
1906 addAssign( ~C, tmp * B );
// "Small matrix" addition-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>) when the vectorized default kernel is usable.
// Computes C += A * B. Rows are processed in tiles of 8, 4, 2 and 1 intrinsic
// vectors (IT::size elements each), with A and C accessed through load()/store()
// and each B(k,j) broadcast with set(). Rows that do not fill a whole vector
// are handled by a scalar loop at the end.
// NOTE(review): this view omits many original lines (braces, the row-loop
// headers of the individual tiles, 'if' guards, ':' fallback branches). The
// comments below describe only what the visible lines establish.
1927 template<
typename MT3
1930 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1931 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1933 typedef IntrinsicTrait<ElementType> IT;
1935 const size_t M( A.rows() );
1936 const size_t N( B.columns() );
1937 const size_t K( A.columns() );
// A scalar remainder pass is needed unless both C (MT3) and A (MT4) are padded.
1939 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With a remainder, ipos is M with the low bits masked off, i.e. M rounded
// down to a multiple of IT::size (assuming IT::size is a power of two).
1941 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// ---- Tile: 8 vectors of rows x 1 column ----
// (enclosing row-loop header not visible in this view)
1947 for(
size_t j=0UL; j<N; ++j )
// Clip the k range using the structure traits of A (MT4) and B (MT5):
// IsUpper<MT4> -> k starts no earlier than i; IsLower<MT4> -> k ends by the
// end of this row tile (i+IT::size*8), clamped to K; IsLower/IsUpper<MT5>
// bound k relative to the column index j.
1949 const size_t kbegin( ( IsLower<MT5>::value )
1950 ?( ( IsUpper<MT4>::value )
1951 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1952 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1953 :( IsUpper<MT4>::value ? i : 0UL ) );
1954 const size_t kend( ( IsUpper<MT5>::value )
1955 ?( ( IsLower<MT4>::value )
1956 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1957 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1958 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Start the accumulators from the current contents of C (this is '+=').
1960 IntrinsicType xmm1( (~C).load(i ,j) );
1961 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
1962 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
1963 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
1964 IntrinsicType xmm5( (~C).load(i+
IT::size*4UL,j) );
1965 IntrinsicType xmm6( (~C).load(i+
IT::size*5UL,j) );
1966 IntrinsicType xmm7( (~C).load(i+
IT::size*6UL,j) );
1967 IntrinsicType xmm8( (~C).load(i+
IT::size*7UL,j) );
// Accumulate A(:,k) * B(k,j); B(k,j) is broadcast once per k.
1969 for(
size_t k=kbegin; k<kend; ++k ) {
1970 const IntrinsicType b1(
set( B(k,j) ) );
1971 xmm1 = xmm1 + A.load(i ,k) * b1;
1972 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
1973 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
1974 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
1975 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
1976 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
1977 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
1978 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Write the eight accumulators back to C.
1981 (~C).store( i , j, xmm1 );
1982 (~C).store( i+
IT::size , j, xmm2 );
1983 (~C).store( i+
IT::size*2UL, j, xmm3 );
1984 (~C).store( i+
IT::size*3UL, j, xmm4 );
1985 (~C).store( i+
IT::size*4UL, j, xmm5 );
1986 (~C).store( i+
IT::size*5UL, j, xmm6 );
1987 (~C).store( i+
IT::size*6UL, j, xmm7 );
1988 (~C).store( i+
IT::size*7UL, j, xmm8 );
// ---- Tile: 4 vectors of rows x 2 columns (j, j+1) ----
1996 for( ; (j+2UL) <= N; j+=2UL )
1998 const size_t kbegin( ( IsLower<MT5>::value )
1999 ?( ( IsUpper<MT4>::value )
2000 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2001 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2002 :( IsUpper<MT4>::value ? i : 0UL ) );
2003 const size_t kend( ( IsUpper<MT5>::value )
2004 ?( ( IsLower<MT4>::value )
2005 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2006 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2007 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..4 hold column j, xmm5..8 hold column j+1.
2009 IntrinsicType xmm1( (~C).load(i ,j ) );
2010 IntrinsicType xmm2( (~C).load(i+
IT::size ,j ) );
2011 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j ) );
2012 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j ) );
2013 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
2014 IntrinsicType xmm6( (~C).load(i+
IT::size ,j+1UL) );
2015 IntrinsicType xmm7( (~C).load(i+
IT::size*2UL,j+1UL) );
2016 IntrinsicType xmm8( (~C).load(i+
IT::size*3UL,j+1UL) );
// Each vector of A is loaded once per k and reused for both columns.
2018 for(
size_t k=kbegin; k<kend; ++k ) {
2019 const IntrinsicType a1( A.load(i ,k) );
2020 const IntrinsicType a2( A.load(i+
IT::size ,k) );
2021 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
2022 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
2023 const IntrinsicType b1(
set( B(k,j ) ) );
2024 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2025 xmm1 = xmm1 + a1 * b1;
2026 xmm2 = xmm2 + a2 * b1;
2027 xmm3 = xmm3 + a3 * b1;
2028 xmm4 = xmm4 + a4 * b1;
2029 xmm5 = xmm5 + a1 * b2;
2030 xmm6 = xmm6 + a2 * b2;
2031 xmm7 = xmm7 + a3 * b2;
2032 xmm8 = xmm8 + a4 * b2;
2035 (~C).store( i , j , xmm1 );
2036 (~C).store( i+
IT::size , j , xmm2 );
2037 (~C).store( i+
IT::size*2UL, j , xmm3 );
2038 (~C).store( i+
IT::size*3UL, j , xmm4 );
2039 (~C).store( i , j+1UL, xmm5 );
2040 (~C).store( i+
IT::size , j+1UL, xmm6 );
2041 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 );
2042 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 );
// ---- Tile: 4 vectors of rows x 1 column ----
// Presumably the last odd column, guarded by a check on an elided line.
2047 const size_t kbegin( ( IsLower<MT5>::value )
2048 ?( ( IsUpper<MT4>::value )
2049 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2050 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2051 :( IsUpper<MT4>::value ? i : 0UL ) );
2052 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
2054 IntrinsicType xmm1( (~C).load(i ,j) );
2055 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
2056 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
2057 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
2059 for(
size_t k=kbegin; k<kend; ++k ) {
2060 const IntrinsicType b1(
set( B(k,j) ) );
2061 xmm1 = xmm1 + A.load(i ,k) * b1;
2062 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
2063 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
2064 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
2067 (~C).store( i , j, xmm1 );
2068 (~C).store( i+
IT::size , j, xmm2 );
2069 (~C).store( i+
IT::size*2UL, j, xmm3 );
2070 (~C).store( i+
IT::size*3UL, j, xmm4 );
// ---- Tile: 2 vectors of rows x 2 columns ----
2078 for( ; (j+2UL) <= N; j+=2UL )
2080 const size_t kbegin( ( IsLower<MT5>::value )
2081 ?( ( IsUpper<MT4>::value )
2082 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2083 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2084 :( IsUpper<MT4>::value ? i : 0UL ) );
2085 const size_t kend( ( IsUpper<MT5>::value )
2086 ?( ( IsLower<MT4>::value )
2087 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
2088 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
2089 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
2091 IntrinsicType xmm1( (~C).load(i ,j ) );
2092 IntrinsicType xmm2( (~C).load(i+
IT::size,j ) );
2093 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2094 IntrinsicType xmm4( (~C).load(i+
IT::size,j+1UL) );
2096 for(
size_t k=kbegin; k<kend; ++k ) {
2097 const IntrinsicType a1( A.load(i ,k) );
2098 const IntrinsicType a2( A.load(i+
IT::size,k) );
2099 const IntrinsicType b1(
set( B(k,j ) ) );
2100 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2101 xmm1 = xmm1 + a1 * b1;
2102 xmm2 = xmm2 + a2 * b1;
2103 xmm3 = xmm3 + a1 * b2;
2104 xmm4 = xmm4 + a2 * b2;
2107 (~C).store( i , j , xmm1 );
2108 (~C).store( i+
IT::size, j , xmm2 );
2109 (~C).store( i , j+1UL, xmm3 );
2110 (~C).store( i+
IT::size, j+1UL, xmm4 );
// ---- Tile: 2 vectors of rows x 1 column ----
2115 const size_t kbegin( ( IsLower<MT5>::value )
2116 ?( ( IsUpper<MT4>::value )
2117 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2118 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2119 :( IsUpper<MT4>::value ? i : 0UL ) );
2120 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
2122 IntrinsicType xmm1( (~C).load(i ,j) );
2123 IntrinsicType xmm2( (~C).load(i+
IT::size,j) );
2125 for(
size_t k=kbegin; k<kend; ++k ) {
2126 const IntrinsicType b1(
set( B(k,j) ) );
2127 xmm1 = xmm1 + A.load(i ,k) * b1;
2128 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
// NOTE(review): xmm2 is accumulated above, but only the store of xmm1 is
// visible in this view; every other tile stores all of its accumulators.
// Confirm in the full file that the store of xmm2 to row i+IT::size follows.
2131 (~C).store( i , j, xmm1 );
// ---- Tile: 1 vector of rows x 2 columns ----
2140 for( ; (j+2UL) <= N; j+=2UL )
2142 const size_t kbegin( ( IsLower<MT5>::value )
2143 ?( ( IsUpper<MT4>::value )
2144 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2145 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2146 :( IsUpper<MT4>::value ? i : 0UL ) );
2147 const size_t kend( ( IsUpper<MT5>::value )
2148 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2151 IntrinsicType xmm1( (~C).load(i,j ) );
2152 IntrinsicType xmm2( (~C).load(i,j+1UL) );
2154 for(
size_t k=kbegin; k<kend; ++k ) {
2155 const IntrinsicType a1( A.load(i,k) );
2156 xmm1 = xmm1 + a1 *
set( B(k,j ) );
2157 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
2160 (~C).store( i, j , xmm1 );
2161 (~C).store( i, j+1UL, xmm2 );
// ---- Tile: 1 vector of rows x 1 column ----
// Here k runs up to K; no upper clipping is applied.
2166 const size_t kbegin( ( IsLower<MT5>::value )
2167 ?( ( IsUpper<MT4>::value )
2168 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2169 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2170 :( IsUpper<MT4>::value ? i : 0UL ) );
2172 IntrinsicType xmm1( (~C).load(i,j) );
2174 for(
size_t k=kbegin; k<K; ++k ) {
2175 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
2178 (~C).store( i, j, xmm1 );
// ---- Scalar remainder: rows from the current i up to M, one at a time ----
// Runs only when 'remainder' is true; uses element access instead of
// load()/store().
2182 for( ; remainder && i<M; ++i )
// Two columns per iteration.
2186 for( ; (j+2UL) <= N; j+=2UL )
2188 const size_t kbegin( ( IsLower<MT5>::value )
2189 ?( ( IsUpper<MT4>::value )
2190 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2191 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2192 :( IsUpper<MT4>::value ? i : 0UL ) );
2193 const size_t kend( ( IsUpper<MT5>::value )
2194 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
2197 ElementType value1( (~C)(i,j ) );
2198 ElementType value2( (~C)(i,j+1UL) );
2200 for(
size_t k=kbegin; k<kend; ++k ) {
2201 value1 += A(i,k) * B(k,j );
2202 value2 += A(i,k) * B(k,j+1UL);
2205 (~C)(i,j ) = value1;
2206 (~C)(i,j+1UL) = value2;
// Single remaining column (its guard and the final write-back of 'value' are
// not visible in this view).
2211 const size_t kbegin( ( IsLower<MT5>::value )
2212 ?( ( IsUpper<MT4>::value )
2213 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
2214 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
2215 :( IsUpper<MT4>::value ? i : 0UL ) );
2217 ElementType value( (~C)(i,j) );
2219 for(
size_t k=kbegin; k<K; ++k ) {
2220 value += A(i,k) * B(k,j);
// "Large matrix" addition-assignment kernel, fallback overload: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false. Forwards to the default
// (scalar) kernel.
2244 template<
typename MT3
2247 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2248 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2250 selectDefaultAddAssignKernel( C, A, B );
// "Large matrix" addition-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>): no dedicated large kernel exists for this case, so
// the call is forwarded to the small kernel for the unwrapped target ~C.
2270 template<
typename MT3
2273 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2274 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2276 selectSmallAddAssignKernel( ~C, A, B );
// "Large matrix" addition-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>) when the vectorized default kernel is usable.
// Computes C += A * B with three levels of blocking (rows ii, columns jj,
// inner dimension kk; block sizes TDMATTDMATMULT_{I,J,K}BLOCK_SIZE). Inside a
// block, rows are processed in tiles of 4, 2 and 1 intrinsic vectors, followed
// by a scalar pass for rows that do not fill a whole vector.
// NOTE(review): this view omits many original lines (braces, the per-tile row
// loops, skip conditions, and the definitions of i1/i2/i3). Presumably
// i1/i2/i3 are i+IT::size, i+IT::size*2 and i+IT::size*3 -- confirm against
// the full file.
2296 template<
typename MT3
2299 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2300 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2302 typedef IntrinsicTrait<ElementType> IT;
2304 const size_t M( A.rows() );
2305 const size_t N( B.columns() );
2306 const size_t K( A.columns() );
// A scalar remainder pass is needed unless both C (MT3) and A (MT4) are padded.
2308 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Row blocks [ii,iend).
2310 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
2312 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// End of the vectorizable rows of this block: iend with the low bits masked
// off when a remainder pass is required (assumes IT::size is a power of two).
2314 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Column blocks [jj,jend).
2317 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
2319 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Inner-dimension blocks [kk,ktmp).
2321 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
2323 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// ---- Tile: 4 vectors of rows x 2 columns ----
2335 for( ; (j+2UL) <= jend; j+=2UL )
// k range = current k block, further clipped by the structure traits:
// IsUpper<MT4> -> start at max(i,kk); IsLower<MT5> -> start at max(j,kk);
// IsLower<MT4> -> stop at the end of the row tile; IsUpper<MT5> -> stop after
// the last column of the tile.
2337 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2338 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2339 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
2340 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Accumulators start from C, so partial sums of earlier k blocks are kept.
2342 IntrinsicType xmm1( (~C).load(i ,j ) );
2343 IntrinsicType xmm2( (~C).load(i1,j ) );
2344 IntrinsicType xmm3( (~C).load(i2,j ) );
2345 IntrinsicType xmm4( (~C).load(i3,j ) );
2346 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
2347 IntrinsicType xmm6( (~C).load(i1,j+1UL) );
2348 IntrinsicType xmm7( (~C).load(i2,j+1UL) );
2349 IntrinsicType xmm8( (~C).load(i3,j+1UL) );
2351 for(
size_t k=kbegin; k<kend; ++k ) {
2352 const IntrinsicType a1( A.load(i ,k) );
2353 const IntrinsicType a2( A.load(i1,k) );
2354 const IntrinsicType a3( A.load(i2,k) );
2355 const IntrinsicType a4( A.load(i3,k) );
2356 const IntrinsicType b1(
set( B(k,j ) ) );
2357 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2358 xmm1 = xmm1 + a1 * b1;
2359 xmm2 = xmm2 + a2 * b1;
2360 xmm3 = xmm3 + a3 * b1;
2361 xmm4 = xmm4 + a4 * b1;
2362 xmm5 = xmm5 + a1 * b2;
2363 xmm6 = xmm6 + a2 * b2;
2364 xmm7 = xmm7 + a3 * b2;
2365 xmm8 = xmm8 + a4 * b2;
2368 (~C).store( i , j , xmm1 );
2369 (~C).store( i1, j , xmm2 );
2370 (~C).store( i2, j , xmm3 );
2371 (~C).store( i3, j , xmm4 );
2372 (~C).store( i , j+1UL, xmm5 );
2373 (~C).store( i1, j+1UL, xmm6 );
2374 (~C).store( i2, j+1UL, xmm7 );
2375 (~C).store( i3, j+1UL, xmm8 );
// ---- Tile: 4 vectors of rows x 1 column ----
// Presumably the last odd column of the block; its guard is elided.
2380 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2381 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2382 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
2383 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2385 IntrinsicType xmm1( (~C).load(i ,j) );
2386 IntrinsicType xmm2( (~C).load(i1,j) );
2387 IntrinsicType xmm3( (~C).load(i2,j) );
2388 IntrinsicType xmm4( (~C).load(i3,j) );
2390 for(
size_t k=kbegin; k<kend; ++k ) {
2391 const IntrinsicType b1(
set( B(k,j) ) );
2392 xmm1 = xmm1 + A.load(i ,k) * b1;
2393 xmm2 = xmm2 + A.load(i1,k) * b1;
2394 xmm3 = xmm3 + A.load(i2,k) * b1;
2395 xmm4 = xmm4 + A.load(i3,k) * b1;
2398 (~C).store( i , j, xmm1 );
2399 (~C).store( i1, j, xmm2 );
2400 (~C).store( i2, j, xmm3 );
2401 (~C).store( i3, j, xmm4 );
// ---- Tile: 2 vectors of rows x 4 columns ----
2411 for( ; (j+4UL) <= jend; j+=4UL )
2413 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2414 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2415 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2416 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Accumulator pairs (xmm1,xmm2), (xmm3,xmm4), ... hold columns j .. j+3.
2418 IntrinsicType xmm1( (~C).load(i ,j ) );
2419 IntrinsicType xmm2( (~C).load(i1,j ) );
2420 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2421 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2422 IntrinsicType xmm5( (~C).load(i ,j+2UL) );
2423 IntrinsicType xmm6( (~C).load(i1,j+2UL) );
2424 IntrinsicType xmm7( (~C).load(i ,j+3UL) );
2425 IntrinsicType xmm8( (~C).load(i1,j+3UL) );
2427 for(
size_t k=kbegin; k<kend; ++k ) {
2428 const IntrinsicType a1( A.load(i ,k) );
2429 const IntrinsicType a2( A.load(i1,k) );
2430 const IntrinsicType b1(
set( B(k,j ) ) );
2431 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2432 const IntrinsicType b3(
set( B(k,j+2UL) ) );
2433 const IntrinsicType b4(
set( B(k,j+3UL) ) );
2434 xmm1 = xmm1 + a1 * b1;
2435 xmm2 = xmm2 + a2 * b1;
2436 xmm3 = xmm3 + a1 * b2;
2437 xmm4 = xmm4 + a2 * b2;
2438 xmm5 = xmm5 + a1 * b3;
2439 xmm6 = xmm6 + a2 * b3;
2440 xmm7 = xmm7 + a1 * b4;
2441 xmm8 = xmm8 + a2 * b4;
2444 (~C).store( i , j , xmm1 );
2445 (~C).store( i1, j , xmm2 );
2446 (~C).store( i , j+1UL, xmm3 );
2447 (~C).store( i1, j+1UL, xmm4 );
2448 (~C).store( i , j+2UL, xmm5 );
2449 (~C).store( i1, j+2UL, xmm6 );
2450 (~C).store( i , j+3UL, xmm7 );
2451 (~C).store( i1, j+3UL, xmm8 );
// ---- Tile: 2 vectors of rows x 2 columns ----
2454 for( ; (j+2UL) <= jend; j+=2UL )
2456 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2457 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2458 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2459 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2461 IntrinsicType xmm1( (~C).load(i ,j ) );
2462 IntrinsicType xmm2( (~C).load(i1,j ) );
2463 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
2464 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
2466 for(
size_t k=kbegin; k<kend; ++k ) {
2467 const IntrinsicType a1( A.load(i ,k) );
2468 const IntrinsicType a2( A.load(i1,k) );
2469 const IntrinsicType b1(
set( B(k,j ) ) );
2470 const IntrinsicType b2(
set( B(k,j+1UL) ) );
2471 xmm1 = xmm1 + a1 * b1;
2472 xmm2 = xmm2 + a2 * b1;
2473 xmm3 = xmm3 + a1 * b2;
2474 xmm4 = xmm4 + a2 * b2;
2477 (~C).store( i , j , xmm1 );
2478 (~C).store( i1, j , xmm2 );
2479 (~C).store( i , j+1UL, xmm3 );
2480 (~C).store( i1, j+1UL, xmm4 );
// ---- Tile: 2 vectors of rows x 1 column ----
2485 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2486 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2487 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
2488 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2490 IntrinsicType xmm1( (~C).load(i ,j) );
2491 IntrinsicType xmm2( (~C).load(i1,j) );
2493 for(
size_t k=kbegin; k<kend; ++k ) {
2494 const IntrinsicType b1(
set( B(k,j) ) );
2495 xmm1 = xmm1 + A.load(i ,k) * b1;
2496 xmm2 = xmm2 + A.load(i1,k) * b1;
2499 (~C).store( i , j, xmm1 );
2500 (~C).store( i1, j, xmm2 );
// ---- Tile: 1 vector of rows, every column of the block ----
2506 for(
size_t j=jj; j<jend; ++j )
2508 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2509 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2510 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
2511 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2513 IntrinsicType xmm1( (~C).load(i,j) );
2515 for(
size_t k=kbegin; k<kend; ++k ) {
2516 const IntrinsicType b1(
set( B(k,j) ) );
2517 xmm1 = xmm1 + A.load(i,k) * b1;
2520 (~C).store( i, j, xmm1 );
// ---- Scalar remainder: rows from the current i up to iend ----
// Runs only when 'remainder' is true; uses element access instead of
// load()/store(). The final write-back of 'value' is not visible in this view.
2524 for( ; remainder && i<iend; ++i )
2526 for(
size_t j=jj; j<jend; ++j )
2528 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2529 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2530 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
2531 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2533 ElementType value( (~C)(i,j) );
2535 for(
size_t k=kbegin; k<kend; ++k ) {
2536 value += A(i,k) * B(k,j);
// BLAS-path addition-assignment kernel, fallback overload: active when
// UseBlasKernel<MT3,MT4,MT5> is false. Forwards to the large-matrix kernel.
2563 template<
typename MT3
2566 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2567 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2569 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel, active when
// UseBlasKernel<MT3,MT4,MT5> is true.
// A triangular operand is handled with trmm() on a temporary 'tmp' that is
// then added to C; otherwise gemm() is called with both scalar arguments set
// to ET(1).
// NOTE(review): the declarations of 'ET' and 'tmp' are on lines elided from
// this view; presumably tmp is initialized from the non-triangular operand --
// confirm against the full file.
2589 template<
typename MT3
2592 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2593 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular A: multiply from the left.
2597 if( IsTriangular<MT4>::value ) {
2599 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2600 addAssign( C, tmp );
// Triangular B: multiply from the right.
2602 else if( IsTriangular<MT5>::value ) {
2604 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2605 addAssign( C, tmp );
// General case (its 'else' line is elided).
2608 gemm( C, A, B, ET(1), ET(1) );
// Symmetry-exploiting assignment overload, enabled when
// CanExploitSymmetry<MT,MT1,MT2> holds. The product is rewritten with the
// symmetric operand(s) wrapped in trans() and passed to addAssign().
// NOTE(review): the line carrying this friend's own name and parameter list is
// elided from this view; from the body it appears to be the addition
// assignment -- confirm against the full file.
2630 template<
typename MT >
2631 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
// Both operands symmetric: transpose both.
2641 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2642 addAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
// Only the left operand symmetric.
2643 else if( IsSymmetric<MT1>::value )
2644 addAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
// Remaining case (its 'else' line is elided): transpose the right operand.
2646 addAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Assignment overload used when CanExploitSymmetry<MT,MT1,MT2> does not hold.
// It evaluates both operands serially into A and B and dispatches to
// selectSubAssignKernel().
// NOTE(review): the line carrying this friend's own name and parameter list is
// elided from this view; from the kernel it calls it appears to be the
// subtraction assignment -- confirm against the full file.
2668 template<
typename MT
2670 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
// Degenerate sizes: the body of this branch is elided (presumably an early
// return, since there is nothing to subtract).
2678 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Evaluate the left and right operand without parallelization.
2682 LT A(
serial( rhs.lhs_ ) );
2683 RT B(
serial( rhs.rhs_ ) );
2692 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatch for the subtraction assignment C -= A * B.
// The small kernel is used when A is diagonal or when C has fewer than
// TDMATTDMATMULT_THRESHOLD elements; otherwise the BLAS kernel is selected.
2708 template<
typename MT3
2711 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2713 if( ( IsDiagonal<MT4>::value ) ||
2714 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2715 selectSmallSubAssignKernel( C, A, B );
// The 'else' line preceding this call is elided from this view.
2717 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel, overload selected when neither A
// (MT4) nor B (MT5) is diagonal. Performs C(i,j) -= A(i,k) * B(k,j) with the
// loops ordered j (outer), k, i (inner).
// NOTE(review): this view omits several original lines (braces, the remaining
// template parameters, the ':' branches of the conditionals below).
2736 template<
typename MT3
2739 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2740 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2742 const size_t M( A.rows() );
2743 const size_t N( B.columns() );
2744 const size_t K( A.columns() );
2746 for(
size_t j=0UL; j<N; ++j )
// k range for column j, selected from the IsLower/IsUpper and IsStrictly*
// traits of B. The fallback branches are not visible here.
2748 const size_t kbegin( ( IsLower<MT5>::value )
2749 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2751 const size_t kend( ( IsUpper<MT5>::value )
2752 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2756 for(
size_t k=kbegin; k<kend; ++k )
// Row range for column k of A, selected from the traits of A.
2758 const size_t ibegin( ( IsLower<MT4>::value )
2759 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2761 const size_t iend( ( IsUpper<MT4>::value )
2762 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) holds an even
// number of rows and at most one row (index ipos) is left over.
2766 const size_t inum( iend - ibegin );
2767 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2769 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2770 C(i ,j) -= A(i ,k) * B(k,j);
2771 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover row; presumably guarded by an 'ipos < iend' check on an elided
// line -- TODO confirm against the full file.
2774 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel, overload selected when B (MT5) is
// diagonal and A (MT4) is not. Only B(j,j) is read, so each update has the
// form C(i,j) -= A(i,j) * B(j,j).
// NOTE(review): this view omits several original lines (braces, the remaining
// template parameters, the ':' branches of the conditionals below).
2796 template<
typename MT3
2799 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2800 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2804 const size_t M( A.rows() );
2805 const size_t N( B.columns() );
2807 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, selected from the IsLower/IsUpper and IsStrictly*
// traits of A. The fallback branches are not visible here.
2809 const size_t ibegin( ( IsLower<MT4>::value )
2810 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2812 const size_t iend( ( IsUpper<MT4>::value )
2813 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Even-sized part [ibegin,ipos) plus at most one leftover row at ipos.
2817 const size_t inum( iend - ibegin );
2818 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2820 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2821 C(i ,j) -= A(i ,j) * B(j,j);
2822 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Leftover row; presumably guarded by an 'ipos < iend' check on an elided
// line -- TODO confirm against the full file.
2825 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel, overload selected when A (MT4) is
// diagonal and B (MT5) is not. Only A(i,i) is read, so each update has the
// form C(i,j) -= A(i,i) * B(i,j).
// NOTE(review): this view omits several original lines (braces, the remaining
// template parameters, the ':' branches of the conditionals below).
2846 template<
typename MT3
2849 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2850 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2854 const size_t M( A.rows() );
2855 const size_t N( B.columns() );
2857 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, selected from the IsLower/IsUpper and IsStrictly*
// traits of B. The fallback branches are not visible here.
2859 const size_t ibegin( ( IsLower<MT5>::value )
2860 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2862 const size_t iend( ( IsUpper<MT5>::value )
2863 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Even-sized part [ibegin,ipos) plus at most one leftover row at ipos.
2867 const size_t inum( iend - ibegin );
2868 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2870 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2871 C(i ,j) -= A(i ,i ) * B(i ,j);
2872 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row; presumably guarded by an 'ipos < iend' check on an elided
// line -- TODO confirm against the full file.
2875 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel, overload selected when both A (MT4)
// and B (MT5) are diagonal. Only the diagonal of C is updated:
// C(i,i) -= A(i,i) * B(i,i) for every row index of A.
2896 template<
typename MT3
2899 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2900 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2904 for(
size_t i=0UL; i<A.rows(); ++i ) {
2905 C(i,i) -= A(i,i) * B(i,i);
// "Small matrix" subtraction-assignment kernel, fallback overload: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false. Forwards to the default
// (scalar) kernel.
2925 template<
typename MT3
2928 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2929 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2931 selectDefaultSubAssignKernel( C, A, B );
// "Small matrix" subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>) when the vectorized default kernel is usable.
// Each branch re-issues the subtraction assignment with one operand replaced
// by 'tmp'.
// NOTE(review): the declarations of 'tmp' are on lines elided from this view;
// presumably tmp is a copy of the replaced operand in the opposite storage
// order -- confirm against the full file.
2951 template<
typename MT3
2954 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2955 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// Only A is resizable: tmp stands in for B.
2962 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2964 subAssign( ~C, A * tmp );
// Only B is resizable: tmp stands in for A.
2966 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2968 subAssign( ~C, tmp * B );
// Otherwise decide by element count: B no larger than A -> tmp stands in for B.
2970 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2972 subAssign( ~C, A * tmp );
// Remaining case (its 'else' line is elided): tmp stands in for A.
2976 subAssign( ~C, tmp * B );
// "Small matrix" subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>) when the vectorized default kernel is usable.
// Computes C -= A * B. Rows are processed in tiles of 8, 4, 2 and 1 intrinsic
// vectors (IT::size elements each), with A and C accessed through load()/store()
// and each B(k,j) broadcast with set(). Rows that do not fill a whole vector
// are handled by a scalar loop at the end.
// NOTE(review): this view omits many original lines (braces, the row-loop
// headers of the individual tiles, 'if' guards, ':' fallback branches). The
// comments below describe only what the visible lines establish.
2997 template<
typename MT3
3000 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3001 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3003 typedef IntrinsicTrait<ElementType> IT;
3005 const size_t M( A.rows() );
3006 const size_t N( B.columns() );
3007 const size_t K( A.columns() );
// A scalar remainder pass is needed unless both C (MT3) and A (MT4) are padded.
3009 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With a remainder, ipos is M with the low bits masked off, i.e. M rounded
// down to a multiple of IT::size (assuming IT::size is a power of two).
3011 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// ---- Tile: 8 vectors of rows x 1 column ----
// (enclosing row-loop header not visible in this view)
3017 for(
size_t j=0UL; j<N; ++j )
// Clip the k range using the structure traits of A (MT4) and B (MT5):
// IsUpper<MT4> -> k starts no earlier than i; IsLower<MT4> -> k ends by the
// end of this row tile (i+IT::size*8), clamped to K; IsLower/IsUpper<MT5>
// bound k relative to the column index j.
3019 const size_t kbegin( ( IsLower<MT5>::value )
3020 ?( ( IsUpper<MT4>::value )
3021 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3022 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3023 :( IsUpper<MT4>::value ? i : 0UL ) );
3024 const size_t kend( ( IsUpper<MT5>::value )
3025 ?( ( IsLower<MT4>::value )
3026 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3027 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3028 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// Start the accumulators from the current contents of C (this is '-=').
3030 IntrinsicType xmm1( (~C).load(i ,j) );
3031 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
3032 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
3033 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
3034 IntrinsicType xmm5( (~C).load(i+
IT::size*4UL,j) );
3035 IntrinsicType xmm6( (~C).load(i+
IT::size*5UL,j) );
3036 IntrinsicType xmm7( (~C).load(i+
IT::size*6UL,j) );
3037 IntrinsicType xmm8( (~C).load(i+
IT::size*7UL,j) );
// Subtract A(:,k) * B(k,j); B(k,j) is broadcast once per k.
3039 for(
size_t k=kbegin; k<kend; ++k ) {
3040 const IntrinsicType b1(
set( B(k,j) ) );
3041 xmm1 = xmm1 - A.load(i ,k) * b1;
3042 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
3043 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
3044 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
3045 xmm5 = xmm5 - A.load(i+
IT::size*4UL,k) * b1;
3046 xmm6 = xmm6 - A.load(i+
IT::size*5UL,k) * b1;
3047 xmm7 = xmm7 - A.load(i+
IT::size*6UL,k) * b1;
3048 xmm8 = xmm8 - A.load(i+
IT::size*7UL,k) * b1;
// Write the eight accumulators back to C.
3051 (~C).store( i , j, xmm1 );
3052 (~C).store( i+
IT::size , j, xmm2 );
3053 (~C).store( i+
IT::size*2UL, j, xmm3 );
3054 (~C).store( i+
IT::size*3UL, j, xmm4 );
3055 (~C).store( i+
IT::size*4UL, j, xmm5 );
3056 (~C).store( i+
IT::size*5UL, j, xmm6 );
3057 (~C).store( i+
IT::size*6UL, j, xmm7 );
3058 (~C).store( i+
IT::size*7UL, j, xmm8 );
// ---- Tile: 4 vectors of rows x 2 columns (j, j+1) ----
3066 for( ; (j+2UL) <= N; j+=2UL )
3068 const size_t kbegin( ( IsLower<MT5>::value )
3069 ?( ( IsUpper<MT4>::value )
3070 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3071 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3072 :( IsUpper<MT4>::value ? i : 0UL ) );
3073 const size_t kend( ( IsUpper<MT5>::value )
3074 ?( ( IsLower<MT4>::value )
3075 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3076 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3077 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1..4 hold column j, xmm5..8 hold column j+1.
3079 IntrinsicType xmm1( (~C).load(i ,j ) );
3080 IntrinsicType xmm2( (~C).load(i+
IT::size ,j ) );
3081 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j ) );
3082 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j ) );
3083 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3084 IntrinsicType xmm6( (~C).load(i+
IT::size ,j+1UL) );
3085 IntrinsicType xmm7( (~C).load(i+
IT::size*2UL,j+1UL) );
3086 IntrinsicType xmm8( (~C).load(i+
IT::size*3UL,j+1UL) );
// Each vector of A is loaded once per k and reused for both columns.
3088 for(
size_t k=kbegin; k<kend; ++k ) {
3089 const IntrinsicType a1( A.load(i ,k) );
3090 const IntrinsicType a2( A.load(i+
IT::size ,k) );
3091 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
3092 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
3093 const IntrinsicType b1(
set( B(k,j ) ) );
3094 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3095 xmm1 = xmm1 - a1 * b1;
3096 xmm2 = xmm2 - a2 * b1;
3097 xmm3 = xmm3 - a3 * b1;
3098 xmm4 = xmm4 - a4 * b1;
3099 xmm5 = xmm5 - a1 * b2;
3100 xmm6 = xmm6 - a2 * b2;
3101 xmm7 = xmm7 - a3 * b2;
3102 xmm8 = xmm8 - a4 * b2;
3105 (~C).store( i , j , xmm1 );
3106 (~C).store( i+
IT::size , j , xmm2 );
3107 (~C).store( i+
IT::size*2UL, j , xmm3 );
3108 (~C).store( i+
IT::size*3UL, j , xmm4 );
3109 (~C).store( i , j+1UL, xmm5 );
3110 (~C).store( i+
IT::size , j+1UL, xmm6 );
3111 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 );
3112 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 );
// ---- Tile: 4 vectors of rows x 1 column ----
// Presumably the last odd column, guarded by a check on an elided line.
3117 const size_t kbegin( ( IsLower<MT5>::value )
3118 ?( ( IsUpper<MT4>::value )
3119 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3120 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3121 :( IsUpper<MT4>::value ? i : 0UL ) );
3122 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
3124 IntrinsicType xmm1( (~C).load(i ,j) );
3125 IntrinsicType xmm2( (~C).load(i+
IT::size ,j) );
3126 IntrinsicType xmm3( (~C).load(i+
IT::size*2UL,j) );
3127 IntrinsicType xmm4( (~C).load(i+
IT::size*3UL,j) );
3129 for(
size_t k=kbegin; k<kend; ++k ) {
3130 const IntrinsicType b1(
set( B(k,j) ) );
3131 xmm1 = xmm1 - A.load(i ,k) * b1;
3132 xmm2 = xmm2 - A.load(i+
IT::size ,k) * b1;
3133 xmm3 = xmm3 - A.load(i+
IT::size*2UL,k) * b1;
3134 xmm4 = xmm4 - A.load(i+
IT::size*3UL,k) * b1;
3137 (~C).store( i , j, xmm1 );
3138 (~C).store( i+
IT::size , j, xmm2 );
3139 (~C).store( i+
IT::size*2UL, j, xmm3 );
3140 (~C).store( i+
IT::size*3UL, j, xmm4 );
// ---- Tile: 2 vectors of rows x 2 columns ----
3148 for( ; (j+2UL) <= N; j+=2UL )
3150 const size_t kbegin( ( IsLower<MT5>::value )
3151 ?( ( IsUpper<MT4>::value )
3152 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3153 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3154 :( IsUpper<MT4>::value ? i : 0UL ) );
3155 const size_t kend( ( IsUpper<MT5>::value )
3156 ?( ( IsLower<MT4>::value )
3157 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3158 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3159 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
3161 IntrinsicType xmm1( (~C).load(i ,j ) );
3162 IntrinsicType xmm2( (~C).load(i+
IT::size,j ) );
3163 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3164 IntrinsicType xmm4( (~C).load(i+
IT::size,j+1UL) );
3166 for(
size_t k=kbegin; k<kend; ++k ) {
3167 const IntrinsicType a1( A.load(i ,k) );
3168 const IntrinsicType a2( A.load(i+
IT::size,k) );
3169 const IntrinsicType b1(
set( B(k,j ) ) );
3170 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3171 xmm1 = xmm1 - a1 * b1;
3172 xmm2 = xmm2 - a2 * b1;
3173 xmm3 = xmm3 - a1 * b2;
3174 xmm4 = xmm4 - a2 * b2;
3177 (~C).store( i , j , xmm1 );
3178 (~C).store( i+
IT::size, j , xmm2 );
3179 (~C).store( i , j+1UL, xmm3 );
3180 (~C).store( i+
IT::size, j+1UL, xmm4 );
// ---- Tile: 2 vectors of rows x 1 column ----
3185 const size_t kbegin( ( IsLower<MT5>::value )
3186 ?( ( IsUpper<MT4>::value )
3187 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3188 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3189 :( IsUpper<MT4>::value ? i : 0UL ) );
3190 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
3192 IntrinsicType xmm1( (~C).load(i ,j) );
3193 IntrinsicType xmm2( (~C).load(i+
IT::size,j) );
3195 for(
size_t k=kbegin; k<kend; ++k ) {
3196 const IntrinsicType b1(
set( B(k,j) ) );
3197 xmm1 = xmm1 - A.load(i ,k) * b1;
3198 xmm2 = xmm2 - A.load(i+
IT::size,k) * b1;
// NOTE(review): xmm2 is accumulated above, but only the store of xmm1 is
// visible in this view; every other tile stores all of its accumulators.
// Confirm in the full file that the store of xmm2 to row i+IT::size follows.
3201 (~C).store( i , j, xmm1 );
// ---- Tile: 1 vector of rows x 2 columns ----
3210 for( ; (j+2UL) <= N; j+=2UL )
3212 const size_t kbegin( ( IsLower<MT5>::value )
3213 ?( ( IsUpper<MT4>::value )
3214 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3215 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3216 :( IsUpper<MT4>::value ? i : 0UL ) );
3217 const size_t kend( ( IsUpper<MT5>::value )
3218 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3221 IntrinsicType xmm1( (~C).load(i,j ) );
3222 IntrinsicType xmm2( (~C).load(i,j+1UL) );
3224 for(
size_t k=kbegin; k<kend; ++k ) {
3225 const IntrinsicType a1( A.load(i,k) );
3226 xmm1 = xmm1 - a1 *
set( B(k,j ) );
3227 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
3230 (~C).store( i, j , xmm1 );
3231 (~C).store( i, j+1UL, xmm2 );
// ---- Tile: 1 vector of rows x 1 column ----
// Here k runs up to K; no upper clipping is applied.
3236 const size_t kbegin( ( IsLower<MT5>::value )
3237 ?( ( IsUpper<MT4>::value )
3238 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3239 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3240 :( IsUpper<MT4>::value ? i : 0UL ) );
3242 IntrinsicType xmm1( (~C).load(i,j) );
3244 for(
size_t k=kbegin; k<K; ++k ) {
3245 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
3248 (~C).store( i, j, xmm1 );
// ---- Scalar remainder: rows from the current i up to M, one at a time ----
// Runs only when 'remainder' is true; uses element access instead of
// load()/store().
3252 for( ; remainder && i<M; ++i )
// Two columns per iteration.
3256 for( ; (j+2UL) <= N; j+=2UL )
3258 const size_t kbegin( ( IsLower<MT5>::value )
3259 ?( ( IsUpper<MT4>::value )
3260 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3261 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3262 :( IsUpper<MT4>::value ? i : 0UL ) );
3263 const size_t kend( ( IsUpper<MT5>::value )
3264 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3267 ElementType value1( (~C)(i,j ) );
3268 ElementType value2( (~C)(i,j+1UL) );
3270 for(
size_t k=kbegin; k<kend; ++k ) {
3271 value1 -= A(i,k) * B(k,j );
3272 value2 -= A(i,k) * B(k,j+1UL);
3275 (~C)(i,j ) = value1;
3276 (~C)(i,j+1UL) = value2;
// Single remaining column (its guard and the final write-back of 'value' are
// not visible in this view).
3281 const size_t kbegin( ( IsLower<MT5>::value )
3282 ?( ( IsUpper<MT4>::value )
3283 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3284 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3285 :( IsUpper<MT4>::value ? i : 0UL ) );
3287 ElementType value( (~C)(i,j) );
3289 for(
size_t k=kbegin; k<K; ++k ) {
3290 value -= A(i,k) * B(k,j);
// "Large matrix" subtraction-assignment kernel, fallback overload: active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false. Forwards to the default
// (scalar) kernel.
3314 template<
typename MT3
3317 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3318 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3320 selectDefaultSubAssignKernel( C, A, B );
// "Large matrix" subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>): no dedicated large kernel exists for this case, so
// the call is forwarded to the small kernel for the unwrapped target ~C.
3340 template<
typename MT3
3343 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3344 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3346 selectSmallSubAssignKernel( ~C, A, B );
// Vectorized large-matrix kernel for the subtraction assignment C -= A * B with a target passed
// as DenseMatrix<MT3,true>; enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//
// The iteration space is tiled with TDMATTDMATMULT_IBLOCK_SIZE / _JBLOCK_SIZE / _KBLOCK_SIZE.
// Inside a tile, rows of C are processed in SIMD packets: A and C are accessed with load()/store()
// at row offsets i, i1, i2, i3, while B is only read element-wise and broadcast via set( B(k,j) ).
// The k-range of every micro-kernel is clipped using the triangular structure of A (MT4) and B (MT5).
// Rows beyond 'ipos' are handled by the scalar loop at the end.
3366 template<
typename MT3
3369 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3370 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3372 typedef IntrinsicTrait<ElementType> IT;
3374 const size_t M( A.rows() );
3375 const size_t N( B.columns() );
3376 const size_t K( A.columns() );
// The scalar remainder loop may only be skipped when every matrix touched by packet loads/stores
// is padded. Those are C (MT3, load/store) and A (MT4, A.load); B is never packet-loaded, so its
// padding is irrelevant. Testing MT5 here (as before) allowed A.load() to run past the rows of an
// unpadded A whenever C and B happened to be padded. This now matches the scaled small/large
// assign kernels of the DMatScalarMultExpr specialization, which also test MT3 and MT4.
3378 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Tile loop over rows; 'ipos' is the last row index that is safe for packet access in this tile.
3380 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
3382 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
3384 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3387 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
3389 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
3391 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
3393 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// Micro-kernel: four row packets (i, i1, i2, i3) x two columns (j, j+1).
3405 for( ; (j+2UL) <= jend; j+=2UL )
3407 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3408 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3409 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3410 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3412 IntrinsicType xmm1( (~C).load(i ,j ) );
3413 IntrinsicType xmm2( (~C).load(i1,j ) );
3414 IntrinsicType xmm3( (~C).load(i2,j ) );
3415 IntrinsicType xmm4( (~C).load(i3,j ) );
3416 IntrinsicType xmm5( (~C).load(i ,j+1UL) );
3417 IntrinsicType xmm6( (~C).load(i1,j+1UL) );
3418 IntrinsicType xmm7( (~C).load(i2,j+1UL) );
3419 IntrinsicType xmm8( (~C).load(i3,j+1UL) );
3421 for(
size_t k=kbegin; k<kend; ++k ) {
3422 const IntrinsicType a1( A.load(i ,k) );
3423 const IntrinsicType a2( A.load(i1,k) );
3424 const IntrinsicType a3( A.load(i2,k) );
3425 const IntrinsicType a4( A.load(i3,k) );
3426 const IntrinsicType b1(
set( B(k,j ) ) );
3427 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3428 xmm1 = xmm1 - a1 * b1;
3429 xmm2 = xmm2 - a2 * b1;
3430 xmm3 = xmm3 - a3 * b1;
3431 xmm4 = xmm4 - a4 * b1;
3432 xmm5 = xmm5 - a1 * b2;
3433 xmm6 = xmm6 - a2 * b2;
3434 xmm7 = xmm7 - a3 * b2;
3435 xmm8 = xmm8 - a4 * b2;
3438 (~C).store( i , j , xmm1 );
3439 (~C).store( i1, j , xmm2 );
3440 (~C).store( i2, j , xmm3 );
3441 (~C).store( i3, j , xmm4 );
3442 (~C).store( i , j+1UL, xmm5 );
3443 (~C).store( i1, j+1UL, xmm6 );
3444 (~C).store( i2, j+1UL, xmm7 );
3445 (~C).store( i3, j+1UL, xmm8 );
// Micro-kernel: four row packets x one (leftover) column.
3450 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3451 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3452 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
3453 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3455 IntrinsicType xmm1( (~C).load(i ,j) );
3456 IntrinsicType xmm2( (~C).load(i1,j) );
3457 IntrinsicType xmm3( (~C).load(i2,j) );
3458 IntrinsicType xmm4( (~C).load(i3,j) );
3460 for(
size_t k=kbegin; k<kend; ++k ) {
3461 const IntrinsicType b1(
set( B(k,j) ) );
3462 xmm1 = xmm1 - A.load(i ,k) * b1;
3463 xmm2 = xmm2 - A.load(i1,k) * b1;
3464 xmm3 = xmm3 - A.load(i2,k) * b1;
3465 xmm4 = xmm4 - A.load(i3,k) * b1;
3468 (~C).store( i , j, xmm1 );
3469 (~C).store( i1, j, xmm2 );
3470 (~C).store( i2, j, xmm3 );
3471 (~C).store( i3, j, xmm4 );
// Micro-kernel: two row packets (i, i1) x four columns.
3481 for( ; (j+4UL) <= jend; j+=4UL )
3483 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3484 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3485 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3486 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3488 IntrinsicType xmm1( (~C).load(i ,j ) );
3489 IntrinsicType xmm2( (~C).load(i1,j ) );
3490 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3491 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3492 IntrinsicType xmm5( (~C).load(i ,j+2UL) );
3493 IntrinsicType xmm6( (~C).load(i1,j+2UL) );
3494 IntrinsicType xmm7( (~C).load(i ,j+3UL) );
3495 IntrinsicType xmm8( (~C).load(i1,j+3UL) );
3497 for(
size_t k=kbegin; k<kend; ++k ) {
3498 const IntrinsicType a1( A.load(i ,k) );
3499 const IntrinsicType a2( A.load(i1,k) );
3500 const IntrinsicType b1(
set( B(k,j ) ) );
3501 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3502 const IntrinsicType b3(
set( B(k,j+2UL) ) );
3503 const IntrinsicType b4(
set( B(k,j+3UL) ) );
3504 xmm1 = xmm1 - a1 * b1;
3505 xmm2 = xmm2 - a2 * b1;
3506 xmm3 = xmm3 - a1 * b2;
3507 xmm4 = xmm4 - a2 * b2;
3508 xmm5 = xmm5 - a1 * b3;
3509 xmm6 = xmm6 - a2 * b3;
3510 xmm7 = xmm7 - a1 * b4;
3511 xmm8 = xmm8 - a2 * b4;
3514 (~C).store( i , j , xmm1 );
3515 (~C).store( i1, j , xmm2 );
3516 (~C).store( i , j+1UL, xmm3 );
3517 (~C).store( i1, j+1UL, xmm4 );
3518 (~C).store( i , j+2UL, xmm5 );
3519 (~C).store( i1, j+2UL, xmm6 );
3520 (~C).store( i , j+3UL, xmm7 );
3521 (~C).store( i1, j+3UL, xmm8 );
// Micro-kernel: two row packets x two columns.
3524 for( ; (j+2UL) <= jend; j+=2UL )
3526 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3527 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3528 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3529 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3531 IntrinsicType xmm1( (~C).load(i ,j ) );
3532 IntrinsicType xmm2( (~C).load(i1,j ) );
3533 IntrinsicType xmm3( (~C).load(i ,j+1UL) );
3534 IntrinsicType xmm4( (~C).load(i1,j+1UL) );
3536 for(
size_t k=kbegin; k<kend; ++k ) {
3537 const IntrinsicType a1( A.load(i ,k) );
3538 const IntrinsicType a2( A.load(i1,k) );
3539 const IntrinsicType b1(
set( B(k,j ) ) );
3540 const IntrinsicType b2(
set( B(k,j+1UL) ) );
3541 xmm1 = xmm1 - a1 * b1;
3542 xmm2 = xmm2 - a2 * b1;
3543 xmm3 = xmm3 - a1 * b2;
3544 xmm4 = xmm4 - a2 * b2;
3547 (~C).store( i , j , xmm1 );
3548 (~C).store( i1, j , xmm2 );
3549 (~C).store( i , j+1UL, xmm3 );
3550 (~C).store( i1, j+1UL, xmm4 );
// Micro-kernel: two row packets x one (leftover) column.
3555 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3556 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3557 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
3558 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3560 IntrinsicType xmm1( (~C).load(i ,j) );
3561 IntrinsicType xmm2( (~C).load(i1,j) );
3563 for(
size_t k=kbegin; k<kend; ++k ) {
3564 const IntrinsicType b1(
set( B(k,j) ) );
3565 xmm1 = xmm1 - A.load(i ,k) * b1;
3566 xmm2 = xmm2 - A.load(i1,k) * b1;
3569 (~C).store( i , j, xmm1 );
3570 (~C).store( i1, j, xmm2 );
// Micro-kernel: a single row packet, one column at a time.
3576 for(
size_t j=jj; j<jend; ++j )
3578 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3579 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3580 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
3581 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3583 IntrinsicType xmm1( (~C).load(i,j) );
3585 for(
size_t k=kbegin; k<kend; ++k ) {
3586 const IntrinsicType b1(
set( B(k,j) ) );
3587 xmm1 = xmm1 - A.load(i,k) * b1;
3590 (~C).store( i, j, xmm1 );
// Scalar remainder: rows in [ipos, iend) that cannot be handled with packet access.
3594 for( ; remainder && i<iend; ++i )
3596 for(
size_t j=jj; j<jend; ++j )
3598 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3599 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3600 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
3601 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3603 ElementType value( (~C)(i,j) );
3605 for(
size_t k=kbegin; k<kend; ++k ) {
3606 value -= A(i,k) * B(k,j);
// Fallback overload of the BLAS subtraction-assignment kernel: selected (DisableIf) when
// UseBlasKernel<MT3,MT4,MT5> is false; forwards to selectLargeSubAssignKernel.
3633 template<
typename MT3
3636 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3637 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3639 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed subtraction-assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5> holds.
//  - A triangular: trmm is applied with A from the left (CblasLeft) to a temporary 'tmp', and the
//    temporary is then subtracted from C via subAssign.
//  - B triangular: same scheme with B applied from the right (CblasRight).
//  - otherwise: one gemm call with the factors ET(-1) and ET(1)
//    (presumably alpha = -1, beta = 1, i.e. C = C - A*B -- confirm against the gemm wrapper).
// NOTE(review): the definitions of 'tmp' and 'ET' are not visible in this excerpt.
3659 template<
typename MT3
3662 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3663 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3667 if( IsTriangular<MT4>::value ) {
3669 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3670 subAssign( C, tmp );
3672 else if( IsTriangular<MT5>::value ) {
3674 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3675 subAssign( C, tmp );
3678 gemm( C, A, B, ET(-1), ET(1) );
// Symmetry-exploiting subtraction assignment, enabled by CanExploitSymmetry<MT,MT1,MT2>.
// The product is rewritten with the symmetric operand(s) wrapped in trans() and handed back to
// subAssign:
//   MT1 and MT2 symmetric -> trans( lhs_ ) * trans( rhs_ )
//   only MT1 symmetric    -> trans( lhs_ ) * rhs_
//   otherwise             -> lhs_ * trans( rhs_ )
// NOTE(review): the line carrying the function name and parameter list is not visible in this excerpt.
3701 template<
typename MT >
3702 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3712 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3713 subAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
3714 else if( IsSymmetric<MT1>::value )
3715 subAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
3717 subAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Friend overload enabled by IsEvaluationRequired<MT,MT1,MT2>.
// Visible logic: the first branch tests for an empty target (0 rows or 0 columns), the second for
// a product whose inner dimension rhs.lhs_.columns() is 0.
// NOTE(review): the function name, parameter list and branch bodies are not visible in this excerpt.
3750 template<
typename MT
3752 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3760 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
3763 else if( rhs.lhs_.columns() == 0UL ) {
// Friend overload enabled by IsEvaluationRequired<MT,MT1,MT2> that first materializes the
// expression: 'tmp' is constructed from rhs with type TmpType, chosen between ResultType and
// OppositeType by SO (presumably ResultType when SO is true -- confirm against SelectType).
// NOTE(review): the function name, parameter list and the use of 'tmp' are not visible here.
3799 template<
typename MT
3801 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3806 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
3818 const TmpType tmp( rhs );
// Friend overload enabled by CanExploitSymmetry<MT,MT1,MT2>; branches on which operand type is
// symmetric (both, or MT1 only, with a remaining default case).
// NOTE(review): the function name, parameter list and branch bodies are not visible in this excerpt.
3839 template<
typename MT >
3840 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3850 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3852 else if( IsSymmetric<MT1>::value )
// Friend overload enabled by IsEvaluationRequired<MT,MT1,MT2>.
// Visible logic: a single guard that fires when the target has 0 rows or 0 columns, or when the
// inner dimension rhs.lhs_.columns() is 0.
// NOTE(review): the function name, parameter list and the guarded body are not visible here.
3876 template<
typename MT
3878 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3886 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend overload enabled by CanExploitSymmetry<MT,MT1,MT2>; branches on which operand type is
// symmetric (both, or MT1 only, with a remaining default case).
// NOTE(review): the function name, parameter list and branch bodies are not visible in this excerpt.
3921 template<
typename MT >
3922 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3932 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3934 else if( IsSymmetric<MT1>::value )
// Friend overload enabled by IsEvaluationRequired<MT,MT1,MT2>.
// Visible logic: a single guard that fires when the target has 0 rows or 0 columns, or when the
// inner dimension rhs.lhs_.columns() is 0.
// NOTE(review): the function name, parameter list and the guarded body are not visible here.
3962 template<
typename MT
3964 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
3972 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend overload enabled by CanExploitSymmetry<MT,MT1,MT2>; branches on which operand type is
// symmetric (both, or MT1 only, with a remaining default case).
// NOTE(review): the function name, parameter list and branch bodies are not visible in this excerpt.
4007 template<
typename MT >
4008 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4018 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4020 else if( IsSymmetric<MT1>::value )
// Specialization of DMatScalarMultExpr for a TDMatTDMatMultExpr<MT1,MT2> scaled by a scalar of
// type ST (see the DenseMatrix base-class argument below). It privately derives from the
// MatScalarMultExpr and Computation tag types.
4069 template<
typename MT1
4073 :
public DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2>, ST, true >, true >
4074 ,
private MatScalarMultExpr
4075 ,
private Computation
// MMM: shorthand for the wrapped matrix/matrix multiplication expression type.
4079 typedef TDMatTDMatMultExpr<MT1,MT2> MMM;
// evaluateLeft / evaluateRight: set when the respective operand type is itself a computation
// or reports RequiresEvaluation.
4091 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
4096 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time switch: 'value' is true when the target type T1 is a row-major matrix and at least
// one of the operand types T2, T3 is symmetric.
4106 template<
typename T1,
typename T2,
typename T3 >
4107 struct CanExploitSymmetry {
4108 enum { value = IsRowMajorMatrix<T1>::value &&
4109 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time switch: 'value' is true when at least one operand has to be evaluated
// (evaluateLeft || evaluateRight) and the symmetry shortcut (CanExploitSymmetry) does NOT apply.
4118 template<
typename T1,
typename T2,
typename T3 >
4119 struct IsEvaluationRequired {
4120 enum { value = ( evaluateLeft || evaluateRight ) &&
4121 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time switch deciding whether the BLAS kernels may be used for target T1, operands T2/T3
// and scalar type T4. The visible conditions require: mutable data access on T1 and const data
// access on T2/T3, no diagonal operand, all three types vectorizable, BLAS-compatible and
// identical element types, and exclude the combination of a builtin element type with a complex
// scalar.
// NOTE(review): the first line of the enum (between the struct head and the first visible
// condition) is not visible in this excerpt; it may contribute a further condition.
4129 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4130 struct UseBlasKernel {
4132 HasMutableDataAccess<T1>::value &&
4133 HasConstDataAccess<T2>::value &&
4134 HasConstDataAccess<T3>::value &&
4135 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4136 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4137 IsBlasCompatible<typename T1::ElementType>::value &&
4138 IsBlasCompatible<typename T2::ElementType>::value &&
4139 IsBlasCompatible<typename T3::ElementType>::value &&
4140 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
4141 IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
4142 !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
// Compile-time switch deciding whether the hand-vectorized kernels may be used for target T1,
// operands T2/T3 and scalar type T4. The visible conditions require: T2 is not diagonal, all three
// matrix types are vectorizable, T1's element type equals that of T2, of T3 and the scalar type
// T4, and IntrinsicTrait reports addition, subtraction and multiplication for that element type.
// NOTE(review): the first line of the enum is not visible in this excerpt; it may contribute a
// further condition.
4150 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4151 struct UseVectorizedDefaultKernel {
4153 !IsDiagonal<T2>::value &&
4154 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
4155 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4156 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4157 IsSame<typename T1::ElementType,T4>::value &&
4158 IntrinsicTrait<typename T1::ElementType>::addition &&
4159 IntrinsicTrait<typename T1::ElementType>::subtraction &&
4160 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public/nested type aliases of the scaled product expression.
// This: the type of this expression specialization.
4166 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// ResultType: multiplication result of RES and the scalar type ST, as given by MultTrait.
4167 typedef typename MultTrait<RES,ST>::Type
ResultType;
// IntrinsicType: SIMD packet type associated with ElementType via IntrinsicTrait.
4171 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// LeftOperand: the wrapped matrix product expression (stored as matrix_).
4176 typedef const TDMatTDMatMultExpr<MT1,MT2>
LeftOperand;
// LT / RT: type used for the left / right product operand inside the kernels -- the evaluated
// type (const RT1 / const RT2) when evaluateLeft / evaluateRight is set, otherwise CT1 / CT2.
4182 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
4185 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// vectorizable: set when MT1 is not diagonal, both operand types are vectorizable, the operand
// element types ET1 and ET2 and the scalar type ST are identical, and IntrinsicTrait reports
// addition and multiplication for ET1.
4190 enum { vectorizable = !IsDiagonal<MT1>::value &&
4191 MT1::vectorizable && MT2::vectorizable &&
4192 IsSame<ET1,ET2>::value &&
4193 IsSame<ET1,ST>::value &&
4194 IntrinsicTrait<ET1>::addition &&
4195 IntrinsicTrait<ET1>::multiplication };
// smpAssignable: set only when neither operand needs evaluation and both operand types are
// themselves smpAssignable.
4198 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4199 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix product expression and the scalar factor.
// NOTE(review): the initializer list / body is not visible in this excerpt (presumably it
// initializes matrix_ and scalar_).
4208 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns element (i,j) of the wrapped product multiplied by the scalar.
4221 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4224 return matrix_(i,j) * scalar_;
// Checked element access: i is tested against matrix_.rows() and j against matrix_.columns()
// before the call is forwarded to operator()(i,j).
// NOTE(review): the bodies of the two range checks are not visible in this excerpt
// (presumably they throw an out-of-range exception).
4236 inline ReturnType
at(
size_t i,
size_t j )
const {
4237 if( i >= matrix_.rows() ) {
4240 if( j >= matrix_.columns() ) {
4243 return (*
this)(i,j);
// Returns the number of rows of the wrapped product expression.
4252 inline size_t rows()
const {
4253 return matrix_.rows();
// Returns the number of columns of the wrapped product expression.
4262 inline size_t columns()
const {
4263 return matrix_.columns();
// Aliasing query: forwards the candidate address to the wrapped product expression's canAlias().
4293 template<
typename T >
4294 inline bool canAlias(
const T* alias )
const {
4295 return matrix_.canAlias( alias );
// Aliasing query: forwards the candidate address to the wrapped product expression's isAliased().
4305 template<
typename T >
4306 inline bool isAliased(
const T* alias )
const {
4307 return matrix_.isAliased( alias );
// Alignment query: forwards to the wrapped product expression.
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
4317 return matrix_.isAligned();
// SMP-assignment query (fragment). The visible part of the returned condition requires the
// result size rows()*columns() to be below TDMATTDMATMULT_THRESHOLD (as one alternative of a
// disjunction whose first operand is not visible) and the right operand's column count to exceed
// SMP_TDMATTDMATMULT_THRESHOLD.
// NOTE(review): the function signature and the start of the return expression are not visible.
4327 typename MMM::RightOperand B( matrix_.rightOperand() );
4329 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4330 ( B.columns() > SMP_TDMATTDMATMULT_THRESHOLD );
// Data members: the wrapped matrix product expression and the scalar factor it is multiplied by.
4336 LeftOperand matrix_;
4337 RightOperand scalar_;
// Assignment of the scaled product to a dense matrix; disabled when the symmetry shortcut
// (CanExploitSymmetry<MT,MT1,MT2>) applies.
// Visible logic: the two product operands are fetched from rhs.matrix_, an empty target and a
// zero inner dimension (left.columns() == 0UL) are tested as special cases, and otherwise the
// work is dispatched to selectAssignKernel with the scalar.
// NOTE(review): the special-case bodies and the definitions of A and B (presumably the evaluated
// operands of type LT / RT) are not visible in this excerpt.
4352 template<
typename MT
4354 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4355 assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4362 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4363 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
4365 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4368 else if( left.columns() == 0UL ) {
4383 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for C = scalar * A * B: the small kernel is used when A is diagonal or when the
// target has fewer than TDMATTDMATMULT_THRESHOLD elements; otherwise the BLAS kernel selector is
// called (which itself falls back to the large kernel when BLAS is not applicable).
4398 template<
typename MT3
4402 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4404 if( ( IsDiagonal<MT4>::value ) ||
4405 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
4406 selectSmallAssignKernel( C, A, B, scalar );
4408 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) assignment kernel for the case that neither A nor B is diagonal.
// The target is processed column by column. For column j the k-range [kbegin,kend) is derived
// from the triangular structure of B; the first k initializes the affected rows of C with
// A(i,kbegin) * B(kbegin,j), rows outside the structurally non-zero band are reset, and the
// remaining k values accumulate into C. A final per-column loop over [ibegin,iend) follows.
// NOTE(review): the non-triangular branches of the index computations and several loop bodies
// (including the one of the final loop, presumably the scaling by 'scalar') are not visible here.
4426 template<
typename MT3
4430 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4431 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4433 const size_t M( A.rows() );
4434 const size_t N( B.columns() );
4435 const size_t K( A.columns() );
4437 for(
size_t j=0UL; j<N; ++j )
// k-range of column j, restricted by B's lower/upper structure.
4439 const size_t kbegin( ( IsLower<MT5>::value )
4440 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4442 const size_t kend( ( IsUpper<MT5>::value )
4443 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Special case: strictly triangular B with an empty k-range for this column.
4447 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
4448 for(
size_t i=0UL; i<M; ++i ) {
// Row range touched by the first k (kbegin), restricted by A's structure.
4455 const size_t ibegin( ( IsLower<MT4>::value )
4456 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
4458 const size_t iend( ( IsUpper<MT4>::value )
4459 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
4463 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4464 for(
size_t i=0UL; i<ibegin; ++i ) {
4468 else if( IsStrictlyLower<MT4>::value ) {
// First k: plain assignment initializes C(i,j).
4471 for(
size_t i=ibegin; i<iend; ++i ) {
4472 C(i,j) = A(i,kbegin) * B(kbegin,j);
4474 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4475 for(
size_t i=iend; i<M; ++i ) {
4479 else if( IsStrictlyUpper<MT4>::value ) {
4480 reset( C(M-1UL,j) );
// Remaining k: accumulate; for upper A the row 'iend' is first touched here and therefore
// assigned instead of accumulated.
4484 for(
size_t k=kbegin+1UL; k<kend; ++k )
4486 const size_t ibegin( ( IsLower<MT4>::value )
4487 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4489 const size_t iend( ( IsUpper<MT4>::value )
4490 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
4494 for(
size_t i=ibegin; i<iend; ++i ) {
4495 C(i,j) += A(i,k) * B(k,j);
4497 if( IsUpper<MT4>::value ) {
4498 C(iend,j) = A(iend,k) * B(k,j);
// Row range of the final per-column pass.
4503 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
4504 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
4506 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4507 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
4511 for(
size_t i=ibegin; i<iend; ++i ) {
// Default assignment kernel for a general A and a diagonal B: each column j of C depends only on
// column j of A and the diagonal element B(j,j), so C(i,j) = A(i,j) * B(j,j) * scalar.
// The row range [ibegin,iend) is restricted by A's triangular structure; rows outside that range
// are visited by the two guarded loops.
// NOTE(review): the bodies of the guarded loops (presumably reset( C(i,j) )) and the
// non-triangular branches of ibegin/iend are not visible in this excerpt.
4533 template<
typename MT3
4537 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4538 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4542 const size_t M( A.rows() );
4543 const size_t N( B.columns() );
4545 for(
size_t j=0UL; j<N; ++j )
4547 const size_t ibegin( ( IsLower<MT4>::value )
4548 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4550 const size_t iend( ( IsUpper<MT4>::value )
4551 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
4555 if( IsLower<MT4>::value ) {
4556 for(
size_t i=0UL; i<ibegin; ++i ) {
4560 for(
size_t i=ibegin; i<iend; ++i ) {
4561 C(i,j) = A(i,j) * B(j,j) * scalar;
4563 if( IsUpper<MT4>::value ) {
4564 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for a diagonal A and a general B: each row i of C depends only on the
// diagonal element A(i,i) and row i of B, so C(i,j) = A(i,i) * B(i,j) * scalar.
// The row range [ibegin,iend) of column j is restricted by B's triangular structure.
// NOTE(review): the two guarded loops over the rows outside [ibegin,iend) test IsLower<MT4> /
// IsUpper<MT4> (the diagonal operand) although the ranges are derived from MT5 -- this looks
// intentional only because a diagonal MT4 presumably satisfies both traits; confirm. Their bodies
// are not visible in this excerpt.
4586 template<
typename MT3
4590 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4591 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4595 const size_t M( A.rows() );
4596 const size_t N( B.columns() );
4598 for(
size_t j=0UL; j<N; ++j )
4600 const size_t ibegin( ( IsLower<MT5>::value )
4601 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4603 const size_t iend( ( IsUpper<MT5>::value )
4604 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4608 if( IsLower<MT4>::value ) {
4609 for(
size_t i=0UL; i<ibegin; ++i ) {
4613 for(
size_t i=ibegin; i<iend; ++i ) {
4614 C(i,j) = A(i,i) * B(i,j) * scalar;
4616 if( IsUpper<MT4>::value ) {
4617 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for two diagonal operands: only the diagonal of C is written, with
// C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): any preparation of the off-diagonal elements of C is not visible in this excerpt.
4639 template<
typename MT3
4643 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4644 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4650 for(
size_t i=0UL; i<A.rows(); ++i ) {
4651 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of the small-matrix assignment kernel: selected (DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false; forwards to selectDefaultAssignKernel.
4670 template<
typename MT3
4674 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4675 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4677 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized-path overload for targets passed as DenseMatrix<MT3,false>: instead of computing the
// product directly, one operand is replaced by a temporary 'tmp' and the rewritten scaled product
// is assigned to C. Which operand is replaced is decided by resizability (the non-resizable one
// is kept) and, if that does not discriminate, by size (B is replaced when it has no more
// elements than A).
// NOTE(review): the definitions of 'tmp' are not visible in this excerpt (presumably a copy of
// the replaced operand in the opposite storage order).
4696 template<
typename MT3
4700 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4701 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4708 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
4710 assign( ~C, A * tmp * scalar );
4712 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4714 assign( ~C, tmp * B * scalar );
4716 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
4718 assign( ~C, A * tmp * scalar );
4722 assign( ~C, tmp * B * scalar );
// Vectorized small-matrix kernel for C = scalar * A * B with a target passed as
// DenseMatrix<MT3,true>. Rows of C are processed in groups of 8, 4, 2 and 1 SIMD packets
// (row offsets i + IT::size*n); for each group the accumulators are summed over the k-range
// permitted by the triangular structure of A and B, multiplied by the broadcast scalar 'factor'
// and stored. Rows beyond 'ipos' are handled by the scalar loop at the end.
// NOTE(review): the outer row-loop headers and some declarations (e.g. of 'i', 'j', 'xmm1',
// 'value') are not visible in this excerpt.
4742 template<
typename MT3
4746 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4747 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4749 typedef IntrinsicTrait<ElementType> IT;
4751 const size_t M( A.rows() );
4752 const size_t N( B.columns() );
4753 const size_t K( A.columns() );
// A scalar remainder is needed unless both packet-accessed matrices (C and A) are padded.
4755 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
4757 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
4760 const IntrinsicType factor(
set( scalar ) );
// Eight row packets x one column.
4765 for(
size_t j=0UL; j<N; ++j )
4767 const size_t kbegin( ( IsLower<MT5>::value )
4768 ?( ( IsUpper<MT4>::value )
4769 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4770 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4771 :( IsUpper<MT4>::value ? i : 0UL ) );
4772 const size_t kend( ( IsUpper<MT5>::value )
4773 ?( ( IsLower<MT4>::value )
4774 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4775 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4776 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
4778 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4780 for(
size_t k=kbegin; k<kend; ++k ) {
4781 const IntrinsicType b1(
set( B(k,j) ) );
4782 xmm1 = xmm1 + A.load(i ,k) * b1;
4783 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
4784 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
4785 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
4786 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
4787 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
4788 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
4789 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
4792 (~C).store( i , j, xmm1 * factor );
4793 (~C).store( i+
IT::size , j, xmm2 * factor );
4794 (~C).store( i+
IT::size*2UL, j, xmm3 * factor );
4795 (~C).store( i+
IT::size*3UL, j, xmm4 * factor );
4796 (~C).store( i+
IT::size*4UL, j, xmm5 * factor );
4797 (~C).store( i+
IT::size*5UL, j, xmm6 * factor );
4798 (~C).store( i+
IT::size*6UL, j, xmm7 * factor );
4799 (~C).store( i+
IT::size*7UL, j, xmm8 * factor );
// Four row packets x two columns.
4807 for( ; (j+2UL) <= N; j+=2UL )
4809 const size_t kbegin( ( IsLower<MT5>::value )
4810 ?( ( IsUpper<MT4>::value )
4811 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4812 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4813 :( IsUpper<MT4>::value ? i : 0UL ) );
4814 const size_t kend( ( IsUpper<MT5>::value )
4815 ?( ( IsLower<MT4>::value )
4816 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4817 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4818 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
4820 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4822 for(
size_t k=kbegin; k<kend; ++k ) {
4823 const IntrinsicType a1( A.load(i ,k) );
4824 const IntrinsicType a2( A.load(i+
IT::size ,k) );
4825 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
4826 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
4827 const IntrinsicType b1(
set( B(k,j ) ) );
4828 const IntrinsicType b2(
set( B(k,j+1UL) ) );
4829 xmm1 = xmm1 + a1 * b1;
4830 xmm2 = xmm2 + a2 * b1;
4831 xmm3 = xmm3 + a3 * b1;
4832 xmm4 = xmm4 + a4 * b1;
4833 xmm5 = xmm5 + a1 * b2;
4834 xmm6 = xmm6 + a2 * b2;
4835 xmm7 = xmm7 + a3 * b2;
4836 xmm8 = xmm8 + a4 * b2;
4839 (~C).store( i , j , xmm1 * factor );
4840 (~C).store( i+
IT::size , j , xmm2 * factor );
4841 (~C).store( i+
IT::size*2UL, j , xmm3 * factor );
4842 (~C).store( i+
IT::size*3UL, j , xmm4 * factor );
4843 (~C).store( i , j+1UL, xmm5 * factor );
4844 (~C).store( i+
IT::size , j+1UL, xmm6 * factor );
4845 (~C).store( i+
IT::size*2UL, j+1UL, xmm7 * factor );
4846 (~C).store( i+
IT::size*3UL, j+1UL, xmm8 * factor );
// Four row packets x one (leftover) column.
4851 const size_t kbegin( ( IsLower<MT5>::value )
4852 ?( ( IsUpper<MT4>::value )
4853 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4854 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4855 :( IsUpper<MT4>::value ? i : 0UL ) );
4856 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
4858 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4860 for(
size_t k=kbegin; k<kend; ++k ) {
4861 const IntrinsicType b1(
set( B(k,j) ) );
4862 xmm1 = xmm1 + A.load(i ,k) * b1;
4863 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
4864 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
4865 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
4868 (~C).store( i , j, xmm1 * factor );
4869 (~C).store( i+
IT::size , j, xmm2 * factor );
4870 (~C).store( i+
IT::size*2UL, j, xmm3 * factor );
4871 (~C).store( i+
IT::size*3UL, j, xmm4 * factor );
// Two row packets x two columns.
4879 for( ; (j+2UL) <= N; j+=2UL )
4881 const size_t kbegin( ( IsLower<MT5>::value )
4882 ?( ( IsUpper<MT4>::value )
4883 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4884 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4885 :( IsUpper<MT4>::value ? i : 0UL ) );
4886 const size_t kend( ( IsUpper<MT5>::value )
4887 ?( ( IsLower<MT4>::value )
4888 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4889 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4890 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
4892 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4894 for(
size_t k=kbegin; k<kend; ++k ) {
4895 const IntrinsicType a1( A.load(i ,k) );
4896 const IntrinsicType a2( A.load(i+
IT::size,k) );
4897 const IntrinsicType b1(
set( B(k,j ) ) );
4898 const IntrinsicType b2(
set( B(k,j+1UL) ) );
4899 xmm1 = xmm1 + a1 * b1;
4900 xmm2 = xmm2 + a2 * b1;
4901 xmm3 = xmm3 + a1 * b2;
4902 xmm4 = xmm4 + a2 * b2;
4905 (~C).store( i , j , xmm1 * factor );
4906 (~C).store( i+
IT::size, j , xmm2 * factor );
4907 (~C).store( i , j+1UL, xmm3 * factor );
4908 (~C).store( i+
IT::size, j+1UL, xmm4 * factor );
// Two row packets x one (leftover) column.
4913 const size_t kbegin( ( IsLower<MT5>::value )
4914 ?( ( IsUpper<MT4>::value )
4915 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4916 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4917 :( IsUpper<MT4>::value ? i : 0UL ) );
4918 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
4920 IntrinsicType xmm1, xmm2;
4922 for(
size_t k=kbegin; k<kend; ++k ) {
4923 const IntrinsicType b1(
set( B(k,j) ) );
4924 xmm1 = xmm1 + A.load(i ,k) * b1;
4925 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
4928 (~C).store( i , j, xmm1 * factor );
4929 (~C).store( i+
IT::size, j, xmm2 * factor );
// One row packet x two columns.
4937 for( ; (j+2UL) <= N; j+=2UL )
4939 const size_t kbegin( ( IsLower<MT5>::value )
4940 ?( ( IsUpper<MT4>::value )
4941 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4942 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4943 :( IsUpper<MT4>::value ? i : 0UL ) );
4944 const size_t kend( ( IsUpper<MT5>::value )
4945 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4948 IntrinsicType xmm1, xmm2;
4950 for(
size_t k=kbegin; k<kend; ++k ) {
4951 const IntrinsicType a1( A.load(i,k) );
4952 xmm1 = xmm1 + a1 *
set( B(k,j ) );
4953 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
4956 (~C).store( i, j , xmm1 * factor );
4957 (~C).store( i, j+1UL, xmm2 * factor );
// One row packet x one (leftover) column.
4962 const size_t kbegin( ( IsLower<MT5>::value )
4963 ?( ( IsUpper<MT4>::value )
4964 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4965 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4966 :( IsUpper<MT4>::value ? i : 0UL ) );
4970 for(
size_t k=kbegin; k<K; ++k ) {
4971 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
4974 (~C).store( i, j, xmm1 * factor );
// Scalar remainder: rows in [ipos, M), two columns at a time, then a leftover column.
4978 for( ; remainder && i<M; ++i )
4982 for( ; (j+2UL) <= N; j+=2UL )
4984 const size_t kbegin( ( IsLower<MT5>::value )
4985 ?( ( IsUpper<MT4>::value )
4986 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4987 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4988 :( IsUpper<MT4>::value ? i : 0UL ) );
4989 const size_t kend( ( IsUpper<MT5>::value )
4990 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4996 for(
size_t k=kbegin; k<kend; ++k ) {
4997 value1 += A(i,k) * B(k,j );
4998 value2 += A(i,k) * B(k,j+1UL);
5001 (~C)(i,j ) = value1 * scalar;
5002 (~C)(i,j+1UL) = value2 * scalar;
5007 const size_t kbegin( ( IsLower<MT5>::value )
5008 ?( ( IsUpper<MT4>::value )
5009 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5010 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5011 :( IsUpper<MT4>::value ? i : 0UL ) );
5015 for(
size_t k=kbegin; k<K; ++k ) {
5016 value += A(i,k) * B(k,j);
5019 (~C)(i,j) = value * scalar;
// Fallback overload of the large-matrix assignment kernel: selected (DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> is false; forwards to selectDefaultAssignKernel.
5039 template<
typename MT3
5043 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5044 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5046 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized-path overload for targets passed as DenseMatrix<MT3,false>: it contains no blocked
// implementation of its own and delegates to selectSmallAssignKernel on the unwrapped target (~C).
5065 template<
typename MT3
5069 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5070 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5072 selectSmallAssignKernel( ~C, A, B, scalar );
// Vectorized large-matrix kernel for C = scalar * A * B with a target passed as
// DenseMatrix<MT3,true>. The iteration space is tiled with TDMATTDMATMULT_{I,J,K}BLOCK_SIZE.
// Because the k dimension is tiled, every micro-kernel accumulates a partial sum in fresh
// registers and then ADDS 'partial * factor' onto the current content of C; the (i,j) tile of C
// is visited by a dedicated double loop before the k tiles are processed.
// Rows beyond 'ipos' are handled by the scalar loop at the end.
// NOTE(review): the body of the double loop (presumably reset( (~C)(i,j) )), the row-loop
// headers and the definitions of i, i1, i2, i3, j, xmm1 and value are not visible in this excerpt.
5091 template<
typename MT3
5095 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5096 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5098 typedef IntrinsicTrait<ElementType> IT;
5100 const size_t M( A.rows() );
5101 const size_t N( B.columns() );
5102 const size_t K( A.columns() );
// A scalar remainder is needed unless both packet-accessed matrices (C and A) are padded.
5104 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
5106 const IntrinsicType factor(
set( scalar ) );
5108 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
5110 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
5112 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
5115 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
5117 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Pass over the current (i,j) tile of C before accumulation starts.
5119 for(
size_t j=jj; j<jend; ++j ) {
5120 for(
size_t i=ii; i<iend; ++i ) {
5125 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
5127 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// Micro-kernel: four row packets x two columns.
5139 for( ; (j+2UL) <= jend; j+=2UL )
5141 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5142 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5143 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5144 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5146 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5148 for(
size_t k=kbegin; k<kend; ++k ) {
5149 const IntrinsicType a1( A.load(i ,k) );
5150 const IntrinsicType a2( A.load(i1,k) );
5151 const IntrinsicType a3( A.load(i2,k) );
5152 const IntrinsicType a4( A.load(i3,k) );
5153 const IntrinsicType b1(
set( B(k,j ) ) );
5154 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5155 xmm1 = xmm1 + a1 * b1;
5156 xmm2 = xmm2 + a2 * b1;
5157 xmm3 = xmm3 + a3 * b1;
5158 xmm4 = xmm4 + a4 * b1;
5159 xmm5 = xmm5 + a1 * b2;
5160 xmm6 = xmm6 + a2 * b2;
5161 xmm7 = xmm7 + a3 * b2;
5162 xmm8 = xmm8 + a4 * b2;
5165 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5166 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5167 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
5168 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
5169 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5170 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
5171 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
5172 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// Micro-kernel: four row packets x one (leftover) column.
5177 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5178 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5179 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
5180 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5182 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5184 for(
size_t k=kbegin; k<kend; ++k ) {
5185 const IntrinsicType b1(
set( B(k,j) ) );
5186 xmm1 = xmm1 + A.load(i ,k) * b1;
5187 xmm2 = xmm2 + A.load(i1,k) * b1;
5188 xmm3 = xmm3 + A.load(i2,k) * b1;
5189 xmm4 = xmm4 + A.load(i3,k) * b1;
5192 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5193 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
5194 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
5195 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// Micro-kernel: two row packets x four columns.
5205 for( ; (j+4UL) <= jend; j+=4UL )
5207 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5208 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5209 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5210 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5212 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5214 for(
size_t k=kbegin; k<kend; ++k ) {
5215 const IntrinsicType a1( A.load(i ,k) );
5216 const IntrinsicType a2( A.load(i1,k) );
5217 const IntrinsicType b1(
set( B(k,j ) ) );
5218 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5219 const IntrinsicType b3(
set( B(k,j+2UL) ) );
5220 const IntrinsicType b4(
set( B(k,j+3UL) ) );
5221 xmm1 = xmm1 + a1 * b1;
5222 xmm2 = xmm2 + a2 * b1;
5223 xmm3 = xmm3 + a1 * b2;
5224 xmm4 = xmm4 + a2 * b2;
5225 xmm5 = xmm5 + a1 * b3;
5226 xmm6 = xmm6 + a2 * b3;
5227 xmm7 = xmm7 + a1 * b4;
5228 xmm8 = xmm8 + a2 * b4;
5231 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5232 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5233 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5234 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
5235 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
5236 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
5237 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
5238 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// Micro-kernel: two row packets x two columns.
5241 for( ; (j+2UL) <= jend; j+=2UL )
5243 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5244 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5245 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5246 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5248 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5250 for(
size_t k=kbegin; k<kend; ++k ) {
5251 const IntrinsicType a1( A.load(i ,k) );
5252 const IntrinsicType a2( A.load(i1,k) );
5253 const IntrinsicType b1(
set( B(k,j ) ) );
5254 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5255 xmm1 = xmm1 + a1 * b1;
5256 xmm2 = xmm2 + a2 * b1;
5257 xmm3 = xmm3 + a1 * b2;
5258 xmm4 = xmm4 + a2 * b2;
5261 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5262 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
5263 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5264 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// Micro-kernel: two row packets x one (leftover) column.
5269 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5270 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5271 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
5272 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5274 IntrinsicType xmm1, xmm2;
5276 for(
size_t k=kbegin; k<kend; ++k ) {
5277 const IntrinsicType b1(
set( B(k,j) ) );
5278 xmm1 = xmm1 + A.load(i ,k) * b1;
5279 xmm2 = xmm2 + A.load(i1,k) * b1;
5282 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5283 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// Micro-kernel: a single row packet, one column at a time.
5289 for(
size_t j=jj; j<jend; ++j )
5291 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5292 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5293 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
5294 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5298 for(
size_t k=kbegin; k<kend; ++k ) {
5299 const IntrinsicType b1(
set( B(k,j) ) );
5300 xmm1 = xmm1 + A.load(i,k) * b1;
5303 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar remainder: rows in [ipos, iend) that cannot be handled with packet access.
5307 for( ; remainder && i<iend; ++i )
5309 for(
size_t j=jj; j<jend; ++j )
5311 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5312 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5313 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
5314 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5318 for(
size_t k=kbegin; k<kend; ++k ) {
5319 value += A(i,k) * B(k,j);
5322 (~C)(i,j) += value * scalar;
// Fallback overload of the BLAS assignment kernel: selected (DisableIf) when
// UseBlasKernel<MT3,MT4,MT5,ST2> is false; forwards to selectLargeAssignKernel.
5345 template<
typename MT3
5349 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5350 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5352 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based assignment kernel for a scaled matrix product. It is selected
// whenever UseBlasKernel<MT3,MT4,MT5,ST2> holds.
//
// Dispatch:
//  - A triangular: trmm() with A applied from the left (CblasLeft)
//  - B triangular: trmm() with B applied from the right (CblasRight)
//  - otherwise   : gemm() on C, A and B
// The scalar is converted to ET before it is handed to the BLAS wrappers.
5371 template<
typename MT3
5375 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5376 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Left operand is triangular: pick the lower/upper variant from IsLower<MT4>.
// NOTE(review): trmm() works on C in place, so C presumably has to hold the
// other operand before this call - confirm against the surrounding code.
5380 if( IsTriangular<MT4>::value ) {
5382 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Right operand is triangular: same scheme, applied from the right.
5384 else if( IsTriangular<MT5>::value ) {
5386 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case. The trailing ET(0) is presumably the factor applied to the
// previous content of C (i.e. C is overwritten) - confirm against gemm().
5389 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of a scaled matrix product to a sparse matrix. This overload is
// only available if CanExploitSymmetry<MT,MT1,MT2> does not hold.
//
// The expression is first evaluated serially into a temporary and the
// temporary is then assigned to the sparse target.
5407 template<
typename MT
5409 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5410 assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The temporary is of ResultType if SO is true and of OppositeType otherwise,
// i.e. its type is chosen according to the storage order flag of the target.
5414 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
// serial() is applied to rhs before the temporary is constructed from it.
5426 const TmpType tmp(
serial( rhs ) );
5427 assign( ~lhs, tmp );
// Restructuring assignment of a scaled matrix product to a matrix with storage
// order flag 'false'. This overload is only available if
// CanExploitSymmetry<MT,MT1,MT2> holds, i.e. the target is row-major and at
// least one of the two operands is symmetric.
//
// Every symmetric operand is replaced by its transpose and the resulting
// expression is assigned instead of the original one.
5445 template<
typename MT >
5446 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5447 assign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix multiplication expression.
5456 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5457 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
5459 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5460 assign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand is symmetric: transpose the left operand.
5461 else if( IsSymmetric<MT1>::value )
5462 assign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case: transpose the right operand.
5464 assign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Addition assignment of a scaled matrix product to a dense matrix. This
// overload is only available if CanExploitSymmetry<MT,MT1,MT2> does not hold.
//
// The actual work is delegated to selectAddAssignKernel().
5480 template<
typename MT
5482 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5483 addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix multiplication expression.
5490 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5491 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns, or the inner dimension
// of the product is zero.
5493 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection.
// NOTE(review): A and B are presumably the evaluated operands 'left' and
// 'right'; their definitions are not adjacent to this call - confirm.
5507 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment of a scaled matrix product.
//
// The small kernel is used if the left operand is diagonal or if the number of
// elements of C (rows * columns) is below TDMATTDMATMULT_THRESHOLD. The BLAS
// kernel selection is the alternative path.
//
// C       target matrix of the addition assignment
// A, B    left and right operand of the product
// scalar  scaling factor applied to the product
5522 template<
typename MT3
5526 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5528 if( ( IsDiagonal<MT4>::value ) ||
5529 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5530 selectSmallAddAssignKernel( C, A, B, scalar );
5532 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the case that neither operand is
// diagonal.
//
// The scaled product A * B * scalar is evaluated serially into a temporary of
// type ResultType, which is then added to C.
5550 template<
typename MT3
5554 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5555 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5557 const ResultType tmp(
serial( A * B * scalar ) );
5558 addAssign( C, tmp );
// Default addition assignment kernel for a non-diagonal left operand A and a
// diagonal right operand B.
//
// Since only the diagonal element B(j,j) is used, column j of C is updated by
// column j of A scaled with B(j,j) and 'scalar'.
5576 template<
typename MT3
5580 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5581 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5585 const size_t M( A.rows() );
5586 const size_t N( B.columns() );
// Column-wise traversal of the target.
5588 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j. For a lower A the range starts at row j
// (j+1 if strictly lower); for an upper A it ends behind row j (at row j if
// strictly upper).
5590 const size_t ibegin( ( IsLower<MT4>::value )
5591 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
5593 const size_t iend( ( IsUpper<MT4>::value )
5594 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// inum is the number of rows to process; ipos marks the end of the part that
// can be handled two rows at a time (inum with its lowest bit cleared).
5598 const size_t inum( iend - ibegin );
5599 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
5601 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5602 C(i ,j) += A(i ,j) * B(j,j) * scalar;
5603 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the single row at index ipos (the leftover row of an odd range).
5606 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition assignment kernel for a diagonal left operand A and a
// non-diagonal right operand B.
//
// Since only the diagonal element A(i,i) is used, element (i,j) of C is
// updated by B(i,j) scaled with A(i,i) and 'scalar'.
5626 template<
typename MT3
5630 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5631 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5635 const size_t M( A.rows() );
5636 const size_t N( B.columns() );
// Column-wise traversal of the target.
5638 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j. For a lower B the range starts at row j
// (j+1 if strictly lower); for an upper B it ends behind row j (at row j if
// strictly upper).
5640 const size_t ibegin( ( IsLower<MT5>::value )
5641 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
5643 const size_t iend( ( IsUpper<MT5>::value )
5644 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum is the number of rows to process; ipos marks the end of the part that
// can be handled two rows at a time (inum with its lowest bit cleared).
5648 const size_t inum( iend - ibegin );
5649 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
5651 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5652 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
5653 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the single row at index ipos (the leftover row of an odd range).
5656 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition assignment kernel for two diagonal operands.
//
// Only the diagonal of C is touched: C(i,i) is increased by the product of the
// two diagonal elements A(i,i) and B(i,i), scaled with 'scalar'.
5676 template<
typename MT3
5680 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5681 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5685 for(
size_t i=0UL; i<A.rows(); ++i ) {
5686 C(i,i) += A(i,i) * B(i,i) * scalar;
// Fallback overload of the small addition assignment kernel. It is selected
// whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// forwards all arguments to selectDefaultAddAssignKernel().
5705 template<
typename MT3
5709 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5710 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5712 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small addition assignment kernel for targets with storage order flag
// 'false', selected whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds.
//
// One of the two operands is replaced by a temporary 'tmp' and the resulting
// scaled product is added to C via addAssign(). The branches below decide
// which operand is replaced.
// NOTE(review): 'tmp' is declared inside each branch; presumably it is a copy
// of the replaced operand in the opposite storage order - confirm.
5731 template<
typename MT3
5735 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5736 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is resizable, B is not: replace the right operand.
5743 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
5745 addAssign( ~C, A * tmp * scalar );
// B is resizable, A is not: replace the left operand.
5747 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
5749 addAssign( ~C, tmp * B * scalar );
// Otherwise decide by size: B has at most as many elements as A, so replace
// the right operand.
5751 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5753 addAssign( ~C, A * tmp * scalar );
// Remaining case: replace the left operand.
5757 addAssign( ~C, tmp * B * scalar );
// Vectorized small addition assignment kernel for targets with storage order
// flag 'true', selected whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds.
//
// The kernel accumulates A * B in intrinsic registers and adds the result,
// multiplied by 'scalar', to C. Rows are processed in panels of 8, 4, 2 and 1
// intrinsic vectors (IT::size rows each), columns singly or in pairs. A final
// scalar loop handles the rows that do not fill a complete vector.
// Throughout, the k range of the inner product is narrowed according to the
// lower/upper properties of A (MT4) and B (MT5).
5777 template<
typename MT3
5781 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5782 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5784 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product.
5786 const size_t M( A.rows() );
5787 const size_t N( B.columns() );
5788 const size_t K( A.columns() );
// A scalar remainder loop is required unless both C and A are padded.
5790 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// End of the vectorized row range: M with the low bits cleared (a multiple of
// IT::size, assuming IT::size is a power of two) if a remainder loop is
// required, M otherwise.
5792 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// The scalar broadcast into an intrinsic vector.
5795 const IntrinsicType factor(
set( scalar ) );
// ---- Panel of 8 intrinsic vectors, one column at a time ----
5800 for(
size_t j=0UL; j<N; ++j )
// kbegin: raised to j (j+1 if strictly lower) for a lower B and to i for an
// upper A. kend: limited to j+1 (j if strictly upper) for an upper B and to
// the end of the row panel (i+IT::size*8) for a lower A.
5802 const size_t kbegin( ( IsLower<MT5>::value )
5803 ?( ( IsUpper<MT4>::value )
5804 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5805 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5806 :( IsUpper<MT4>::value ? i : 0UL ) );
5807 const size_t kend( ( IsUpper<MT5>::value )
5808 ?( ( IsLower<MT4>::value )
5809 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
5810 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
5811 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// One accumulator per intrinsic vector of the panel.
5813 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// B(k,j) is broadcast once per k and multiplied with eight vectors of A.
5815 for(
size_t k=kbegin; k<kend; ++k ) {
5816 const IntrinsicType b1(
set( B(k,j) ) );
5817 xmm1 = xmm1 + A.load(i ,k) * b1;
5818 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
5819 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
5820 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
5821 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
5822 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
5823 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
5824 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Scale the accumulators and add them to the current content of C.
5827 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5829 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) + xmm3 * factor );
5830 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) + xmm4 * factor );
5831 (~C).store( i+
IT::size*4UL, j, (~C).load(i+
IT::size*4UL,j) + xmm5 * factor );
5832 (~C).store( i+
IT::size*5UL, j, (~C).load(i+
IT::size*5UL,j) + xmm6 * factor );
5833 (~C).store( i+
IT::size*6UL, j, (~C).load(i+
IT::size*6UL,j) + xmm7 * factor );
5834 (~C).store( i+
IT::size*7UL, j, (~C).load(i+
IT::size*7UL,j) + xmm8 * factor );
// ---- Panel of 4 intrinsic vectors, two columns at a time ----
5842 for( ; (j+2UL) <= N; j+=2UL )
// Same k range logic as above, adapted to a panel of 4 vectors and to the
// column pair (j,j+1).
5844 const size_t kbegin( ( IsLower<MT5>::value )
5845 ?( ( IsUpper<MT4>::value )
5846 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5847 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5848 :( IsUpper<MT4>::value ? i : 0UL ) );
5849 const size_t kend( ( IsUpper<MT5>::value )
5850 ?( ( IsLower<MT4>::value )
5851 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5852 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5853 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
5855 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5857 for(
size_t k=kbegin; k<kend; ++k ) {
5858 const IntrinsicType a1( A.load(i ,k) );
5859 const IntrinsicType a2( A.load(i+
IT::size ,k) );
5860 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
5861 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
5862 const IntrinsicType b1(
set( B(k,j ) ) );
5863 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5864 xmm1 = xmm1 + a1 * b1;
5865 xmm2 = xmm2 + a2 * b1;
5866 xmm3 = xmm3 + a3 * b1;
5867 xmm4 = xmm4 + a4 * b1;
5868 xmm5 = xmm5 + a1 * b2;
5869 xmm6 = xmm6 + a2 * b2;
5870 xmm7 = xmm7 + a3 * b2;
5871 xmm8 = xmm8 + a4 * b2;
// Scale the accumulators and add them to the current content of C.
5874 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5876 (~C).store( i+
IT::size*2UL, j , (~C).load(i+
IT::size*2UL,j ) + xmm3 * factor );
5877 (~C).store( i+
IT::size*3UL, j , (~C).load(i+
IT::size*3UL,j ) + xmm4 * factor );
5878 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
5879 (~C).store( i+
IT::size , j+1UL, (~C).load(i+
IT::size ,j+1UL) + xmm6 * factor );
5880 (~C).store( i+
IT::size*2UL, j+1UL, (~C).load(i+
IT::size*2UL,j+1UL) + xmm7 * factor );
5881 (~C).store( i+
IT::size*3UL, j+1UL, (~C).load(i+
IT::size*3UL,j+1UL) + xmm8 * factor );
// ---- Panel of 4 intrinsic vectors, single column ----
// Here kend only depends on A: for a lower A it is limited to the end of the
// row panel, otherwise it is K.
5886 const size_t kbegin( ( IsLower<MT5>::value )
5887 ?( ( IsUpper<MT4>::value )
5888 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5889 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5890 :( IsUpper<MT4>::value ? i : 0UL ) );
5891 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
5893 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5895 for(
size_t k=kbegin; k<kend; ++k ) {
5896 const IntrinsicType b1(
set( B(k,j) ) );
5897 xmm1 = xmm1 + A.load(i ,k) * b1;
5898 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
5899 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
5900 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
// Scale the accumulators and add them to the current content of C.
5903 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5905 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) + xmm3 * factor );
5906 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) + xmm4 * factor );
// ---- Panel of 2 intrinsic vectors, two columns at a time ----
5914 for( ; (j+2UL) <= N; j+=2UL )
5916 const size_t kbegin( ( IsLower<MT5>::value )
5917 ?( ( IsUpper<MT4>::value )
5918 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5919 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5920 :( IsUpper<MT4>::value ? i : 0UL ) );
5921 const size_t kend( ( IsUpper<MT5>::value )
5922 ?( ( IsLower<MT4>::value )
5923 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
5924 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
5925 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
5927 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5929 for(
size_t k=kbegin; k<kend; ++k ) {
5930 const IntrinsicType a1( A.load(i ,k) );
5931 const IntrinsicType a2( A.load(i+
IT::size,k) );
5932 const IntrinsicType b1(
set( B(k,j ) ) );
5933 const IntrinsicType b2(
set( B(k,j+1UL) ) );
5934 xmm1 = xmm1 + a1 * b1;
5935 xmm2 = xmm2 + a2 * b1;
5936 xmm3 = xmm3 + a1 * b2;
5937 xmm4 = xmm4 + a2 * b2;
// Scale the accumulators and add them to the current content of C.
5940 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5942 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
5943 (~C).store( i+
IT::size, j+1UL, (~C).load(i+
IT::size,j+1UL) + xmm4 * factor );
// ---- Panel of 2 intrinsic vectors, single column ----
5948 const size_t kbegin( ( IsLower<MT5>::value )
5949 ?( ( IsUpper<MT4>::value )
5950 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5951 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5952 :( IsUpper<MT4>::value ? i : 0UL ) );
5953 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
5955 IntrinsicType xmm1, xmm2;
5957 for(
size_t k=kbegin; k<kend; ++k ) {
5958 const IntrinsicType b1(
set( B(k,j) ) );
5959 xmm1 = xmm1 + A.load(i ,k) * b1;
5960 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
5963 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
// ---- Single intrinsic vector, two columns at a time ----
5972 for( ; (j+2UL) <= N; j+=2UL )
5974 const size_t kbegin( ( IsLower<MT5>::value )
5975 ?( ( IsUpper<MT4>::value )
5976 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
5977 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
5978 :( IsUpper<MT4>::value ? i : 0UL ) );
5979 const size_t kend( ( IsUpper<MT5>::value )
5980 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
// xmm1 accumulates column j, xmm2 accumulates column j+1.
5983 IntrinsicType xmm1, xmm2;
5985 for(
size_t k=kbegin; k<kend; ++k ) {
5986 const IntrinsicType a1( A.load(i,k) );
5987 xmm1 = xmm1 + a1 *
set( B(k,j ) );
5988 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
5991 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5992 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// ---- Single intrinsic vector, single column ----
// The inner product runs up to K here.
5997 const size_t kbegin( ( IsLower<MT5>::value )
5998 ?( ( IsUpper<MT4>::value )
5999 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6000 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6001 :( IsUpper<MT4>::value ? i : 0UL ) );
6005 for(
size_t k=kbegin; k<K; ++k ) {
6006 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
6009 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// ---- Scalar remainder rows (only executed if 'remainder' is set) ----
6013 for( ; remainder && i<M; ++i )
// Two columns at a time, using element access instead of intrinsic loads.
6017 for( ; (j+2UL) <= N; j+=2UL )
6019 const size_t kbegin( ( IsLower<MT5>::value )
6020 ?( ( IsUpper<MT4>::value )
6021 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6022 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6023 :( IsUpper<MT4>::value ? i : 0UL ) );
6024 const size_t kend( ( IsUpper<MT5>::value )
6025 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
6031 for(
size_t k=kbegin; k<kend; ++k ) {
6032 value1 += A(i,k) * B(k,j );
6033 value2 += A(i,k) * B(k,j+1UL);
6036 (~C)(i,j ) += value1 * scalar;
6037 (~C)(i,j+1UL) += value2 * scalar;
// Single column of the scalar remainder.
6042 const size_t kbegin( ( IsLower<MT5>::value )
6043 ?( ( IsUpper<MT4>::value )
6044 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6045 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6046 :( IsUpper<MT4>::value ? i : 0UL ) );
6050 for(
size_t k=kbegin; k<K; ++k ) {
6051 value += A(i,k) * B(k,j);
6054 (~C)(i,j) += value * scalar;
// Fallback overload of the large addition assignment kernel. It is selected
// whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// forwards all arguments to selectDefaultAddAssignKernel().
6074 template<
typename MT3
6078 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6079 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6081 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large addition assignment kernel for targets with storage order flag
// 'false', selected whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds. There is no dedicated large kernel for this case: the call is
// forwarded to selectSmallAddAssignKernel().
6100 template<
typename MT3
6104 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6105 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6107 selectSmallAddAssignKernel( ~C, A, B, scalar );
// Vectorized large addition assignment kernel for targets with storage order
// flag 'true', selected whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds.
//
// The iteration space is tiled in all three dimensions with the block sizes
// TDMATTDMATMULT_IBLOCK_SIZE, TDMATTDMATMULT_JBLOCK_SIZE and
// TDMATTDMATMULT_KBLOCK_SIZE. Within a tile, rows are processed in panels of
// 4, 2 and 1 intrinsic vectors, followed by a scalar loop for the remaining
// rows. Each partial product is multiplied by 'scalar' and added to C.
// NOTE(review): i1, i2 and i3 are the row indices of the 2nd, 3rd and 4th
// vector of a panel; presumably i+IT::size, i+IT::size*2 and i+IT::size*3 -
// confirm against their definitions.
6126 template<
typename MT3
6130 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6131 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6133 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product.
6135 const size_t M( A.rows() );
6136 const size_t N( B.columns() );
6137 const size_t K( A.columns() );
// A scalar remainder loop is required unless both C and A are padded.
6139 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// The scalar broadcast into an intrinsic vector.
6141 const IntrinsicType factor(
set( scalar ) );
// Tiling of the rows.
6143 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
6145 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
// End of the vectorized row range of this tile: iend with the low bits cleared
// (a multiple of IT::size, assuming IT::size is a power of two) if a remainder
// loop is required, iend otherwise.
6147 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Tiling of the columns.
6150 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
6152 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
// Tiling of the inner dimension; ktmp is the end of the current k tile.
6154 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
6156 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
// ---- Panel of 4 intrinsic vectors, two columns at a time ----
6168 for( ; (j+2UL) <= jend; j+=2UL )
// k range within the tile [kk,ktmp): the start is raised to i for an upper A
// and to j for a lower B; the end is limited to the end of the row panel for a
// lower A and to j+2 for an upper B.
6170 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6171 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6172 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
6173 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
6175 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6177 for(
size_t k=kbegin; k<kend; ++k ) {
6178 const IntrinsicType a1( A.load(i ,k) );
6179 const IntrinsicType a2( A.load(i1,k) );
6180 const IntrinsicType a3( A.load(i2,k) );
6181 const IntrinsicType a4( A.load(i3,k) );
6182 const IntrinsicType b1(
set( B(k,j ) ) );
6183 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6184 xmm1 = xmm1 + a1 * b1;
6185 xmm2 = xmm2 + a2 * b1;
6186 xmm3 = xmm3 + a3 * b1;
6187 xmm4 = xmm4 + a4 * b1;
6188 xmm5 = xmm5 + a1 * b2;
6189 xmm6 = xmm6 + a2 * b2;
6190 xmm7 = xmm7 + a3 * b2;
6191 xmm8 = xmm8 + a4 * b2;
// Scale the accumulators and add them to the current content of C.
6194 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6195 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6196 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
6197 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
6198 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
6199 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
6200 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
6201 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// ---- Panel of 4 intrinsic vectors, single column ----
6206 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6207 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6208 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
6209 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6211 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6213 for(
size_t k=kbegin; k<kend; ++k ) {
6214 const IntrinsicType b1(
set( B(k,j) ) );
6215 xmm1 = xmm1 + A.load(i ,k) * b1;
6216 xmm2 = xmm2 + A.load(i1,k) * b1;
6217 xmm3 = xmm3 + A.load(i2,k) * b1;
6218 xmm4 = xmm4 + A.load(i3,k) * b1;
6221 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6222 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
6223 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
6224 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// ---- Panel of 2 intrinsic vectors, four columns at a time ----
6234 for( ; (j+4UL) <= jend; j+=4UL )
6236 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6237 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6238 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
6239 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column: (xmm1,xmm2) for j up to (xmm7,xmm8) for j+3.
6241 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6243 for(
size_t k=kbegin; k<kend; ++k ) {
6244 const IntrinsicType a1( A.load(i ,k) );
6245 const IntrinsicType a2( A.load(i1,k) );
6246 const IntrinsicType b1(
set( B(k,j ) ) );
6247 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6248 const IntrinsicType b3(
set( B(k,j+2UL) ) );
6249 const IntrinsicType b4(
set( B(k,j+3UL) ) );
6250 xmm1 = xmm1 + a1 * b1;
6251 xmm2 = xmm2 + a2 * b1;
6252 xmm3 = xmm3 + a1 * b2;
6253 xmm4 = xmm4 + a2 * b2;
6254 xmm5 = xmm5 + a1 * b3;
6255 xmm6 = xmm6 + a2 * b3;
6256 xmm7 = xmm7 + a1 * b4;
6257 xmm8 = xmm8 + a2 * b4;
// Scale the accumulators and add them to the current content of C.
6260 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6261 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6262 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6263 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
6264 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
6265 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
6266 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
6267 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// ---- Panel of 2 intrinsic vectors, two columns at a time ----
6270 for( ; (j+2UL) <= jend; j+=2UL )
6272 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6273 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6274 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
6275 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
6277 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6279 for(
size_t k=kbegin; k<kend; ++k ) {
6280 const IntrinsicType a1( A.load(i ,k) );
6281 const IntrinsicType a2( A.load(i1,k) );
6282 const IntrinsicType b1(
set( B(k,j ) ) );
6283 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6284 xmm1 = xmm1 + a1 * b1;
6285 xmm2 = xmm2 + a2 * b1;
6286 xmm3 = xmm3 + a1 * b2;
6287 xmm4 = xmm4 + a2 * b2;
6290 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6291 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
6292 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
6293 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// ---- Panel of 2 intrinsic vectors, single column ----
6298 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6299 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6300 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
6301 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6303 IntrinsicType xmm1, xmm2;
6305 for(
size_t k=kbegin; k<kend; ++k ) {
6306 const IntrinsicType b1(
set( B(k,j) ) );
6307 xmm1 = xmm1 + A.load(i ,k) * b1;
6308 xmm2 = xmm2 + A.load(i1,k) * b1;
6311 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
6312 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// ---- Single intrinsic vector, one column at a time ----
6318 for(
size_t j=jj; j<jend; ++j )
6320 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6321 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6322 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
6323 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6327 for(
size_t k=kbegin; k<kend; ++k ) {
6328 const IntrinsicType b1(
set( B(k,j) ) );
6329 xmm1 = xmm1 + A.load(i,k) * b1;
6332 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// ---- Scalar remainder rows of the tile (only if 'remainder' is set) ----
6336 for( ; remainder && i<iend; ++i )
6338 for(
size_t j=jj; j<jend; ++j )
// For a single row the limit for a lower A is i+1.
6340 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6341 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6342 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
6343 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
6347 for(
size_t k=kbegin; k<kend; ++k ) {
6348 value += A(i,k) * B(k,j);
6351 (~C)(i,j) += value * scalar;
// Fallback overload of the BLAS-based addition assignment kernel. It is
// selected whenever UseBlasKernel<MT3,MT4,MT5,ST2> does not hold and forwards
// all arguments to selectLargeAddAssignKernel().
6375 template<
typename MT3
6379 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6380 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6382 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based addition assignment kernel for a scaled matrix product. It is
// selected whenever UseBlasKernel<MT3,MT4,MT5,ST2> holds.
//
// Dispatch:
//  - A triangular: trmm() on a temporary with A applied from the left, then
//                  the temporary is added to C
//  - B triangular: trmm() on a temporary with B applied from the right, then
//                  the temporary is added to C
//  - otherwise   : gemm() directly on C
// NOTE(review): 'tmp' is declared inside each triangular branch; presumably it
// is initialized with the respective other operand - confirm.
6401 template<
typename MT3
6405 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6406 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Left operand is triangular: pick the lower/upper variant from IsLower<MT4>.
6410 if( IsTriangular<MT4>::value ) {
6412 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6413 addAssign( C, tmp );
// Right operand is triangular: same scheme, applied from the right.
6415 else if( IsTriangular<MT5>::value ) {
6417 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6418 addAssign( C, tmp );
// General case. The trailing ET(1) is presumably the factor applied to the
// previous content of C (i.e. the product is accumulated) - confirm against
// gemm().
6421 gemm( C, A, B, ET(scalar), ET(1) );
// Restructuring addition assignment of a scaled matrix product to a matrix
// with storage order flag 'false'. This overload is only available if
// CanExploitSymmetry<MT,MT1,MT2> holds, i.e. the target is row-major and at
// least one of the two operands is symmetric.
//
// Every symmetric operand is replaced by its transpose and the resulting
// expression is added to the target instead of the original one.
6442 template<
typename MT >
6443 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6444 addAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix multiplication expression.
6453 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6454 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
6456 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6457 addAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand is symmetric: transpose the left operand.
6458 else if( IsSymmetric<MT1>::value )
6459 addAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case: transpose the right operand.
6461 addAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Subtraction assignment of a scaled matrix product to a dense matrix. This
// overload is only available if CanExploitSymmetry<MT,MT1,MT2> does not hold.
//
// The actual work is delegated to selectSubAssignKernel().
6481 template<
typename MT
6483 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6484 subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix multiplication expression.
6491 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6492 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns, or the inner dimension
// of the product is zero.
6494 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection.
// NOTE(review): A and B are presumably the evaluated operands 'left' and
// 'right'; their definitions are not adjacent to this call - confirm.
6508 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment of a scaled matrix product.
//
// The small kernel is used if the left operand is diagonal or if the number of
// elements of C (rows * columns) is below TDMATTDMATMULT_THRESHOLD. The BLAS
// kernel selection is the alternative path.
//
// C       target matrix of the subtraction assignment
// A, B    left and right operand of the product
// scalar  scaling factor applied to the product
6523 template<
typename MT3
6527 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6529 if( ( IsDiagonal<MT4>::value ) ||
6530 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6531 selectSmallSubAssignKernel( C, A, B, scalar );
6533 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the case that neither operand is
// diagonal.
//
// The scaled product A * B * scalar is evaluated serially into a temporary of
// type ResultType, which is then subtracted from C.
6551 template<
typename MT3
6555 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6556 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6558 const ResultType tmp(
serial( A * B * scalar ) );
6559 subAssign( C, tmp );
// Default subtraction assignment kernel for a non-diagonal left operand A and
// a diagonal right operand B.
//
// Since only the diagonal element B(j,j) is used, column j of C is reduced by
// column j of A scaled with B(j,j) and 'scalar'.
6577 template<
typename MT3
6581 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6582 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6586 const size_t M( A.rows() );
6587 const size_t N( B.columns() );
// Column-wise traversal of the target.
6589 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j. For a lower A the range starts at row j
// (j+1 if strictly lower); for an upper A it ends behind row j (at row j if
// strictly upper).
6591 const size_t ibegin( ( IsLower<MT4>::value )
6592 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6594 const size_t iend( ( IsUpper<MT4>::value )
6595 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// inum is the number of rows to process; ipos marks the end of the part that
// can be handled two rows at a time (inum with its lowest bit cleared).
6599 const size_t inum( iend - ibegin );
6600 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6602 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6603 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
6604 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update of the single row at index ipos (the leftover row of an odd range).
6607 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a diagonal left operand A and a
// non-diagonal right operand B.
//
// Since only the diagonal element A(i,i) is used, element (i,j) of C is
// reduced by B(i,j) scaled with A(i,i) and 'scalar'.
6627 template<
typename MT3
6631 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6632 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6636 const size_t M( A.rows() );
6637 const size_t N( B.columns() );
// Column-wise traversal of the target.
6639 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j. For a lower B the range starts at row j
// (j+1 if strictly lower); for an upper B it ends behind row j (at row j if
// strictly upper).
6641 const size_t ibegin( ( IsLower<MT5>::value )
6642 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6644 const size_t iend( ( IsUpper<MT5>::value )
6645 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// inum is the number of rows to process; ipos marks the end of the part that
// can be handled two rows at a time (inum with its lowest bit cleared).
6649 const size_t inum( iend - ibegin );
6650 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6652 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6653 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
6654 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the single row at index ipos (the leftover row of an odd range).
6657 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction assignment kernel for two diagonal operands.
//
// Only the diagonal of C is touched: C(i,i) is reduced by the product of the
// two diagonal elements A(i,i) and B(i,i), scaled with 'scalar'.
6677 template<
typename MT3
6681 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6682 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6686 for(
size_t i=0UL; i<A.rows(); ++i ) {
6687 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Fallback overload of the small subtraction assignment kernel. It is selected
// whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// forwards all arguments to selectDefaultSubAssignKernel().
6706 template<
typename MT3
6710 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6711 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6713 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small subtraction assignment kernel for targets with storage order flag
// 'false', selected whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds.
//
// One of the two operands is replaced by a temporary 'tmp' and the resulting
// scaled product is subtracted from C via subAssign(). The branches below
// decide which operand is replaced.
// NOTE(review): 'tmp' is declared inside each branch; presumably it is a copy
// of the replaced operand in the opposite storage order - confirm.
6732 template<
typename MT3
6736 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6737 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is resizable, B is not: replace the right operand.
6744 if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6746 subAssign( ~C, A * tmp * scalar );
// B is resizable, A is not: replace the left operand.
6748 else if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6750 subAssign( ~C, tmp * B * scalar );
// Otherwise decide by size: B has at most as many elements as A, so replace
// the right operand.
6752 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6754 subAssign( ~C, A * tmp * scalar );
// Remaining case: replace the left operand.
6758 subAssign( ~C, tmp * B * scalar );
// Vectorized small subtraction assignment kernel for targets with storage
// order flag 'true', selected whenever
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// The kernel accumulates A * B in intrinsic registers and subtracts the
// result, multiplied by 'scalar', from C. Rows are processed in panels of 8,
// 4, 2 and 1 intrinsic vectors (IT::size rows each), columns singly or in
// pairs. A final scalar loop handles the rows that do not fill a complete
// vector. Throughout, the k range of the inner product is narrowed according
// to the lower/upper properties of A (MT4) and B (MT5).
6778 template<
typename MT3
6782 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6783 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6785 typedef IntrinsicTrait<ElementType> IT;
// M x K times K x N product.
6787 const size_t M( A.rows() );
6788 const size_t N( B.columns() );
6789 const size_t K( A.columns() );
// A scalar remainder loop is required unless both C and A are padded.
6791 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// End of the vectorized row range: M with the low bits cleared (a multiple of
// IT::size, assuming IT::size is a power of two) if a remainder loop is
// required, M otherwise.
6793 const size_t ipos( remainder ? ( M &
size_t(-
IT::size) ) : M );
// The scalar broadcast into an intrinsic vector.
6796 const IntrinsicType factor(
set( scalar ) );
// ---- Panel of 8 intrinsic vectors, one column at a time ----
6801 for(
size_t j=0UL; j<N; ++j )
// kbegin: raised to j (j+1 if strictly lower) for a lower B and to i for an
// upper A. kend: limited to j+1 (j if strictly upper) for an upper B and to
// the end of the row panel (i+IT::size*8) for a lower A.
6803 const size_t kbegin( ( IsLower<MT5>::value )
6804 ?( ( IsUpper<MT4>::value )
6805 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6806 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6807 :( IsUpper<MT4>::value ? i : 0UL ) );
6808 const size_t kend( ( IsUpper<MT5>::value )
6809 ?( ( IsLower<MT4>::value )
6810 ?(
min( i+
IT::size*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6811 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6812 :( IsLower<MT4>::value ?
min( i+
IT::size*8UL, K ) : K ) );
// One accumulator per intrinsic vector of the panel.
6814 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// B(k,j) is broadcast once per k and multiplied with eight vectors of A.
6816 for(
size_t k=kbegin; k<kend; ++k ) {
6817 const IntrinsicType b1(
set( B(k,j) ) );
6818 xmm1 = xmm1 + A.load(i ,k) * b1;
6819 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
6820 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
6821 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
6822 xmm5 = xmm5 + A.load(i+
IT::size*4UL,k) * b1;
6823 xmm6 = xmm6 + A.load(i+
IT::size*5UL,k) * b1;
6824 xmm7 = xmm7 + A.load(i+
IT::size*6UL,k) * b1;
6825 xmm8 = xmm8 + A.load(i+
IT::size*7UL,k) * b1;
// Scale the accumulators and subtract them from the current content of C.
6828 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6830 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) - xmm3 * factor );
6831 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) - xmm4 * factor );
6832 (~C).store( i+
IT::size*4UL, j, (~C).load(i+
IT::size*4UL,j) - xmm5 * factor );
6833 (~C).store( i+
IT::size*5UL, j, (~C).load(i+
IT::size*5UL,j) - xmm6 * factor );
6834 (~C).store( i+
IT::size*6UL, j, (~C).load(i+
IT::size*6UL,j) - xmm7 * factor );
6835 (~C).store( i+
IT::size*7UL, j, (~C).load(i+
IT::size*7UL,j) - xmm8 * factor );
// ---- Panel of 4 intrinsic vectors, two columns at a time ----
6843 for( ; (j+2UL) <= N; j+=2UL )
// Same k range logic as above, adapted to a panel of 4 vectors and to the
// column pair (j,j+1).
6845 const size_t kbegin( ( IsLower<MT5>::value )
6846 ?( ( IsUpper<MT4>::value )
6847 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6848 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6849 :( IsUpper<MT4>::value ? i : 0UL ) );
6850 const size_t kend( ( IsUpper<MT5>::value )
6851 ?( ( IsLower<MT4>::value )
6852 ?(
min( i+
IT::size*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6853 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6854 :( IsLower<MT4>::value ?
min( i+
IT::size*4UL, K ) : K ) );
// xmm1-xmm4 accumulate column j, xmm5-xmm8 accumulate column j+1.
6856 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6858 for(
size_t k=kbegin; k<kend; ++k ) {
6859 const IntrinsicType a1( A.load(i ,k) );
6860 const IntrinsicType a2( A.load(i+
IT::size ,k) );
6861 const IntrinsicType a3( A.load(i+
IT::size*2UL,k) );
6862 const IntrinsicType a4( A.load(i+
IT::size*3UL,k) );
6863 const IntrinsicType b1(
set( B(k,j ) ) );
6864 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6865 xmm1 = xmm1 + a1 * b1;
6866 xmm2 = xmm2 + a2 * b1;
6867 xmm3 = xmm3 + a3 * b1;
6868 xmm4 = xmm4 + a4 * b1;
6869 xmm5 = xmm5 + a1 * b2;
6870 xmm6 = xmm6 + a2 * b2;
6871 xmm7 = xmm7 + a3 * b2;
6872 xmm8 = xmm8 + a4 * b2;
// Scale the accumulators and subtract them from the current content of C.
6875 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6877 (~C).store( i+
IT::size*2UL, j , (~C).load(i+
IT::size*2UL,j ) - xmm3 * factor );
6878 (~C).store( i+
IT::size*3UL, j , (~C).load(i+
IT::size*3UL,j ) - xmm4 * factor );
6879 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
6880 (~C).store( i+
IT::size , j+1UL, (~C).load(i+
IT::size ,j+1UL) - xmm6 * factor );
6881 (~C).store( i+
IT::size*2UL, j+1UL, (~C).load(i+
IT::size*2UL,j+1UL) - xmm7 * factor );
6882 (~C).store( i+
IT::size*3UL, j+1UL, (~C).load(i+
IT::size*3UL,j+1UL) - xmm8 * factor );
// ---- Panel of 4 intrinsic vectors, single column ----
// Here kend only depends on A: for a lower A it is limited to the end of the
// row panel, otherwise it is K.
6887 const size_t kbegin( ( IsLower<MT5>::value )
6888 ?( ( IsUpper<MT4>::value )
6889 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6890 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6891 :( IsUpper<MT4>::value ? i : 0UL ) );
6892 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, K ) ):( K ) );
6894 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6896 for(
size_t k=kbegin; k<kend; ++k ) {
6897 const IntrinsicType b1(
set( B(k,j) ) );
6898 xmm1 = xmm1 + A.load(i ,k) * b1;
6899 xmm2 = xmm2 + A.load(i+
IT::size ,k) * b1;
6900 xmm3 = xmm3 + A.load(i+
IT::size*2UL,k) * b1;
6901 xmm4 = xmm4 + A.load(i+
IT::size*3UL,k) * b1;
// Scale the accumulators and subtract them from the current content of C.
6904 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6906 (~C).store( i+
IT::size*2UL, j, (~C).load(i+
IT::size*2UL,j) - xmm3 * factor );
6907 (~C).store( i+
IT::size*3UL, j, (~C).load(i+
IT::size*3UL,j) - xmm4 * factor );
// ---- Panel of 2 intrinsic vectors, two columns at a time ----
6915 for( ; (j+2UL) <= N; j+=2UL )
6917 const size_t kbegin( ( IsLower<MT5>::value )
6918 ?( ( IsUpper<MT4>::value )
6919 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6920 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6921 :( IsUpper<MT4>::value ? i : 0UL ) );
6922 const size_t kend( ( IsUpper<MT5>::value )
6923 ?( ( IsLower<MT4>::value )
6924 ?(
min( i+
IT::size*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
6925 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
6926 :( IsLower<MT4>::value ?
min( i+
IT::size*2UL, K ) : K ) );
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
6928 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6930 for(
size_t k=kbegin; k<kend; ++k ) {
6931 const IntrinsicType a1( A.load(i ,k) );
6932 const IntrinsicType a2( A.load(i+
IT::size,k) );
6933 const IntrinsicType b1(
set( B(k,j ) ) );
6934 const IntrinsicType b2(
set( B(k,j+1UL) ) );
6935 xmm1 = xmm1 + a1 * b1;
6936 xmm2 = xmm2 + a2 * b1;
6937 xmm3 = xmm3 + a1 * b2;
6938 xmm4 = xmm4 + a2 * b2;
// Scale the accumulators and subtract them from the current content of C.
6941 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6943 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
6944 (~C).store( i+
IT::size, j+1UL, (~C).load(i+
IT::size,j+1UL) - xmm4 * factor );
// ---- Panel of 2 intrinsic vectors, single column ----
6949 const size_t kbegin( ( IsLower<MT5>::value )
6950 ?( ( IsUpper<MT4>::value )
6951 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6952 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6953 :( IsUpper<MT4>::value ? i : 0UL ) );
6954 const size_t kend( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, K ) ):( K ) );
6956 IntrinsicType xmm1, xmm2;
6958 for(
size_t k=kbegin; k<kend; ++k ) {
6959 const IntrinsicType b1(
set( B(k,j) ) );
6960 xmm1 = xmm1 + A.load(i ,k) * b1;
6961 xmm2 = xmm2 + A.load(i+
IT::size,k) * b1;
6964 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
// ---- Single intrinsic vector, two columns at a time ----
6973 for( ; (j+2UL) <= N; j+=2UL )
6975 const size_t kbegin( ( IsLower<MT5>::value )
6976 ?( ( IsUpper<MT4>::value )
6977 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6978 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6979 :( IsUpper<MT4>::value ? i : 0UL ) );
6980 const size_t kend( ( IsUpper<MT5>::value )
6981 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
// xmm1 accumulates column j, xmm2 accumulates column j+1.
6984 IntrinsicType xmm1, xmm2;
6986 for(
size_t k=kbegin; k<kend; ++k ) {
6987 const IntrinsicType a1( A.load(i,k) );
6988 xmm1 = xmm1 + a1 *
set( B(k,j ) );
6989 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
6992 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6993 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// ---- Single intrinsic vector, single column ----
// The inner product runs up to K here.
6998 const size_t kbegin( ( IsLower<MT5>::value )
6999 ?( ( IsUpper<MT4>::value )
7000 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7001 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7002 :( IsUpper<MT4>::value ? i : 0UL ) );
7006 for(
size_t k=kbegin; k<K; ++k ) {
7007 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
7010 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// ---- Scalar remainder rows (only executed if 'remainder' is set) ----
7014 for( ; remainder && i<M; ++i )
// Two columns at a time, using element access instead of intrinsic loads.
7018 for( ; (j+2UL) <= N; j+=2UL )
7020 const size_t kbegin( ( IsLower<MT5>::value )
7021 ?( ( IsUpper<MT4>::value )
7022 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7023 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7024 :( IsUpper<MT4>::value ? i : 0UL ) );
7025 const size_t kend( ( IsUpper<MT5>::value )
7026 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7032 for(
size_t k=kbegin; k<kend; ++k ) {
7033 value1 += A(i,k) * B(k,j );
7034 value2 += A(i,k) * B(k,j+1UL);
7037 (~C)(i,j ) -= value1 * scalar;
7038 (~C)(i,j+1UL) -= value2 * scalar;
// Single column of the scalar remainder.
7043 const size_t kbegin( ( IsLower<MT5>::value )
7044 ?( ( IsUpper<MT4>::value )
7045 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7046 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7047 :( IsUpper<MT4>::value ? i : 0UL ) );
7051 for(
size_t k=kbegin; k<K; ++k ) {
7052 value += A(i,k) * B(k,j);
7055 (~C)(i,j) -= value * scalar;
// Fallback overload of the large subtraction assignment kernel. It is selected
// whenever UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// forwards all arguments to selectDefaultSubAssignKernel().
7075 template<
typename MT3
7079 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7080 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7082 selectDefaultSubAssignKernel( C, A, B, scalar );
7101 template<
typename MT3
7105 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7106 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7108 selectSmallSubAssignKernel( ~C, A, B, scalar );
7127 template<
typename MT3
7131 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7132 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7134 typedef IntrinsicTrait<ElementType> IT;
7136 const size_t M( A.rows() );
7137 const size_t N( B.columns() );
7138 const size_t K( A.columns() );
7140 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
7142 const IntrinsicType factor(
set( scalar ) );
7144 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_IBLOCK_SIZE )
7146 const size_t iend(
min( ii+TDMATTDMATMULT_IBLOCK_SIZE, M ) );
7148 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
7151 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_JBLOCK_SIZE )
7153 const size_t jend(
min( jj+TDMATTDMATMULT_JBLOCK_SIZE, N ) );
7155 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_KBLOCK_SIZE )
7157 const size_t ktmp(
min( kk+TDMATTDMATMULT_KBLOCK_SIZE, K ) );
7169 for( ; (j+2UL) <= jend; j+=2UL )
7171 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7172 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7173 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7174 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7176 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7178 for(
size_t k=kbegin; k<kend; ++k ) {
7179 const IntrinsicType a1( A.load(i ,k) );
7180 const IntrinsicType a2( A.load(i1,k) );
7181 const IntrinsicType a3( A.load(i2,k) );
7182 const IntrinsicType a4( A.load(i3,k) );
7183 const IntrinsicType b1(
set( B(k,j ) ) );
7184 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7185 xmm1 = xmm1 + a1 * b1;
7186 xmm2 = xmm2 + a2 * b1;
7187 xmm3 = xmm3 + a3 * b1;
7188 xmm4 = xmm4 + a4 * b1;
7189 xmm5 = xmm5 + a1 * b2;
7190 xmm6 = xmm6 + a2 * b2;
7191 xmm7 = xmm7 + a3 * b2;
7192 xmm8 = xmm8 + a4 * b2;
7195 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7196 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7197 (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
7198 (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
7199 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
7200 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
7201 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
7202 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
7207 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7208 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7209 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*4UL, ktmp ) ):( ktmp ),
7210 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7212 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7214 for(
size_t k=kbegin; k<kend; ++k ) {
7215 const IntrinsicType b1(
set( B(k,j) ) );
7216 xmm1 = xmm1 + A.load(i ,k) * b1;
7217 xmm2 = xmm2 + A.load(i1,k) * b1;
7218 xmm3 = xmm3 + A.load(i2,k) * b1;
7219 xmm4 = xmm4 + A.load(i3,k) * b1;
7222 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7223 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7224 (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
7225 (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
7235 for( ; (j+4UL) <= jend; j+=4UL )
7237 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7238 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7239 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7240 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7242 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7244 for(
size_t k=kbegin; k<kend; ++k ) {
7245 const IntrinsicType a1( A.load(i ,k) );
7246 const IntrinsicType a2( A.load(i1,k) );
7247 const IntrinsicType b1(
set( B(k,j ) ) );
7248 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7249 const IntrinsicType b3(
set( B(k,j+2UL) ) );
7250 const IntrinsicType b4(
set( B(k,j+3UL) ) );
7251 xmm1 = xmm1 + a1 * b1;
7252 xmm2 = xmm2 + a2 * b1;
7253 xmm3 = xmm3 + a1 * b2;
7254 xmm4 = xmm4 + a2 * b2;
7255 xmm5 = xmm5 + a1 * b3;
7256 xmm6 = xmm6 + a2 * b3;
7257 xmm7 = xmm7 + a1 * b4;
7258 xmm8 = xmm8 + a2 * b4;
7261 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7262 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7263 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7264 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7265 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
7266 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
7267 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
7268 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
7271 for( ; (j+2UL) <= jend; j+=2UL )
7273 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7274 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7275 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7276 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7278 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7280 for(
size_t k=kbegin; k<kend; ++k ) {
7281 const IntrinsicType a1( A.load(i ,k) );
7282 const IntrinsicType a2( A.load(i1,k) );
7283 const IntrinsicType b1(
set( B(k,j ) ) );
7284 const IntrinsicType b2(
set( B(k,j+1UL) ) );
7285 xmm1 = xmm1 + a1 * b1;
7286 xmm2 = xmm2 + a2 * b1;
7287 xmm3 = xmm3 + a1 * b2;
7288 xmm4 = xmm4 + a2 * b2;
7291 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7292 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
7293 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
7294 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
7299 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7300 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7301 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size*2UL, ktmp ) ):( ktmp ),
7302 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7304 IntrinsicType xmm1, xmm2;
7306 for(
size_t k=kbegin; k<kend; ++k ) {
7307 const IntrinsicType b1(
set( B(k,j) ) );
7308 xmm1 = xmm1 + A.load(i ,k) * b1;
7309 xmm2 = xmm2 + A.load(i1,k) * b1;
7312 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
7313 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
7319 for(
size_t j=jj; j<jend; ++j )
7321 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7322 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7323 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+
IT::size, ktmp ) ):( ktmp ),
7324 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7328 for(
size_t k=kbegin; k<kend; ++k ) {
7329 const IntrinsicType b1(
set( B(k,j) ) );
7330 xmm1 = xmm1 + A.load(i,k) * b1;
7333 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
7337 for( ; remainder && i<iend; ++i )
7339 for(
size_t j=jj; j<jend; ++j )
7341 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7342 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7343 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
7344 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7348 for(
size_t k=kbegin; k<kend; ++k ) {
7349 value += A(i,k) * B(k,j);
7352 (~C)(i,j) -= value * scalar;
7376 template<
typename MT3
7380 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7381 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7383 selectLargeSubAssignKernel( C, A, B, scalar );
7402 template<
typename MT3
7406 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7407 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7411 if( IsTriangular<MT4>::value ) {
7413 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7414 subAssign( C, tmp );
7416 else if( IsTriangular<MT5>::value ) {
7418 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7419 subAssign( C, tmp );
7422 gemm( C, A, B, ET(-scalar), ET(1) );
7442 template<
typename MT >
7443 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7444 subAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
7453 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7454 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7456 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7457 subAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
7458 else if( IsSymmetric<MT1>::value )
7459 subAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
7461 subAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
7492 template<
typename MT
7494 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7495 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7502 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7503 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7505 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7508 else if( left.columns() == 0UL ) {
7542 template<
typename MT
7544 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7545 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7549 typedef typename SelectType< SO, ResultType, OppositeType >::Type TmpType;
7561 const TmpType tmp( rhs );
7580 template<
typename MT >
7581 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7582 smpAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
7591 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7592 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7594 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7596 else if( IsSymmetric<MT1>::value )
7618 template<
typename MT
7620 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7621 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7628 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7629 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7631 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7664 template<
typename MT >
7665 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7666 smpAddAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
7675 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7676 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7678 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7680 else if( IsSymmetric<MT1>::value )
7706 template<
typename MT
7708 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7709 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7716 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7717 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7719 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7752 template<
typename MT >
7753 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7754 smpSubAssign( Matrix<MT,false>& lhs,
const DMatScalarMultExpr& rhs )
7763 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7764 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
7766 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7768 else if( IsSymmetric<MT1>::value )
7836 template<
typename T1
7838 inline const TDMatTDMatMultExpr<T1,T2>
7862 template<
typename MT1,
typename MT2 >
7879 template<
typename MT1,
typename MT2 >
7896 template<
typename MT1,
typename MT2 >
7898 :
public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
7914 template<
typename MT1,
typename MT2 >
7916 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
7932 template<
typename MT1,
typename MT2 >
7934 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
7950 template<
typename MT1,
typename MT2 >
7952 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7953 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
7969 template<
typename MT1,
typename MT2 >
7971 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
7987 template<
typename MT1,
typename MT2 >
7989 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
8005 template<
typename MT1,
typename MT2 >
8007 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
8008 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
8024 template<
typename MT1,
typename MT2,
typename VT >
8029 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8030 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8031 IsDenseVector<VT>::value && IsColumnVector<VT>::value
8032 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatDVecMultExprTrait<MT2,VT>::Type >::Type
8033 , INVALID_TYPE >::Type Type;
8042 template<
typename MT1,
typename MT2,
typename VT >
8047 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8048 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value &&
8049 IsSparseVector<VT>::value && IsColumnVector<VT>::value
8050 ,
typename TDMatDVecMultExprTrait< MT1, typename TDMatSVecMultExprTrait<MT2,VT>::Type >::Type
8051 , INVALID_TYPE >::Type Type;
8060 template<
typename VT,
typename MT1,
typename MT2 >
8065 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8066 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8067 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8068 ,
typename TDVecTDMatMultExprTrait< typename TDVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8069 , INVALID_TYPE >::Type Type;
8078 template<
typename VT,
typename MT1,
typename MT2 >
8083 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8084 IsDenseMatrix<MT1>::value && IsColumnMajorMatrix<MT1>::value &&
8085 IsDenseMatrix<MT2>::value && IsColumnMajorMatrix<MT2>::value
8086 ,
typename TDVecTDMatMultExprTrait< typename TSVecTDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8087 , INVALID_TYPE >::Type Type;
8096 template<
typename MT1,
typename MT2,
bool AF >
8101 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8102 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
8111 template<
typename MT1,
typename MT2 >
8116 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
8125 template<
typename MT1,
typename MT2 >
8130 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:295
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, simd_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:246
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:252
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:249
Header file for basic type definitions.
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:240
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:150
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:351
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatTDMatMultExpr.h:244
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
Header file for the And class template.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:450
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Expression object for transpose dense matrix-transpose dense matrix multiplications. The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:144
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:241
Header file for the IsUniLower type trait.
CompressedMatrix< Type, false > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:2584
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:153
TDMatTDMatMultExpr< MT1, MT2 > This
Type of this TDMatTDMatMultExpr instance.
Definition: TDMatTDMatMultExpr.h:239
Header file for the IsComplexDouble type trait.
size_t columns() const
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:377
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:397
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the TSVecTDMatMultExprTrait class template.
Header file for the Or class template.
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:245
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:431
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:421
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:148
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:280
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:149
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:441
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
Constraint on the data type.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:151
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
Header file for the IsDenseVector type trait.
size_t rows() const
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:367
Header file for all intrinsic functionality.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:409
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:243
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:451
Header file for the IsRowMajorMatrix type trait.
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:944
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:387
Header file for the complex data type.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:258
Header file for the IsUpper type trait.
Header file for exception macros.
Header file for the IsColumnVector type trait.
Constraint on the data type.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:242
Header file for the IsResizable type trait.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:255
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:152