35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
142 template<
typename MT1
// Compile-time trait: `value` is true when T1 is a column-major matrix and at
// least one of T2 / T3 is symmetric.
176 template<
typename T1,
typename T2,
typename T3 >
177 struct CanExploitSymmetry {
178 enum { value = IsColumnMajorMatrix<T1>::value &&
179 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time trait: `value` is true when at least one of evaluateLeft /
// evaluateRight is set AND CanExploitSymmetry<T1,T2,T3> does not hold.
190 template<
typename T1,
typename T2,
typename T3 >
191 struct IsEvaluationRequired {
192 enum { value = ( evaluateLeft || evaluateRight ) &&
193 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time trait deciding whether the BLAS kernel may be used for the
// target type T1 and operand types T2 / T3.
// NOTE(review): the opening of the enum expression is not part of this
// excerpt, so there may be an additional leading condition.
// The visible conjunction requires:
//  - mutable data access on T1 and const data access on T2 and T3,
//  - neither T2 nor T3 is diagonal,
//  - all three types are vectorizable,
//  - all three element types are BLAS compatible,
//  - T2 and T3 have the same element type as T1.
203 template<
typename T1,
typename T2,
typename T3 >
204 struct UseBlasKernel {
206 HasMutableDataAccess<T1>::value &&
207 HasConstDataAccess<T2>::value &&
208 HasConstDataAccess<T3>::value &&
209 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
210 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
211 IsBlasCompatible<typename T1::ElementType>::value &&
212 IsBlasCompatible<typename T2::ElementType>::value &&
213 IsBlasCompatible<typename T3::ElementType>::value &&
214 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
215 IsSame< typename T1::ElementType, typename T3::ElementType >::value };
// Compile-time trait deciding whether the vectorized default kernel may be
// used for the target type T1 and operand types T2 / T3.
// NOTE(review): the opening of the enum expression is not part of this
// excerpt, so there may be an additional leading condition.
// The visible conjunction requires:
//  - T3 is not diagonal,
//  - all three types are vectorizable,
//  - T2 and T3 have the same element type as T1,
//  - IntrinsicTrait of that element type reports addition, subtraction and
//    multiplication.
225 template<
typename T1,
typename T2,
typename T3 >
226 struct UseVectorizedDefaultKernel {
228 !IsDiagonal<T3>::value &&
229 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
230 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
231 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
232 IntrinsicTrait<typename T1::ElementType>::addition &&
233 IntrinsicTrait<typename T1::ElementType>::subtraction &&
234 IntrinsicTrait<typename T1::ElementType>::multiplication };
266 MT1::vectorizable && MT2::vectorizable &&
// smpAssignable is set only when neither operand requires evaluation
// (evaluateLeft and evaluateRight are both false) and both MT1 and MT2 are
// themselves smpAssignable.
272 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
273 !evaluateRight && MT2::smpAssignable };
316 :(
lhs_.columns() ) ) );
318 if(
lhs_.columns() == 0UL ||
// knum is the number of k-terms in [kbegin,kend). Masking (knum-1) with
// size_t(-2) clears its lowest bit, so kpos = kbegin + 1 + (an even count) and
// the loop below, which starts at kbegin+1 and advances by two, ends exactly
// at kpos.
// NOTE(review): knum - 1UL wraps around if knum == 0; presumably an earlier
// check (not in this excerpt) rules out an empty k-range - confirm.
328 const size_t knum( kend - kbegin );
329 const size_t kpos( kbegin + ( ( knum - 1UL ) &
size_t(-2) ) + 1UL );
// The first term seeds the accumulator, which is why the loop starts at
// kbegin+1.
331 ElementType tmp(
lhs_(i,kbegin) *
rhs_(kbegin,j) );
// Steps through k two indices at a time; only the k+1 term is visible in this
// excerpt.
333 for(
size_t k=kbegin+1UL; k<kpos; k+=2UL ) {
335 tmp +=
lhs_(i,k+1UL) *
rhs_(k+1UL,j);
// Bounds-checked element access at (i,j).
// The row index is checked against lhs_.rows() and the column index against
// rhs_.columns(). How a failed check is reported and how the element is
// finally returned is on lines not part of this excerpt.
353 inline ReturnType
at(
size_t i,
size_t j )
const {
354 if( i >=
lhs_.rows() ) {
357 if( j >=
rhs_.columns() ) {
380 return rhs_.columns();
// Returns true if either operand reports that it can alias with `alias`
// (forwards to canAlias() on lhs_ and rhs_).
410 template<
typename T >
412 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns true if either operand reports that it is aliased with `alias`
// (forwards to isAliased() on lhs_ and rhs_).
422 template<
typename T >
424 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// The expression counts as aligned only if both operands report isAligned().
434 return lhs_.isAligned() &&
rhs_.isAligned();
445 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
446 (
rows() > SMP_DMATDMATMULT_THRESHOLD );
// Assignment of the product `rhs` to the dense target `lhs`.
// NOTE(review): the function head and the construction of A and B are not
// part of this excerpt.
469 template<
typename MT
// Special case 1: the target has zero rows or zero columns.
479 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension (columns of the left operand) is zero.
482 else if( rhs.
lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B to the kernel
// selection.
497 DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Chooses the kernel for C = A * B.
// A target with fewer than DMATDMATMULT_THRESHOLD elements (rows * columns)
// goes to selectSmallAssignKernel; the other visible path calls
// selectBlasAssignKernel.
// NOTE(review): the first operand of the condition and the `else` keyword are
// on lines not part of this excerpt.
513 template<
typename MT3
516 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
519 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
520 selectSmallAssignKernel( C, A, B );
522 selectBlasAssignKernel( C, A, B );
// Default (non-vectorized) kernel for C = A * B; this overload is enabled
// when neither A (MT4) nor B (MT5) is diagonal.
// NOTE(review): the fallback branches of the conditional expressions and the
// bodies of several loops are on lines not part of this excerpt.
541 template<
typename MT3
544 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
545 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
547 const size_t M( A.rows() );
548 const size_t N( B.columns() );
549 const size_t K( A.columns() );
551 for(
size_t i=0UL; i<M; ++i )
// k-range of row i of A: for an upper A it starts at i (i+1 if strictly
// upper); for a lower A it ends at i+1 (i if strictly lower).
553 const size_t kbegin( ( IsUpper<MT4>::value )
554 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
556 const size_t kend( ( IsLower<MT4>::value )
557 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row of C is visited
// (loop body not shown).
561 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
562 for(
size_t j=0UL; j<N; ++j ) {
// Column range of row kbegin of B, derived from B's triangular structure.
569 const size_t jbegin( ( IsUpper<MT5>::value )
570 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
572 const size_t jend( ( IsLower<MT5>::value )
573 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
// Columns in front of jbegin are handled separately when both operands are
// upper (loop body not shown).
577 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
578 for(
size_t j=0UL; j<jbegin; ++j ) {
582 else if( IsStrictlyUpper<MT5>::value ) {
// The first k-term initializes C(i,j) by plain assignment.
585 for(
size_t j=jbegin; j<jend; ++j ) {
586 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend on are handled separately when both operands are lower
// (loop body not shown).
588 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
589 for(
size_t j=jend; j<N; ++j ) {
593 else if( IsStrictlyLower<MT5>::value ) {
// The remaining k-terms are accumulated onto the initialized row.
598 for(
size_t k=kbegin+1UL; k<kend; ++k )
600 const size_t jbegin( ( IsUpper<MT5>::value )
601 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
603 const size_t jend( ( IsLower<MT5>::value )
604 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
608 for(
size_t j=jbegin; j<jend; ++j ) {
609 C(i,j) += A(i,k) * B(k,j);
// For a lower B, column jend is assigned rather than accumulated.
// NOTE(review): this appears to be the first contribution to that column -
// confirm against the full source.
611 if( IsLower<MT5>::value ) {
612 C(i,jend) = A(i,k) * B(k,jend);
// Default kernel for C = A * B; this overload is enabled when A (MT4) is not
// diagonal and B (MT5) is diagonal. Each product element then reduces to the
// single term A(i,j) * B(j,j).
// NOTE(review): the fallback branches of jbegin/jend and the bodies of the
// loops outside [jbegin,jend) are on lines not part of this excerpt.
634 template<
typename MT3
637 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
638 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
642 const size_t M( A.rows() );
643 const size_t N( B.columns() );
645 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the triangular structure of A.
647 const size_t jbegin( ( IsUpper<MT4>::value )
648 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
650 const size_t jend( ( IsLower<MT4>::value )
651 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Columns in front of jbegin (upper A only).
655 if( IsUpper<MT4>::value ) {
656 for(
size_t j=0UL; j<jbegin; ++j ) {
660 for(
size_t j=jbegin; j<jend; ++j ) {
661 C(i,j) = A(i,j) * B(j,j);
// Columns from jend on (lower A only).
663 if( IsLower<MT4>::value ) {
664 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = A * B; this overload is enabled when A (MT4) is
// diagonal and B (MT5) is not. Each product element then reduces to the
// single term A(i,i) * B(i,j).
// NOTE(review): the fallback branches of jbegin/jend and the bodies of the
// loops outside [jbegin,jend) are on lines not part of this excerpt.
687 template<
typename MT3
690 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
691 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
695 const size_t M( A.rows() );
696 const size_t N( B.columns() );
698 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the triangular structure of B.
700 const size_t jbegin( ( IsUpper<MT5>::value )
701 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
703 const size_t jend( ( IsLower<MT5>::value )
704 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// Columns in front of jbegin (upper B only).
708 if( IsUpper<MT5>::value ) {
709 for(
size_t j=0UL; j<jbegin; ++j ) {
713 for(
size_t j=jbegin; j<jend; ++j ) {
714 C(i,j) = A(i,i) * B(i,j);
// Columns from jend on (lower B only).
716 if( IsLower<MT5>::value ) {
717 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = A * B; this overload is enabled when both A (MT4)
// and B (MT5) are diagonal. The visible loop writes only the diagonal of C as
// the product of the two diagonal elements.
// NOTE(review): any preparation of C's off-diagonal elements would be on
// lines not part of this excerpt.
740 template<
typename MT3
743 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
744 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
750 for(
size_t i=0UL; i<A.rows(); ++i ) {
751 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel, fallback overload: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false and simply forwards to the
// default kernel.
770 template<
typename MT3
773 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
774 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
776 selectDefaultAssignKernel( C, A, B );
// Small-matrix kernel, vectorized overload for a DenseMatrix<MT3,false>
// target: selected when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// Computes C = A * B by broadcasting one element of A (set) and multiplying
// it with IT::size-wide loads from a row of B.
// The visible sections use progressively smaller register tiles, identified
// below by the column offsets and row indices they load and store.
// NOTE(review): the loops over j and several loop headers, braces and
// statements are on lines not part of this excerpt.
796 template<
typename MT3
799 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
800 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
802 typedef IntrinsicTrait<ElementType> IT;
804 const size_t M( A.rows() );
805 const size_t N( B.columns() );
806 const size_t K( A.columns() );
// A scalar remainder pass over the columns is needed unless both C and B are
// padded.
808 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With remainder handling, jpos is N masked with size_t(-IT::size).
// NOTE(review): this rounds N down to a multiple of IT::size only if IT::size
// is a power of two - presumably guaranteed, confirm.
810 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// --- Tile: 1 row x 8 vectors (column offsets j .. j+IT::size*7) ---
816 for(
size_t i=0UL; i<M; ++i )
// kbegin: for an upper A start at i (i+1 if strictly upper); if B is lower,
// additionally not below j.
818 const size_t kbegin( ( IsUpper<MT4>::value )
819 ?( ( IsLower<MT5>::value )
820 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
821 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
822 :( IsLower<MT5>::value ? j : 0UL ) );
// kend: for a lower A end at i+1 (i if strictly lower); if B is upper, at
// most j plus the tile width; never beyond K.
823 const size_t kend( ( IsLower<MT4>::value )
824 ?( ( IsUpper<MT5>::value )
825 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
826 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
827 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// Accumulators are declared without initializer.
// NOTE(review): presumably IntrinsicType default-constructs to zero - confirm.
829 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
831 for(
size_t k=kbegin; k<kend; ++k ) {
832 const IntrinsicType a1(
set( A(i,k) ) );
833 xmm1 = xmm1 + a1 * B.load(k,j );
834 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
835 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
836 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
837 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
838 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
839 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
840 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
843 (~C).store( i, j , xmm1 );
845 (~C).store( i, j+
IT::size*2UL, xmm3 );
846 (~C).store( i, j+
IT::size*3UL, xmm4 );
847 (~C).store( i, j+
IT::size*4UL, xmm5 );
848 (~C).store( i, j+
IT::size*5UL, xmm6 );
849 (~C).store( i, j+
IT::size*6UL, xmm7 );
850 (~C).store( i, j+
IT::size*7UL, xmm8 );
// --- Tile: 2 rows x 4 vectors (rows i, i+1) ---
858 for( ; (i+2UL) <= M; i+=2UL )
860 const size_t kbegin( ( IsUpper<MT4>::value )
861 ?( ( IsLower<MT5>::value )
862 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
863 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
864 :( IsLower<MT5>::value ? j : 0UL ) );
// For a lower A the k-range must cover row i+1 as well, hence i+2 (i+1 if
// strictly lower).
865 const size_t kend( ( IsLower<MT4>::value )
866 ?( ( IsUpper<MT5>::value )
867 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
868 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
869 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
871 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
873 for(
size_t k=kbegin; k<kend; ++k ) {
874 const IntrinsicType a1(
set( A(i ,k) ) );
875 const IntrinsicType a2(
set( A(i+1UL,k) ) );
876 const IntrinsicType b1( B.load(k,j ) );
877 const IntrinsicType b2( B.load(k,j+
IT::size ) );
878 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
879 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
880 xmm1 = xmm1 + a1 * b1;
881 xmm2 = xmm2 + a1 * b2;
882 xmm3 = xmm3 + a1 * b3;
883 xmm4 = xmm4 + a1 * b4;
884 xmm5 = xmm5 + a2 * b1;
885 xmm6 = xmm6 + a2 * b2;
886 xmm7 = xmm7 + a2 * b3;
887 xmm8 = xmm8 + a2 * b4;
890 (~C).store( i , j , xmm1 );
891 (~C).store( i , j+
IT::size , xmm2 );
892 (~C).store( i , j+
IT::size*2UL, xmm3 );
893 (~C).store( i , j+
IT::size*3UL, xmm4 );
894 (~C).store( i+1UL, j , xmm5 );
895 (~C).store( i+1UL, j+
IT::size , xmm6 );
896 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 );
897 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 );
// --- Tile: 1 row x 4 vectors ---
902 const size_t kbegin( ( IsUpper<MT4>::value )
903 ?( ( IsLower<MT5>::value )
904 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
905 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
906 :( IsLower<MT5>::value ? j : 0UL ) );
907 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
909 IntrinsicType xmm1, xmm2, xmm3, xmm4;
911 for(
size_t k=kbegin; k<kend; ++k ) {
912 const IntrinsicType a1(
set( A(i,k) ) );
913 xmm1 = xmm1 + a1 * B.load(k,j );
914 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
915 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
916 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
919 (~C).store( i, j , xmm1 );
921 (~C).store( i, j+
IT::size*2UL, xmm3 );
922 (~C).store( i, j+
IT::size*3UL, xmm4 );
// --- Tile: 2 rows x 2 vectors ---
930 for( ; (i+2UL) <= M; i+=2UL )
932 const size_t kbegin( ( IsUpper<MT4>::value )
933 ?( ( IsLower<MT5>::value )
934 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
935 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
936 :( IsLower<MT5>::value ? j : 0UL ) );
937 const size_t kend( ( IsLower<MT4>::value )
938 ?( ( IsUpper<MT5>::value )
939 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
940 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
941 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
943 IntrinsicType xmm1, xmm2, xmm3, xmm4;
945 for(
size_t k=kbegin; k<kend; ++k ) {
946 const IntrinsicType a1(
set( A(i ,k) ) );
947 const IntrinsicType a2(
set( A(i+1UL,k) ) );
948 const IntrinsicType b1( B.load(k,j ) );
949 const IntrinsicType b2( B.load(k,j+
IT::size) );
950 xmm1 = xmm1 + a1 * b1;
951 xmm2 = xmm2 + a1 * b2;
952 xmm3 = xmm3 + a2 * b1;
953 xmm4 = xmm4 + a2 * b2;
956 (~C).store( i , j , xmm1 );
958 (~C).store( i+1UL, j , xmm3 );
959 (~C).store( i+1UL, j+
IT::size, xmm4 );
// --- Tile: 1 row x 2 vectors ---
964 const size_t kbegin( ( IsUpper<MT4>::value )
965 ?( ( IsLower<MT5>::value )
966 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
967 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
968 :( IsLower<MT5>::value ? j : 0UL ) );
969 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
971 IntrinsicType xmm1, xmm2;
973 for(
size_t k=kbegin; k<kend; ++k ) {
974 const IntrinsicType a1(
set( A(i,k) ) );
975 xmm1 = xmm1 + a1 * B.load(k,j );
976 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
979 (~C).store( i, j , xmm1 );
// --- Tile: 2 rows x 1 vector ---
988 for( ; (i+2UL) <= M; i+=2UL )
990 const size_t kbegin( ( IsUpper<MT4>::value )
991 ?( ( IsLower<MT5>::value )
992 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
993 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
994 :( IsLower<MT5>::value ? j : 0UL ) );
995 const size_t kend( ( IsLower<MT4>::value )
996 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
999 IntrinsicType xmm1, xmm2;
1001 for(
size_t k=kbegin; k<kend; ++k ) {
1002 const IntrinsicType b1( B.load(k,j) );
1003 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1004 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1007 (~C).store( i , j, xmm1 );
1008 (~C).store( i+1UL, j, xmm2 );
// --- Tile: 1 row x 1 vector; the k loop runs up to K ---
1013 const size_t kbegin( ( IsUpper<MT4>::value )
1014 ?( ( IsLower<MT5>::value )
1015 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1016 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1017 :( IsLower<MT5>::value ? j : 0UL ) );
1021 for(
size_t k=kbegin; k<K; ++k ) {
1022 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1025 (~C).store( i, j, xmm1 );
// --- Scalar remainder: the remaining columns j < N are processed element by
// element, and only when `remainder` is set ---
1029 for( ; remainder && j<N; ++j )
// Two rows per iteration.
1033 for( ; (i+2UL) <= M; i+=2UL )
1035 const size_t kbegin( ( IsUpper<MT4>::value )
1036 ?( ( IsLower<MT5>::value )
1037 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1038 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1039 :( IsLower<MT5>::value ? j : 0UL ) );
1040 const size_t kend( ( IsLower<MT4>::value )
1041 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1047 for(
size_t k=kbegin; k<kend; ++k ) {
1048 value1 += A(i ,k) * B(k,j);
1049 value2 += A(i+1UL,k) * B(k,j);
1052 (~C)(i ,j) = value1;
1053 (~C)(i+1UL,j) = value2;
// Single row.
1058 const size_t kbegin( ( IsUpper<MT4>::value )
1059 ?( ( IsLower<MT5>::value )
1060 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1061 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1062 :( IsLower<MT5>::value ? j : 0UL ) );
1066 for(
size_t k=kbegin; k<K; ++k ) {
1067 value += A(i,k) * B(k,j);
// Small-matrix kernel, vectorized overload for a DenseMatrix<MT3,true>
// target. Instead of computing directly, it re-issues the assignment with one
// operand replaced by a temporary `tmp`.
// NOTE(review): the definitions of `tmp` are on lines not part of this
// excerpt; from its position in the product it stands in for A in
// `tmp * B` and for B in `A * tmp`.
// Which operand is replaced:
//  - only B is resizable      -> replace A,
//  - only A is resizable      -> replace B,
//  - otherwise                -> replace the operand with fewer elements
//                                (A on a tie).
1092 template<
typename MT3
1095 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1096 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1103 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
1105 assign( ~C, tmp * B );
1107 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
1109 assign( ~C, A * tmp );
1111 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1113 assign( ~C, tmp * B );
1117 assign( ~C, A * tmp );
// Large-matrix kernel, fallback overload: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false and simply forwards to the
// default kernel.
1136 template<
typename MT3
1139 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1140 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1142 selectDefaultAssignKernel( C, A, B );
// Large-matrix kernel, vectorized overload for a DenseMatrix<MT3,false>
// target: selected when UseVectorizedDefaultKernel<MT3,MT4,MT5> is true.
// The iteration space is cut into blocks of DMATDMATMULT_JBLOCK_SIZE columns,
// DMATDMATMULT_IBLOCK_SIZE rows and DMATDMATMULT_KBLOCK_SIZE inner indices.
// Inside a k-block every tile loads its current values from C, adds the
// contribution of that k-block and stores the result back.
// NOTE(review): the loops over j, the definitions of j1/j2/j3 and several
// braces and statements are on lines not part of this excerpt.
1162 template<
typename MT3
1165 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1166 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1168 typedef IntrinsicTrait<ElementType> IT;
1170 const size_t M( A.rows() );
1171 const size_t N( B.columns() );
1172 const size_t K( A.columns() );
// A scalar remainder pass over the columns is needed unless both C and B are
// padded.
1174 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Column blocks [jj, jend).
1176 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
1178 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With remainder handling, jpos is jend masked with size_t(-IT::size).
// NOTE(review): this rounds down to a multiple of IT::size only if IT::size
// is a power of two - presumably guaranteed, confirm.
1180 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Row blocks [ii, iend).
1183 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
1185 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Visits every element of the current C block before the accumulation.
// NOTE(review): the loop body is not shown; presumably it clears the block,
// since the tiles below load from C - confirm.
1187 for(
size_t i=ii; i<iend; ++i ) {
1188 for(
size_t j=jj; j<jend; ++j ) {
// Inner-dimension blocks [kk, ktmp).
1193 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
1195 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// --- Tile: 2 rows x 4 vectors (columns j, j1, j2, j3) ---
1207 for( ; (i+2UL) <= iend; i+=2UL )
// kbegin: start of the k-block, raised to i for an upper A and to j for a
// lower B.
1209 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1210 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
// kend: end of the k-block, lowered to the last tile row + 1 for a lower A
// and to j plus the tile width for an upper B.
1211 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1212 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
1214 IntrinsicType xmm1( (~C).load(i ,j ) );
1215 IntrinsicType xmm2( (~C).load(i ,j1) );
1216 IntrinsicType xmm3( (~C).load(i ,j2) );
1217 IntrinsicType xmm4( (~C).load(i ,j3) );
1218 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1219 IntrinsicType xmm6( (~C).load(i+1UL,j1) );
1220 IntrinsicType xmm7( (~C).load(i+1UL,j2) );
1221 IntrinsicType xmm8( (~C).load(i+1UL,j3) );
1223 for(
size_t k=kbegin; k<kend; ++k ) {
1224 const IntrinsicType a1(
set( A(i ,k) ) );
1225 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1226 const IntrinsicType b1( B.load(k,j ) );
1227 const IntrinsicType b2( B.load(k,j1) );
1228 const IntrinsicType b3( B.load(k,j2) );
1229 const IntrinsicType b4( B.load(k,j3) );
1230 xmm1 = xmm1 + a1 * b1;
1231 xmm2 = xmm2 + a1 * b2;
1232 xmm3 = xmm3 + a1 * b3;
1233 xmm4 = xmm4 + a1 * b4;
1234 xmm5 = xmm5 + a2 * b1;
1235 xmm6 = xmm6 + a2 * b2;
1236 xmm7 = xmm7 + a2 * b3;
1237 xmm8 = xmm8 + a2 * b4;
1240 (~C).store( i , j , xmm1 );
1241 (~C).store( i , j1, xmm2 );
1242 (~C).store( i , j2, xmm3 );
1243 (~C).store( i , j3, xmm4 );
1244 (~C).store( i+1UL, j , xmm5 );
1245 (~C).store( i+1UL, j1, xmm6 );
1246 (~C).store( i+1UL, j2, xmm7 );
1247 (~C).store( i+1UL, j3, xmm8 );
// --- Tile: 1 row x 4 vectors ---
1252 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1253 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1254 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1255 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
1257 IntrinsicType xmm1( (~C).load(i,j ) );
1258 IntrinsicType xmm2( (~C).load(i,j1) );
1259 IntrinsicType xmm3( (~C).load(i,j2) );
1260 IntrinsicType xmm4( (~C).load(i,j3) );
1262 for(
size_t k=kbegin; k<kend; ++k ) {
1263 const IntrinsicType a1(
set( A(i,k) ) );
1264 xmm1 = xmm1 + a1 * B.load(k,j );
1265 xmm2 = xmm2 + a1 * B.load(k,j1);
1266 xmm3 = xmm3 + a1 * B.load(k,j2);
1267 xmm4 = xmm4 + a1 * B.load(k,j3);
1270 (~C).store( i, j , xmm1 );
1271 (~C).store( i, j1, xmm2 );
1272 (~C).store( i, j2, xmm3 );
1273 (~C).store( i, j3, xmm4 );
// --- Tile: 4 rows x 2 vectors (columns j, j1) ---
1283 for( ; (i+4UL) <= iend; i+=4UL )
1285 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1286 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1287 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1288 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1290 IntrinsicType xmm1( (~C).load(i ,j ) );
1291 IntrinsicType xmm2( (~C).load(i ,j1) );
1292 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1293 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1294 IntrinsicType xmm5( (~C).load(i+2UL,j ) );
1295 IntrinsicType xmm6( (~C).load(i+2UL,j1) );
1296 IntrinsicType xmm7( (~C).load(i+3UL,j ) );
1297 IntrinsicType xmm8( (~C).load(i+3UL,j1) );
1299 for(
size_t k=kbegin; k<kend; ++k ) {
1300 const IntrinsicType a1(
set( A(i ,k) ) );
1301 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1302 const IntrinsicType a3(
set( A(i+2UL,k) ) );
1303 const IntrinsicType a4(
set( A(i+3UL,k) ) );
1304 const IntrinsicType b1( B.load(k,j ) );
1305 const IntrinsicType b2( B.load(k,j1) );
1306 xmm1 = xmm1 + a1 * b1;
1307 xmm2 = xmm2 + a1 * b2;
1308 xmm3 = xmm3 + a2 * b1;
1309 xmm4 = xmm4 + a2 * b2;
1310 xmm5 = xmm5 + a3 * b1;
1311 xmm6 = xmm6 + a3 * b2;
1312 xmm7 = xmm7 + a4 * b1;
1313 xmm8 = xmm8 + a4 * b2;
1316 (~C).store( i , j , xmm1 );
1317 (~C).store( i , j1, xmm2 );
1318 (~C).store( i+1UL, j , xmm3 );
1319 (~C).store( i+1UL, j1, xmm4 );
1320 (~C).store( i+2UL, j , xmm5 );
1321 (~C).store( i+2UL, j1, xmm6 );
1322 (~C).store( i+3UL, j , xmm7 );
1323 (~C).store( i+3UL, j1, xmm8 );
// --- Tile: 2 rows x 2 vectors ---
1326 for( ; (i+2UL) <= iend; i+=2UL )
1328 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1329 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1330 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1331 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1333 IntrinsicType xmm1( (~C).load(i ,j ) );
1334 IntrinsicType xmm2( (~C).load(i ,j1) );
1335 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
1336 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
1338 for(
size_t k=kbegin; k<kend; ++k ) {
1339 const IntrinsicType a1(
set( A(i ,k) ) );
1340 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1341 const IntrinsicType b1( B.load(k,j ) );
1342 const IntrinsicType b2( B.load(k,j1) );
1343 xmm1 = xmm1 + a1 * b1;
1344 xmm2 = xmm2 + a1 * b2;
1345 xmm3 = xmm3 + a2 * b1;
1346 xmm4 = xmm4 + a2 * b2;
1349 (~C).store( i , j , xmm1 );
1350 (~C).store( i , j1, xmm2 );
1351 (~C).store( i+1UL, j , xmm3 );
1352 (~C).store( i+1UL, j1, xmm4 );
// --- Tile: 1 row x 2 vectors ---
1357 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1358 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1359 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1360 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
1362 IntrinsicType xmm1( (~C).load(i,j ) );
1363 IntrinsicType xmm2( (~C).load(i,j1) );
1365 for(
size_t k=kbegin; k<kend; ++k ) {
1366 const IntrinsicType a1(
set( A(i,k) ) );
1367 xmm1 = xmm1 + a1 * B.load(k,j );
1368 xmm2 = xmm2 + a1 * B.load(k,j1);
1371 (~C).store( i, j , xmm1 );
1372 (~C).store( i, j1, xmm2 );
// --- Tile: 1 row x 1 vector, over all rows of the block ---
1378 for(
size_t i=ii; i<iend; ++i )
1380 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1381 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1382 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1383 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
1385 IntrinsicType xmm1( (~C).load(i,j) );
1387 for(
size_t k=kbegin; k<kend; ++k ) {
1388 const IntrinsicType a1(
set( A(i,k) ) );
1389 xmm1 = xmm1 + a1 * B.load(k,j);
1392 (~C).store( i, j, xmm1 );
// --- Scalar remainder: the remaining columns j < jend are processed element
// by element, and only when `remainder` is set ---
1396 for( ; remainder && j<jend; ++j )
1398 for(
size_t i=ii; i<iend; ++i )
1400 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1401 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1402 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1403 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Continue from the value accumulated by earlier k-blocks.
1405 ElementType value( (~C)(i,j) );
1407 for(
size_t k=kbegin; k<kend; ++k ) {
1408 value += A(i,k) * B(k,j);
// Large-matrix kernel, vectorized overload for a DenseMatrix<MT3,true>
// target: forwards to selectSmallAssignKernel.
1435 template<
typename MT3
1438 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1439 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1441 selectSmallAssignKernel( ~C, A, B );
// BLAS kernel, fallback overload: selected when UseBlasKernel<MT3,MT4,MT5> is
// false and forwards to the large-matrix kernel.
1459 template<
typename MT3
1462 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1463 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1465 selectLargeAssignKernel( C, A, B );
// BLAS kernel: selected when UseBlasKernel<MT3,MT4,MT5> is true.
//  - A triangular: trmm with A applied from the left (CblasLeft).
//  - else B triangular: trmm with B applied from the right (CblasRight).
//  - otherwise: gemm with scaling factors ET(1) and ET(0).
// The lower/upper flag passed to trmm follows IsLower of the triangular
// operand.
// NOTE(review): trmm receives only C and the triangular operand, so C is
// presumably pre-filled with the other operand on lines not part of this
// excerpt - confirm.
1484 template<
typename MT3
1487 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
1488 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1492 if( IsTriangular<MT4>::value ) {
1494 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1496 else if( IsTriangular<MT5>::value ) {
1498 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
1501 gemm( C, A, B, ET(1), ET(0) );
// Assignment overload used when CanExploitSymmetry<MT,MT1,MT2> is false.
// The product is first evaluated serially into a temporary and the temporary
// is then assigned to the target.
// NOTE(review): the function name and parameters are not part of this
// excerpt; TmpType is chosen between OppositeType and ResultType via
// SelectType on SO.
1521 template<
typename MT
1523 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1528 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
1540 const TmpType tmp(
serial( rhs ) );
1541 assign( ~lhs, tmp );
// Assignment overload used when CanExploitSymmetry<MT,MT1,MT2> is true.
// The product is re-expressed with every symmetric operand wrapped in
// trans():
//  - both symmetric:       trans(lhs_) * trans(rhs_)
//  - only left symmetric:  trans(lhs_) * rhs_
//  - otherwise:            lhs_ * trans(rhs_)
// NOTE(review): transposing a symmetric matrix does not change its values;
// presumably this is done to switch the storage order seen by the kernels -
// confirm.
1561 template<
typename MT >
1562 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
1572 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
1573 assign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
1574 else if( IsSymmetric<MT1>::value )
1575 assign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
1577 assign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Addition-assignment overload used when CanExploitSymmetry<MT,MT1,MT2> is
// false.
// NOTE(review): the function name and parameters are not part of this
// excerpt.
1595 template<
typename MT
1597 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
// Special case: an empty target or a zero inner dimension.
1605 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Both operands are evaluated serially into A and B.
1609 LT A(
serial( rhs.lhs_ ) );
1610 RT B(
serial( rhs.rhs_ ) );
1619 DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Chooses the kernel for C += A * B.
// A diagonal B, or a target with fewer than DMATDMATMULT_THRESHOLD elements
// (rows * columns), goes to selectSmallAddAssignKernel; the other visible
// path calls selectBlasAddAssignKernel.
// NOTE(review): the `else` keyword is on a line not part of this excerpt.
1635 template<
typename MT3
1638 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1640 if( ( IsDiagonal<MT5>::value ) ||
1641 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1642 selectSmallAddAssignKernel( C, A, B );
1644 selectBlasAddAssignKernel( C, A, B );
// Default (non-vectorized) kernel for C += A * B; this overload is enabled
// when neither A (MT4) nor B (MT5) is diagonal.
// NOTE(review): the fallback branches of the conditional expressions and the
// guard around the trailing element are on lines not part of this excerpt.
1663 template<
typename MT3
1666 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
1667 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1669 const size_t M( A.rows() );
1670 const size_t N( B.columns() );
1671 const size_t K( A.columns() );
1673 for(
size_t i=0UL; i<M; ++i )
// k-range of row i of A, derived from A's triangular structure.
1675 const size_t kbegin( ( IsUpper<MT4>::value )
1676 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1678 const size_t kend( ( IsLower<MT4>::value )
1679 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
1683 for(
size_t k=kbegin; k<kend; ++k )
// Column range of row k of B, derived from B's triangular structure.
1685 const size_t jbegin( ( IsUpper<MT5>::value )
1686 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
1688 const size_t jend( ( IsLower<MT5>::value )
1689 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos ends the part of [jbegin,jend) that is covered two columns at a time:
// masking jnum with size_t(-2) clears its lowest bit.
1693 const size_t jnum( jend - jbegin );
1694 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1696 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1697 C(i,j ) += A(i,k) * B(k,j );
1698 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Trailing single column at jpos.
1701 C(i,jpos) += A(i,k) * B(k,jpos);
// Default kernel for C += A * B; this overload is enabled when A (MT4) is not
// diagonal and B (MT5) is diagonal. Each contribution reduces to
// A(i,j) * B(j,j).
// NOTE(review): the fallback branches of jbegin/jend and the guard around the
// trailing element are on lines not part of this excerpt.
1723 template<
typename MT3
1726 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
1727 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1731 const size_t M( A.rows() );
1732 const size_t N( B.columns() );
1734 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the triangular structure of A.
1736 const size_t jbegin( ( IsUpper<MT4>::value )
1737 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
1739 const size_t jend( ( IsLower<MT4>::value )
1740 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos ends the part of [jbegin,jend) that is covered two columns at a time.
1744 const size_t jnum( jend - jbegin );
1745 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1747 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1748 C(i,j ) += A(i,j ) * B(j ,j );
1749 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Trailing single column at jpos.
1752 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default kernel for C += A * B; this overload is enabled when A (MT4) is
// diagonal and B (MT5) is not. Each contribution reduces to
// A(i,i) * B(i,j).
// NOTE(review): the fallback branches of jbegin/jend and the guard around the
// trailing element are on lines not part of this excerpt.
1773 template<
typename MT3
1776 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
1777 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1781 const size_t M( A.rows() );
1782 const size_t N( B.columns() );
1784 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the triangular structure of B.
1786 const size_t jbegin( ( IsUpper<MT5>::value )
1787 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
1789 const size_t jend( ( IsLower<MT5>::value )
1790 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos ends the part of [jbegin,jend) that is covered two columns at a time.
1794 const size_t jnum( jend - jbegin );
1795 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1797 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1798 C(i,j ) += A(i,i) * B(i,j );
1799 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Trailing single column at jpos.
1802 C(i,jpos) += A(i,i) * B(i,jpos);
// Default kernel for C += A * B; this overload is enabled when both A (MT4)
// and B (MT5) are diagonal. Only the diagonal of C receives a contribution:
// the product of the two diagonal elements.
1823 template<
typename MT3
1826 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
1827 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1831 for(
size_t i=0UL; i<A.rows(); ++i ) {
1832 C(i,i) += A(i,i) * B(i,i);
// Small-matrix add-assign kernel, fallback overload: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is false and simply forwards to the
// default add-assign kernel.
1852 template<
typename MT3
1855 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1856 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1858 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel (C += A * B) for a row-major dense
// target (DenseMatrix<MT3,false>); enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// holds. The columns of C are processed in tiles of 8, 4, 2 and 1 intrinsic vectors
// (IT::size elements each), followed by a scalar loop for leftover columns. Each tile
// loads the current values of C, accumulates a1 * B.load(k,...) over k and stores the
// result back.
//
// C  Target matrix, accessed through (~C).
// A  Left-hand side operand, read element-wise via A(i,k).
// B  Right-hand side operand, read via B.load(k,j) in the vector tiles and B(k,j) in the
//    scalar tail.
//
// NOTE(review): the excerpt omits several lines of this definition (remaining template
// parameters, braces, the declarations of i and j, most column-loop headers, the
// `if( i < M )` style guards and some conditional branches); the comments below describe
// only what is visible.
1878 template<
typename MT3
1881 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
1882 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1884 typedef IntrinsicTrait<ElementType> IT;
1886 const size_t M( A.rows() );
1887 const size_t N( B.columns() );
1888 const size_t K( A.columns() );
// A scalar column tail is needed unless both C and B are padded.
1890 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With a tail, jpos is N masked with -IT::size (N rounded down to a multiple of IT::size,
// assuming IT::size is a power of two); otherwise the vector loops cover all N columns.
1892 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// --- Tile: 8 vectors wide, one row at a time -------------------------------------------
1898 for(
size_t i=0UL; i<M; ++i )
// k-range restricted by the triangular structure of A (row i) and of B (columns of the tile).
1900 const size_t kbegin( ( IsUpper<MT4>::value )
1901 ?( ( IsLower<MT5>::value )
1902 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1903 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1904 :( IsLower<MT5>::value ? j : 0UL ) );
1905 const size_t kend( ( IsLower<MT4>::value )
1906 ?( ( IsUpper<MT5>::value )
1907 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
1908 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1909 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// Accumulators start from the current contents of C (addition assignment).
1911 IntrinsicType xmm1( (~C).load(i,j ) );
1912 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
1913 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
1914 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
1915 IntrinsicType xmm5( (~C).load(i,j+
IT::size*4UL) );
1916 IntrinsicType xmm6( (~C).load(i,j+
IT::size*5UL) );
1917 IntrinsicType xmm7( (~C).load(i,j+
IT::size*6UL) );
1918 IntrinsicType xmm8( (~C).load(i,j+
IT::size*7UL) );
1920 for(
size_t k=kbegin; k<kend; ++k ) {
1921 const IntrinsicType a1(
set( A(i,k) ) );
1922 xmm1 = xmm1 + a1 * B.load(k,j );
1923 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
1924 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
1925 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
1926 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
1927 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
1928 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
1929 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
1932 (~C).store( i, j , xmm1 );
1933 (~C).store( i, j+
IT::size , xmm2 );
1934 (~C).store( i, j+
IT::size*2UL, xmm3 );
1935 (~C).store( i, j+
IT::size*3UL, xmm4 );
1936 (~C).store( i, j+
IT::size*4UL, xmm5 );
1937 (~C).store( i, j+
IT::size*5UL, xmm6 );
1938 (~C).store( i, j+
IT::size*6UL, xmm7 );
1939 (~C).store( i, j+
IT::size*7UL, xmm8 );
// --- Tile: 4 vectors wide, two rows (i, i+1) at a time ---------------------------------
1947 for( ; (i+2UL) <= M; i+=2UL )
1949 const size_t kbegin( ( IsUpper<MT4>::value )
1950 ?( ( IsLower<MT5>::value )
1951 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1952 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1953 :( IsLower<MT5>::value ? j : 0UL ) );
// Upper k bound accounts for the second row (i+1), hence i+1UL / i+2UL.
1954 const size_t kend( ( IsLower<MT4>::value )
1955 ?( ( IsUpper<MT5>::value )
1956 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
1957 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1958 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
1960 IntrinsicType xmm1( (~C).load(i ,j ) );
1961 IntrinsicType xmm2( (~C).load(i ,j+
IT::size ) );
1962 IntrinsicType xmm3( (~C).load(i ,j+
IT::size*2UL) );
1963 IntrinsicType xmm4( (~C).load(i ,j+
IT::size*3UL) );
1964 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
1965 IntrinsicType xmm6( (~C).load(i+1UL,j+
IT::size ) );
1966 IntrinsicType xmm7( (~C).load(i+1UL,j+
IT::size*2UL) );
1967 IntrinsicType xmm8( (~C).load(i+1UL,j+
IT::size*3UL) );
// The four B vectors are loaded once per k and reused for both rows.
1969 for(
size_t k=kbegin; k<kend; ++k ) {
1970 const IntrinsicType a1(
set( A(i ,k) ) );
1971 const IntrinsicType a2(
set( A(i+1UL,k) ) );
1972 const IntrinsicType b1( B.load(k,j ) );
1973 const IntrinsicType b2( B.load(k,j+
IT::size ) );
1974 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
1975 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
1976 xmm1 = xmm1 + a1 * b1;
1977 xmm2 = xmm2 + a1 * b2;
1978 xmm3 = xmm3 + a1 * b3;
1979 xmm4 = xmm4 + a1 * b4;
1980 xmm5 = xmm5 + a2 * b1;
1981 xmm6 = xmm6 + a2 * b2;
1982 xmm7 = xmm7 + a2 * b3;
1983 xmm8 = xmm8 + a2 * b4;
1986 (~C).store( i , j , xmm1 );
1987 (~C).store( i , j+
IT::size , xmm2 );
1988 (~C).store( i , j+
IT::size*2UL, xmm3 );
1989 (~C).store( i , j+
IT::size*3UL, xmm4 );
1990 (~C).store( i+1UL, j , xmm5 );
1991 (~C).store( i+1UL, j+
IT::size , xmm6 );
1992 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 );
1993 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 );
// --- Tile: 4 vectors wide, single row (presumably the odd last row; its guard is not
// visible here) ---------------------------------------------------------------------------
1998 const size_t kbegin( ( IsUpper<MT4>::value )
1999 ?( ( IsLower<MT5>::value )
2000 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2001 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2002 :( IsLower<MT5>::value ? j : 0UL ) );
2003 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
2005 IntrinsicType xmm1( (~C).load(i,j ) );
2006 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
2007 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
2008 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
2010 for(
size_t k=kbegin; k<kend; ++k ) {
2011 const IntrinsicType a1(
set( A(i,k) ) );
2012 xmm1 = xmm1 + a1 * B.load(k,j );
2013 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
2014 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
2015 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
2018 (~C).store( i, j , xmm1 );
2019 (~C).store( i, j+
IT::size , xmm2 );
2020 (~C).store( i, j+
IT::size*2UL, xmm3 );
2021 (~C).store( i, j+
IT::size*3UL, xmm4 );
// --- Tile: 2 vectors wide, two rows at a time ------------------------------------------
2029 for( ; (i+2UL) <= M; i+=2UL )
2031 const size_t kbegin( ( IsUpper<MT4>::value )
2032 ?( ( IsLower<MT5>::value )
2033 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2034 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2035 :( IsLower<MT5>::value ? j : 0UL ) );
2036 const size_t kend( ( IsLower<MT4>::value )
2037 ?( ( IsUpper<MT5>::value )
2038 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
2039 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2040 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
2042 IntrinsicType xmm1( (~C).load(i ,j ) );
2043 IntrinsicType xmm2( (~C).load(i ,j+
IT::size) );
2044 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2045 IntrinsicType xmm4( (~C).load(i+1UL,j+
IT::size) );
2047 for(
size_t k=kbegin; k<kend; ++k ) {
2048 const IntrinsicType a1(
set( A(i ,k) ) );
2049 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2050 const IntrinsicType b1( B.load(k,j ) );
2051 const IntrinsicType b2( B.load(k,j+
IT::size) );
2052 xmm1 = xmm1 + a1 * b1;
2053 xmm2 = xmm2 + a1 * b2;
2054 xmm3 = xmm3 + a2 * b1;
2055 xmm4 = xmm4 + a2 * b2;
2058 (~C).store( i , j , xmm1 );
2059 (~C).store( i , j+
IT::size, xmm2 );
2060 (~C).store( i+1UL, j , xmm3 );
2061 (~C).store( i+1UL, j+
IT::size, xmm4 );
// --- Tile: 2 vectors wide, single row --------------------------------------------------
2066 const size_t kbegin( ( IsUpper<MT4>::value )
2067 ?( ( IsLower<MT5>::value )
2068 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2069 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2070 :( IsLower<MT5>::value ? j : 0UL ) );
2071 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
2073 IntrinsicType xmm1( (~C).load(i,j ) );
2074 IntrinsicType xmm2( (~C).load(i,j+
IT::size) );
2076 for(
size_t k=kbegin; k<kend; ++k ) {
2077 const IntrinsicType a1(
set( A(i,k) ) );
2078 xmm1 = xmm1 + a1 * B.load(k,j );
2079 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
2082 (~C).store( i, j , xmm1 );
// NOTE(review): no store of xmm2 is visible after the store above; the embedded line
// numbering skips here, so it is presumably among the lines missing from this excerpt -
// confirm against the full source.
// --- Tile: 1 vector wide, two rows at a time -------------------------------------------
2091 for( ; (i+2UL) <= M; i+=2UL )
2093 const size_t kbegin( ( IsUpper<MT4>::value )
2094 ?( ( IsLower<MT5>::value )
2095 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2096 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2097 :( IsLower<MT5>::value ? j : 0UL ) );
2098 const size_t kend( ( IsLower<MT4>::value )
2099 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2102 IntrinsicType xmm1( (~C).load(i ,j) );
2103 IntrinsicType xmm2( (~C).load(i+1UL,j) );
2105 for(
size_t k=kbegin; k<kend; ++k ) {
2106 const IntrinsicType b1( B.load(k,j) );
2107 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2108 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2111 (~C).store( i , j, xmm1 );
2112 (~C).store( i+1UL, j, xmm2 );
// --- Tile: 1 vector wide, single row; k runs up to K -----------------------------------
2117 const size_t kbegin( ( IsUpper<MT4>::value )
2118 ?( ( IsLower<MT5>::value )
2119 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2120 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2121 :( IsLower<MT5>::value ? j : 0UL ) );
2123 IntrinsicType xmm1( (~C).load(i,j) );
2125 for(
size_t k=kbegin; k<K; ++k ) {
2126 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2129 (~C).store( i, j, xmm1 );
// --- Scalar tail: leftover columns, only when `remainder` is set -----------------------
2133 for( ; remainder && j<N; ++j )
// Two rows at a time, using plain element access instead of vector loads.
2137 for( ; (i+2UL) <= M; i+=2UL )
2139 const size_t kbegin( ( IsUpper<MT4>::value )
2140 ?( ( IsLower<MT5>::value )
2141 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2142 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2143 :( IsLower<MT5>::value ? j : 0UL ) );
2144 const size_t kend( ( IsLower<MT4>::value )
2145 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2148 ElementType value1( (~C)(i ,j) );
2149 ElementType value2( (~C)(i+1UL,j) );
2151 for(
size_t k=kbegin; k<kend; ++k ) {
2152 value1 += A(i ,k) * B(k,j);
2153 value2 += A(i+1UL,k) * B(k,j);
2156 (~C)(i ,j) = value1;
2157 (~C)(i+1UL,j) = value2;
// Single leftover row of the scalar tail; k runs up to K.
2162 const size_t kbegin( ( IsUpper<MT4>::value )
2163 ?( ( IsLower<MT5>::value )
2164 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2165 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2166 :( IsLower<MT5>::value ? j : 0UL ) );
2168 ElementType value( (~C)(i,j) );
2170 for(
size_t k=kbegin; k<K; ++k ) {
2171 value += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel for a column-major dense target
// (DenseMatrix<MT3,true>). Instead of computing the product itself, it replaces one of
// the two operands by a temporary `tmp` and forwards to addAssign() with the resulting
// product expression.
//
// C  Target matrix, passed on as ~C.
// A  Left-hand side operand.
// B  Right-hand side operand.
//
// NOTE(review): the declarations of `tmp` are on lines not visible in this excerpt;
// presumably each branch builds a converted copy of A (for `tmp * B`) or of B (for
// `A * tmp`) - confirm against the full source.
2196 template<
typename MT3
2199 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2200 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// A is not resizable but B is: the temporary takes the place of A.
2207 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
2209 addAssign( ~C, tmp * B );
// A is resizable but B is not: the temporary takes the place of B.
2211 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
2213 addAssign( ~C, A * tmp );
// Otherwise decide by element count: A has no more elements than B.
2215 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2217 addAssign( ~C, tmp * B );
// Remaining case (B has fewer elements than A).
2221 addAssign( ~C, A * tmp );
// Large-matrix addition-assignment kernel for type combinations for which
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold (DisableIf): forwards to the
// default kernel.
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
2241 template<
typename MT3
2244 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2245 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2247 selectDefaultAddAssignKernel( C, A, B );
// Vectorized large-matrix addition-assignment kernel (C += A * B) for a row-major dense
// target (DenseMatrix<MT3,false>); enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5>
// holds. The iteration space is split into blocks of DMATDMATMULT_JBLOCK_SIZE columns,
// DMATDMATMULT_IBLOCK_SIZE rows and DMATDMATMULT_KBLOCK_SIZE inner indices; inside each
// block C is updated in tiles of 4, 2 and 1 intrinsic vectors, followed by a scalar loop
// for leftover columns.
//
// C  Target matrix, accessed through (~C).
// A  Left-hand side operand, read element-wise via A(i,k).
// B  Right-hand side operand, read via B.load(k,j) in the vector tiles and B(k,j) in the
//    scalar tail.
//
// NOTE(review): the excerpt omits several lines of this definition (remaining template
// parameters, braces, the declarations of i, j, j1, j2 and j3, the column-loop headers
// and row-remainder guards). j1/j2/j3 presumably denote j+IT::size, j+IT::size*2 and
// j+IT::size*3 - confirm against the full source.
2267 template<
typename MT3
2270 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2271 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2273 typedef IntrinsicTrait<ElementType> IT;
2275 const size_t M( A.rows() );
2276 const size_t N( B.columns() );
2277 const size_t K( A.columns() );
// A scalar column tail is needed unless both C and B are padded.
2279 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Column blocks [jj, jend).
2281 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
2283 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// End of the vectorized column range inside this block (jend masked with -IT::size when a
// scalar tail is required).
2285 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Row blocks [ii, iend).
2288 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
2290 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Inner-dimension blocks [kk, ktmp).
2292 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
2294 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// --- Tile: 4 vectors wide, two rows at a time ------------------------------------------
2306 for( ; (i+2UL) <= iend; i+=2UL )
// k-range: the current k block, narrowed by the triangular structure of A and B.
2308 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2309 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2310 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2311 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1, starting from C's values.
2313 IntrinsicType xmm1( (~C).load(i ,j ) );
2314 IntrinsicType xmm2( (~C).load(i ,j1) );
2315 IntrinsicType xmm3( (~C).load(i ,j2) );
2316 IntrinsicType xmm4( (~C).load(i ,j3) );
2317 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
2318 IntrinsicType xmm6( (~C).load(i+1UL,j1) );
2319 IntrinsicType xmm7( (~C).load(i+1UL,j2) );
2320 IntrinsicType xmm8( (~C).load(i+1UL,j3) );
2322 for(
size_t k=kbegin; k<kend; ++k ) {
2323 const IntrinsicType a1(
set( A(i ,k) ) );
2324 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2325 const IntrinsicType b1( B.load(k,j ) );
2326 const IntrinsicType b2( B.load(k,j1) );
2327 const IntrinsicType b3( B.load(k,j2) );
2328 const IntrinsicType b4( B.load(k,j3) );
2329 xmm1 = xmm1 + a1 * b1;
2330 xmm2 = xmm2 + a1 * b2;
2331 xmm3 = xmm3 + a1 * b3;
2332 xmm4 = xmm4 + a1 * b4;
2333 xmm5 = xmm5 + a2 * b1;
2334 xmm6 = xmm6 + a2 * b2;
2335 xmm7 = xmm7 + a2 * b3;
2336 xmm8 = xmm8 + a2 * b4;
2339 (~C).store( i , j , xmm1 );
2340 (~C).store( i , j1, xmm2 );
2341 (~C).store( i , j2, xmm3 );
2342 (~C).store( i , j3, xmm4 );
2343 (~C).store( i+1UL, j , xmm5 );
2344 (~C).store( i+1UL, j1, xmm6 );
2345 (~C).store( i+1UL, j2, xmm7 );
2346 (~C).store( i+1UL, j3, xmm8 );
// --- Tile: 4 vectors wide, single row (presumably the odd last row of the block; its
// guard is not visible here) --------------------------------------------------------------
2351 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2352 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2353 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2354 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
2356 IntrinsicType xmm1( (~C).load(i,j ) );
2357 IntrinsicType xmm2( (~C).load(i,j1) );
2358 IntrinsicType xmm3( (~C).load(i,j2) );
2359 IntrinsicType xmm4( (~C).load(i,j3) );
2361 for(
size_t k=kbegin; k<kend; ++k ) {
2362 const IntrinsicType a1(
set( A(i,k) ) );
2363 xmm1 = xmm1 + a1 * B.load(k,j );
2364 xmm2 = xmm2 + a1 * B.load(k,j1);
2365 xmm3 = xmm3 + a1 * B.load(k,j2);
2366 xmm4 = xmm4 + a1 * B.load(k,j3);
2369 (~C).store( i, j , xmm1 );
2370 (~C).store( i, j1, xmm2 );
2371 (~C).store( i, j2, xmm3 );
2372 (~C).store( i, j3, xmm4 );
// --- Tile: 2 vectors wide, four rows at a time -----------------------------------------
2382 for( ; (i+4UL) <= iend; i+=4UL )
2384 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2385 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2386 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
2387 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
// Two accumulators per row for rows i .. i+3.
2389 IntrinsicType xmm1( (~C).load(i ,j ) );
2390 IntrinsicType xmm2( (~C).load(i ,j1) );
2391 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2392 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
2393 IntrinsicType xmm5( (~C).load(i+2UL,j ) );
2394 IntrinsicType xmm6( (~C).load(i+2UL,j1) );
2395 IntrinsicType xmm7( (~C).load(i+3UL,j ) );
2396 IntrinsicType xmm8( (~C).load(i+3UL,j1) );
2398 for(
size_t k=kbegin; k<kend; ++k ) {
2399 const IntrinsicType a1(
set( A(i ,k) ) );
2400 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2401 const IntrinsicType a3(
set( A(i+2UL,k) ) );
2402 const IntrinsicType a4(
set( A(i+3UL,k) ) );
2403 const IntrinsicType b1( B.load(k,j ) );
2404 const IntrinsicType b2( B.load(k,j1) );
2405 xmm1 = xmm1 + a1 * b1;
2406 xmm2 = xmm2 + a1 * b2;
2407 xmm3 = xmm3 + a2 * b1;
2408 xmm4 = xmm4 + a2 * b2;
2409 xmm5 = xmm5 + a3 * b1;
2410 xmm6 = xmm6 + a3 * b2;
2411 xmm7 = xmm7 + a4 * b1;
2412 xmm8 = xmm8 + a4 * b2;
2415 (~C).store( i , j , xmm1 );
2416 (~C).store( i , j1, xmm2 );
2417 (~C).store( i+1UL, j , xmm3 );
2418 (~C).store( i+1UL, j1, xmm4 );
2419 (~C).store( i+2UL, j , xmm5 );
2420 (~C).store( i+2UL, j1, xmm6 );
2421 (~C).store( i+3UL, j , xmm7 );
2422 (~C).store( i+3UL, j1, xmm8 );
// --- Tile: 2 vectors wide, two rows at a time ------------------------------------------
2425 for( ; (i+2UL) <= iend; i+=2UL )
2427 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2428 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2429 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
2430 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
2432 IntrinsicType xmm1( (~C).load(i ,j ) );
2433 IntrinsicType xmm2( (~C).load(i ,j1) );
2434 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
2435 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
2437 for(
size_t k=kbegin; k<kend; ++k ) {
2438 const IntrinsicType a1(
set( A(i ,k) ) );
2439 const IntrinsicType a2(
set( A(i+1UL,k) ) );
2440 const IntrinsicType b1( B.load(k,j ) );
2441 const IntrinsicType b2( B.load(k,j1) );
2442 xmm1 = xmm1 + a1 * b1;
2443 xmm2 = xmm2 + a1 * b2;
2444 xmm3 = xmm3 + a2 * b1;
2445 xmm4 = xmm4 + a2 * b2;
2448 (~C).store( i , j , xmm1 );
2449 (~C).store( i , j1, xmm2 );
2450 (~C).store( i+1UL, j , xmm3 );
2451 (~C).store( i+1UL, j1, xmm4 );
// --- Tile: 2 vectors wide, single row --------------------------------------------------
2456 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2457 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2458 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2459 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
2461 IntrinsicType xmm1( (~C).load(i,j ) );
2462 IntrinsicType xmm2( (~C).load(i,j1) );
2464 for(
size_t k=kbegin; k<kend; ++k ) {
2465 const IntrinsicType a1(
set( A(i,k) ) );
2466 xmm1 = xmm1 + a1 * B.load(k,j );
2467 xmm2 = xmm2 + a1 * B.load(k,j1);
2470 (~C).store( i, j , xmm1 );
2471 (~C).store( i, j1, xmm2 );
// --- Tile: 1 vector wide, every row of the block ---------------------------------------
2477 for(
size_t i=ii; i<iend; ++i )
2479 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2480 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2481 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2482 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
2484 IntrinsicType xmm1( (~C).load(i,j) );
2486 for(
size_t k=kbegin; k<kend; ++k ) {
2487 const IntrinsicType a1(
set( A(i,k) ) );
2488 xmm1 = xmm1 + a1 * B.load(k,j);
2491 (~C).store( i, j, xmm1 );
// --- Scalar tail: leftover columns of the block, only when `remainder` is set ----------
2495 for( ; remainder && j<jend; ++j )
2497 for(
size_t i=ii; i<iend; ++i )
2499 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2500 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2501 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
2502 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Accumulate in a local and (presumably, on a line not visible here) write it back to C.
2504 ElementType value( (~C)(i,j) );
2506 for(
size_t k=kbegin; k<kend; ++k ) {
2507 value += A(i,k) * B(k,j);
// Vectorized large-matrix addition-assignment kernel for a column-major dense target
// (DenseMatrix<MT3,true>): there is no dedicated implementation, the call is forwarded to
// the small-matrix kernel with the unwrapped target ~C.
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
2534 template<
typename MT3
2537 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2538 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2540 selectSmallAddAssignKernel( ~C, A, B );
// BLAS addition-assignment entry point for type combinations for which
// UseBlasKernel<MT3,MT4,MT5> does not hold (DisableIf): falls back to the large-matrix
// kernel.
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
2559 template<
typename MT3
2562 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2563 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2565 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel (C += A * B), enabled when
// UseBlasKernel<MT3,MT4,MT5> holds. Triangular operands are handled with trmm() on a
// temporary that is then added to C; all other cases use a single gemm() call with both
// scalar arguments set to ET(1).
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
//
// NOTE(review): the declarations of `ET` and `tmp` are on lines not visible in this
// excerpt; presumably `tmp` is initialized from the non-triangular operand before trmm()
// multiplies it in place - confirm against the full source.
2585 template<
typename MT3
2588 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
2589 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Triangular left operand: A is applied from the left (CblasLeft).
2593 if( IsTriangular<MT4>::value ) {
2595 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2596 addAssign( C, tmp );
// Triangular right operand: B is applied from the right (CblasRight).
2598 else if( IsTriangular<MT5>::value ) {
2600 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2601 addAssign( C, tmp );
// General case.
2604 gemm( C, A, B, ET(1), ET(1) );
// Restructuring addition assignment, enabled when CanExploitSymmetry<MT,MT1,MT2> holds
// (column-major target and at least one symmetric operand, see the trait definition).
// The product is re-expressed with trans() applied to the symmetric operand(s) and handed
// back to addAssign().
//
// NOTE(review): the line carrying this function's name and parameter list is not visible
// in this excerpt; from the body it takes a target `lhs` and a multiplication expression
// `rhs` with members lhs_ and rhs_.
2626 template<
typename MT >
2627 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
// Both operands symmetric: transpose both.
2637 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
2638 addAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
// Only the left operand symmetric: transpose the left operand.
2639 else if( IsSymmetric<MT1>::value )
2640 addAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
// Remaining case: transpose the right operand.
2642 addAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Subtraction assignment of the multiplication expression to a target, enabled when
// CanExploitSymmetry<MT,MT1,MT2> does not hold (DisableIf). Both operands are bound to
// local objects A and B and the work is delegated to selectSubAssignKernel().
//
// NOTE(review): the line carrying this function's name and parameter list is not visible
// in this excerpt (presumably `subAssign`); from the body it takes a target `lhs` and a
// multiplication expression `rhs` with members lhs_ and rhs_.
2664 template<
typename MT
2666 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
// Degenerate sizes (no rows, no columns, or empty inner dimension); the body of this
// branch is not visible here - presumably an early return.
2674 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Operands are wrapped in serial() before being bound to LT/RT objects.
2678 LT A(
serial( rhs.lhs_ ) );
2679 RT B(
serial( rhs.rhs_ ) );
2688 DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for the subtraction assignment C -= A * B.
// The small-matrix kernel is used when B is diagonal or when the number of elements of C
// is below DMATDMATMULT_THRESHOLD; otherwise the BLAS entry point is called.
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
//
// NOTE(review): the `else` line between the two calls is not visible in this excerpt.
2704 template<
typename MT3
2707 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2709 if( ( IsDiagonal<MT5>::value ) ||
2710 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2711 selectSmallSubAssignKernel( C, A, B );
2713 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel (C -= A * B) for two non-diagonal operands;
// enabled via EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >.
// Plain i-k-j loop nest: for each A(i,k), row k of B scaled by A(i,k) is subtracted from
// row i of C, two columns per iteration.
//
// C  Target matrix, updated in place.
// A  Left-hand side operand.
// B  Right-hand side operand.
//
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// alternative branches of the kbegin/kend/jbegin/jend conditionals) are not visible in
// this excerpt.
2732 template<
typename MT3
2735 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
2736 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2738 const size_t M( A.rows() );
2739 const size_t N( B.columns() );
2740 const size_t K( A.columns() );
2742 for(
size_t i=0UL; i<M; ++i )
// k-range of row i, derived from the upper/lower structure of A.
2744 const size_t kbegin( ( IsUpper<MT4>::value )
2745 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2747 const size_t kend( ( IsLower<MT4>::value )
2748 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2752 for(
size_t k=kbegin; k<kend; ++k )
// j-range of row k of B, derived from the upper/lower structure of B.
2754 const size_t jbegin( ( IsUpper<MT5>::value )
2755 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2757 const size_t jend( ( IsLower<MT5>::value )
2758 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos = jbegin + (jnum with its lowest bit cleared): end of the part processed in pairs.
2762 const size_t jnum( jend - jbegin );
2763 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2765 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2766 C(i,j ) -= A(i,k) * B(k,j );
2767 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Single leftover column at jpos; presumably guarded by a jpos < jend check that is not
// visible here - confirm against the full source.
2770 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction-assignment kernel (C -= A * B) for a non-diagonal left operand A
// and a diagonal right operand B; enabled via
// EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >.
// Only B(j,j) is read from B, so C(i,j) is reduced by A(i,j) * B(j,j).
//
// C  Target matrix, updated in place.
// A  Left-hand side operand.
// B  Right-hand side operand; only its diagonal elements B(j,j) are accessed.
//
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// alternative branches of the jbegin/jend conditionals) are not visible in this excerpt.
2792 template<
typename MT3
2795 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
2796 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2800 const size_t M( A.rows() );
2801 const size_t N( B.columns() );
2803 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the upper/lower structure of A.
2805 const size_t jbegin( ( IsUpper<MT4>::value )
2806 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2808 const size_t jend( ( IsLower<MT4>::value )
2809 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos = jbegin + (jnum with its lowest bit cleared): end of the part processed in pairs.
2813 const size_t jnum( jend - jbegin );
2814 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2816 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2817 C(i,j ) -= A(i,j ) * B(j ,j );
2818 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Single leftover column at jpos; presumably guarded by a jpos < jend check that is not
// visible here - confirm against the full source.
2821 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction-assignment kernel (C -= A * B) for a diagonal left operand A and a
// non-diagonal right operand B; enabled via
// EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >.
// Only A(i,i) is read from A, so row i of C is reduced by A(i,i) * B(i,j).
//
// C  Target matrix, updated in place.
// A  Left-hand side operand; only its diagonal elements A(i,i) are accessed.
// B  Right-hand side operand.
//
// NOTE(review): parts of this definition (remaining template parameters, braces, the
// alternative branches of the jbegin/jend conditionals) are not visible in this excerpt.
2842 template<
typename MT3
2845 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
2846 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2850 const size_t M( A.rows() );
2851 const size_t N( B.columns() );
2853 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the upper/lower structure of B.
2855 const size_t jbegin( ( IsUpper<MT5>::value )
2856 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2858 const size_t jend( ( IsLower<MT5>::value )
2859 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin + (jnum with its lowest bit cleared): end of the part processed in pairs.
2863 const size_t jnum( jend - jbegin );
2864 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2866 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2867 C(i,j ) -= A(i,i) * B(i,j );
2868 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Single leftover column at jpos; presumably guarded by a jpos < jend check that is not
// visible here - confirm against the full source.
2871 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel (C -= A * B) for two diagonal operands; enabled
// via EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >.
// Only diagonal elements are touched: C(i,i) -= A(i,i) * B(i,i) for every row index of A.
//
// C  Target matrix, updated in place on its diagonal.
// A  Left-hand side operand; only A(i,i) is read.
// B  Right-hand side operand; only B(i,i) is read.
2892 template<
typename MT3
2895 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
2896 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2900 for(
size_t i=0UL; i<A.rows(); ++i ) {
2901 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel for type combinations for which
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold (DisableIf): simply forwards to
// the default kernel.
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
2921 template<
typename MT3
2924 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2925 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2927 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel (C -= A * B) for a row-major
// dense target (DenseMatrix<MT3,false>); enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds. The columns of C are processed in tiles
// of 8, 4, 2 and 1 intrinsic vectors (IT::size elements each), followed by a scalar loop
// for leftover columns. Each tile loads the current values of C, subtracts
// a1 * B.load(k,...) over k and stores the result back.
//
// C  Target matrix, accessed through (~C).
// A  Left-hand side operand, read element-wise via A(i,k).
// B  Right-hand side operand, read via B.load(k,j) in the vector tiles and B(k,j) in the
//    scalar tail.
//
// NOTE(review): the excerpt omits several lines of this definition (remaining template
// parameters, braces, the declarations of i and j, most column-loop headers, the
// `if( i < M )` style guards and some conditional branches); the comments below describe
// only what is visible.
2947 template<
typename MT3
2950 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
2951 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2953 typedef IntrinsicTrait<ElementType> IT;
2955 const size_t M( A.rows() );
2956 const size_t N( B.columns() );
2957 const size_t K( A.columns() );
// A scalar column tail is needed unless both C and B are padded.
2959 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With a tail, jpos is N masked with -IT::size (N rounded down to a multiple of IT::size,
// assuming IT::size is a power of two); otherwise the vector loops cover all N columns.
2961 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// --- Tile: 8 vectors wide, one row at a time -------------------------------------------
2967 for(
size_t i=0UL; i<M; ++i )
// k-range restricted by the triangular structure of A (row i) and of B (columns of the tile).
2969 const size_t kbegin( ( IsUpper<MT4>::value )
2970 ?( ( IsLower<MT5>::value )
2971 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2972 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2973 :( IsLower<MT5>::value ? j : 0UL ) );
2974 const size_t kend( ( IsLower<MT4>::value )
2975 ?( ( IsUpper<MT5>::value )
2976 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
2977 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2978 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
// Accumulators start from the current contents of C (subtraction assignment).
2980 IntrinsicType xmm1( (~C).load(i,j ) );
2981 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
2982 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
2983 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
2984 IntrinsicType xmm5( (~C).load(i,j+
IT::size*4UL) );
2985 IntrinsicType xmm6( (~C).load(i,j+
IT::size*5UL) );
2986 IntrinsicType xmm7( (~C).load(i,j+
IT::size*6UL) );
2987 IntrinsicType xmm8( (~C).load(i,j+
IT::size*7UL) );
2989 for(
size_t k=kbegin; k<kend; ++k ) {
2990 const IntrinsicType a1(
set( A(i,k) ) );
2991 xmm1 = xmm1 - a1 * B.load(k,j );
2992 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
2993 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
2994 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
2995 xmm5 = xmm5 - a1 * B.load(k,j+
IT::size*4UL);
2996 xmm6 = xmm6 - a1 * B.load(k,j+
IT::size*5UL);
2997 xmm7 = xmm7 - a1 * B.load(k,j+
IT::size*6UL);
2998 xmm8 = xmm8 - a1 * B.load(k,j+
IT::size*7UL);
3001 (~C).store( i, j , xmm1 );
3002 (~C).store( i, j+
IT::size , xmm2 );
3003 (~C).store( i, j+
IT::size*2UL, xmm3 );
3004 (~C).store( i, j+
IT::size*3UL, xmm4 );
3005 (~C).store( i, j+
IT::size*4UL, xmm5 );
3006 (~C).store( i, j+
IT::size*5UL, xmm6 );
3007 (~C).store( i, j+
IT::size*6UL, xmm7 );
3008 (~C).store( i, j+
IT::size*7UL, xmm8 );
// --- Tile: 4 vectors wide, two rows (i, i+1) at a time ---------------------------------
3016 for( ; (i+2UL) <= M; i+=2UL )
3018 const size_t kbegin( ( IsUpper<MT4>::value )
3019 ?( ( IsLower<MT5>::value )
3020 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3021 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3022 :( IsLower<MT5>::value ? j : 0UL ) );
// Upper k bound accounts for the second row (i+1), hence i+1UL / i+2UL.
3023 const size_t kend( ( IsLower<MT4>::value )
3024 ?( ( IsUpper<MT5>::value )
3025 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
3026 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3027 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
// xmm1..xmm4 accumulate row i, xmm5..xmm8 accumulate row i+1.
3029 IntrinsicType xmm1( (~C).load(i ,j ) );
3030 IntrinsicType xmm2( (~C).load(i ,j+
IT::size ) );
3031 IntrinsicType xmm3( (~C).load(i ,j+
IT::size*2UL) );
3032 IntrinsicType xmm4( (~C).load(i ,j+
IT::size*3UL) );
3033 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3034 IntrinsicType xmm6( (~C).load(i+1UL,j+
IT::size ) );
3035 IntrinsicType xmm7( (~C).load(i+1UL,j+
IT::size*2UL) );
3036 IntrinsicType xmm8( (~C).load(i+1UL,j+
IT::size*3UL) );
// The four B vectors are loaded once per k and reused for both rows.
3038 for(
size_t k=kbegin; k<kend; ++k ) {
3039 const IntrinsicType a1(
set( A(i ,k) ) );
3040 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3041 const IntrinsicType b1( B.load(k,j ) );
3042 const IntrinsicType b2( B.load(k,j+
IT::size ) );
3043 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
3044 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
3045 xmm1 = xmm1 - a1 * b1;
3046 xmm2 = xmm2 - a1 * b2;
3047 xmm3 = xmm3 - a1 * b3;
3048 xmm4 = xmm4 - a1 * b4;
3049 xmm5 = xmm5 - a2 * b1;
3050 xmm6 = xmm6 - a2 * b2;
3051 xmm7 = xmm7 - a2 * b3;
3052 xmm8 = xmm8 - a2 * b4;
3055 (~C).store( i , j , xmm1 );
3056 (~C).store( i , j+
IT::size , xmm2 );
3057 (~C).store( i , j+
IT::size*2UL, xmm3 );
3058 (~C).store( i , j+
IT::size*3UL, xmm4 );
3059 (~C).store( i+1UL, j , xmm5 );
3060 (~C).store( i+1UL, j+
IT::size , xmm6 );
3061 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 );
3062 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 );
// --- Tile: 4 vectors wide, single row (presumably the odd last row; its guard is not
// visible here) ---------------------------------------------------------------------------
3067 const size_t kbegin( ( IsUpper<MT4>::value )
3068 ?( ( IsLower<MT5>::value )
3069 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3070 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3071 :( IsLower<MT5>::value ? j : 0UL ) );
3072 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
3074 IntrinsicType xmm1( (~C).load(i,j ) );
3075 IntrinsicType xmm2( (~C).load(i,j+
IT::size ) );
3076 IntrinsicType xmm3( (~C).load(i,j+
IT::size*2UL) );
3077 IntrinsicType xmm4( (~C).load(i,j+
IT::size*3UL) );
3079 for(
size_t k=kbegin; k<kend; ++k ) {
3080 const IntrinsicType a1(
set( A(i,k) ) );
3081 xmm1 = xmm1 - a1 * B.load(k,j );
3082 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size );
3083 xmm3 = xmm3 - a1 * B.load(k,j+
IT::size*2UL);
3084 xmm4 = xmm4 - a1 * B.load(k,j+
IT::size*3UL);
3087 (~C).store( i, j , xmm1 );
3088 (~C).store( i, j+
IT::size , xmm2 );
3089 (~C).store( i, j+
IT::size*2UL, xmm3 );
3090 (~C).store( i, j+
IT::size*3UL, xmm4 );
// --- Tile: 2 vectors wide, two rows at a time ------------------------------------------
3098 for( ; (i+2UL) <= M; i+=2UL )
3100 const size_t kbegin( ( IsUpper<MT4>::value )
3101 ?( ( IsLower<MT5>::value )
3102 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3103 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3104 :( IsLower<MT5>::value ? j : 0UL ) );
3105 const size_t kend( ( IsLower<MT4>::value )
3106 ?( ( IsUpper<MT5>::value )
3107 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
3108 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
3109 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
3111 IntrinsicType xmm1( (~C).load(i ,j ) );
3112 IntrinsicType xmm2( (~C).load(i ,j+
IT::size) );
3113 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3114 IntrinsicType xmm4( (~C).load(i+1UL,j+
IT::size) );
3116 for(
size_t k=kbegin; k<kend; ++k ) {
3117 const IntrinsicType a1(
set( A(i ,k) ) );
3118 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3119 const IntrinsicType b1( B.load(k,j ) );
3120 const IntrinsicType b2( B.load(k,j+
IT::size) );
3121 xmm1 = xmm1 - a1 * b1;
3122 xmm2 = xmm2 - a1 * b2;
3123 xmm3 = xmm3 - a2 * b1;
3124 xmm4 = xmm4 - a2 * b2;
3127 (~C).store( i , j , xmm1 );
3128 (~C).store( i , j+
IT::size, xmm2 );
3129 (~C).store( i+1UL, j , xmm3 );
3130 (~C).store( i+1UL, j+
IT::size, xmm4 );
// --- Tile: 2 vectors wide, single row --------------------------------------------------
3135 const size_t kbegin( ( IsUpper<MT4>::value )
3136 ?( ( IsLower<MT5>::value )
3137 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3138 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3139 :( IsLower<MT5>::value ? j : 0UL ) );
3140 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
3142 IntrinsicType xmm1( (~C).load(i,j ) );
3143 IntrinsicType xmm2( (~C).load(i,j+
IT::size) );
3145 for(
size_t k=kbegin; k<kend; ++k ) {
3146 const IntrinsicType a1(
set( A(i,k) ) );
3147 xmm1 = xmm1 - a1 * B.load(k,j );
3148 xmm2 = xmm2 - a1 * B.load(k,j+
IT::size);
3151 (~C).store( i, j , xmm1 );
// NOTE(review): no store of xmm2 is visible after the store above; the embedded line
// numbering skips here, so it is presumably among the lines missing from this excerpt -
// confirm against the full source.
// --- Tile: 1 vector wide, two rows at a time -------------------------------------------
3160 for( ; (i+2UL) <= M; i+=2UL )
3162 const size_t kbegin( ( IsUpper<MT4>::value )
3163 ?( ( IsLower<MT5>::value )
3164 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3165 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3166 :( IsLower<MT5>::value ? j : 0UL ) );
3167 const size_t kend( ( IsLower<MT4>::value )
3168 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3171 IntrinsicType xmm1( (~C).load(i ,j) );
3172 IntrinsicType xmm2( (~C).load(i+1UL,j) );
3174 for(
size_t k=kbegin; k<kend; ++k ) {
3175 const IntrinsicType b1( B.load(k,j) );
3176 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
3177 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
3180 (~C).store( i , j, xmm1 );
3181 (~C).store( i+1UL, j, xmm2 );
// --- Tile: 1 vector wide, single row; k runs up to K -----------------------------------
3186 const size_t kbegin( ( IsUpper<MT4>::value )
3187 ?( ( IsLower<MT5>::value )
3188 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3189 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3190 :( IsLower<MT5>::value ? j : 0UL ) );
3192 IntrinsicType xmm1( (~C).load(i,j) );
3194 for(
size_t k=kbegin; k<K; ++k ) {
3195 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
3198 (~C).store( i, j, xmm1 );
// --- Scalar tail: leftover columns, only when `remainder` is set -----------------------
3202 for( ; remainder && j<N; ++j )
// Two rows at a time, using plain element access instead of vector loads.
3206 for( ; (i+2UL) <= M; i+=2UL )
3208 const size_t kbegin( ( IsUpper<MT4>::value )
3209 ?( ( IsLower<MT5>::value )
3210 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3211 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3212 :( IsLower<MT5>::value ? j : 0UL ) );
3213 const size_t kend( ( IsLower<MT4>::value )
3214 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
3217 ElementType value1( (~C)(i ,j) );
3218 ElementType value2( (~C)(i+1UL,j) );
3220 for(
size_t k=kbegin; k<kend; ++k ) {
3221 value1 -= A(i ,k) * B(k,j);
3222 value2 -= A(i+1UL,k) * B(k,j);
3225 (~C)(i ,j) = value1;
3226 (~C)(i+1UL,j) = value2;
// Single leftover row of the scalar tail; k runs up to K.
3231 const size_t kbegin( ( IsUpper<MT4>::value )
3232 ?( ( IsLower<MT5>::value )
3233 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
3234 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
3235 :( IsLower<MT5>::value ? j : 0UL ) );
3237 ElementType value( (~C)(i,j) );
3239 for(
size_t k=kbegin; k<K; ++k ) {
3240 value -= A(i,k) * B(k,j);
// Vectorized small-matrix subtraction-assignment kernel for a column-major dense target
// (DenseMatrix<MT3,true>). Instead of computing the product itself, it replaces one of
// the two operands by a temporary `tmp` and forwards to subAssign() with the resulting
// product expression.
//
// C  Target matrix, passed on as ~C.
// A  Left-hand side operand.
// B  Right-hand side operand.
//
// NOTE(review): the declarations of `tmp` are on lines not visible in this excerpt;
// presumably each branch builds a converted copy of A (for `tmp * B`) or of B (for
// `A * tmp`) - confirm against the full source.
3265 template<
typename MT3
3268 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3269 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// A is not resizable but B is: the temporary takes the place of A.
3276 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
3278 subAssign( ~C, tmp * B );
// A is resizable but B is not: the temporary takes the place of B.
3280 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
3282 subAssign( ~C, A * tmp );
// Otherwise decide by element count: A has no more elements than B.
3284 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3286 subAssign( ~C, tmp * B );
// Remaining case (B has fewer elements than A).
3290 subAssign( ~C, A * tmp );
// Large-matrix subtraction-assignment kernel for type combinations for which
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does not hold (DisableIf): forwards to the
// default kernel.
//
// C  Target matrix.
// A  Left-hand side operand.
// B  Right-hand side operand.
3310 template<
typename MT3
3313 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3314 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3316 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// The j, i and k index ranges are tiled with DMATDMATMULT_JBLOCK_SIZE,
// DMATDMATMULT_IBLOCK_SIZE and DMATDMATMULT_KBLOCK_SIZE. Inside a tile the current
// values of C are loaded, the products set( A(i,k) ) * B.load(k,j) are subtracted and
// the result is stored back to C.
// NOTE(review): several body lines (among them the declarations of i, j, j1, j2, j3
// and the enclosing j-loops) are not visible in this excerpt.
3336 template<
typename MT3
3339 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3340 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3342 typedef IntrinsicTrait<ElementType> IT;
3344 const size_t M( A.rows() );
3345 const size_t N( B.columns() );
3346 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
3348 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Outer tiling over the columns of C/B.
3350 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
3352 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With a remainder, jpos is jend masked with -IT::size (rounds down to a multiple of
// IT::size when IT::size is a power of two); otherwise the whole tile is vectorized.
3354 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Tiling over the rows of C/A.
3357 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
3359 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Tiling over the inner (k) dimension; ktmp is the exclusive end of the current k-tile.
3361 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
3363 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// --- 2 rows x 4 SIMD vectors (columns j, j1, j2, j3) ---
3375 for( ; (i+2UL) <= iend; i+=2UL )
// The k-range is clipped to the current k-tile and, depending on the IsUpper/IsLower
// traits of A and B, additionally by i and j (presumably to skip the structurally
// zero part of triangular operands -- confirm).
3377 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3378 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3379 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3380 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// Accumulators start from the current contents of C.
3382 IntrinsicType xmm1( (~C).load(i ,j ) );
3383 IntrinsicType xmm2( (~C).load(i ,j1) );
3384 IntrinsicType xmm3( (~C).load(i ,j2) );
3385 IntrinsicType xmm4( (~C).load(i ,j3) );
3386 IntrinsicType xmm5( (~C).load(i+1UL,j ) );
3387 IntrinsicType xmm6( (~C).load(i+1UL,j1) );
3388 IntrinsicType xmm7( (~C).load(i+1UL,j2) );
3389 IntrinsicType xmm8( (~C).load(i+1UL,j3) );
3391 for(
size_t k=kbegin; k<kend; ++k ) {
// a1/a2: the elements A(i,k) and A(i+1,k) passed through set(); b1..b4: four loads from row k of B.
3392 const IntrinsicType a1(
set( A(i ,k) ) );
3393 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3394 const IntrinsicType b1( B.load(k,j ) );
3395 const IntrinsicType b2( B.load(k,j1) );
3396 const IntrinsicType b3( B.load(k,j2) );
3397 const IntrinsicType b4( B.load(k,j3) );
3398 xmm1 = xmm1 - a1 * b1;
3399 xmm2 = xmm2 - a1 * b2;
3400 xmm3 = xmm3 - a1 * b3;
3401 xmm4 = xmm4 - a1 * b4;
3402 xmm5 = xmm5 - a2 * b1;
3403 xmm6 = xmm6 - a2 * b2;
3404 xmm7 = xmm7 - a2 * b3;
3405 xmm8 = xmm8 - a2 * b4;
// Write the updated 2x4 block back to C.
3408 (~C).store( i , j , xmm1 );
3409 (~C).store( i , j1, xmm2 );
3410 (~C).store( i , j2, xmm3 );
3411 (~C).store( i , j3, xmm4 );
3412 (~C).store( i+1UL, j , xmm5 );
3413 (~C).store( i+1UL, j1, xmm6 );
3414 (~C).store( i+1UL, j2, xmm7 );
3415 (~C).store( i+1UL, j3, xmm8 );
// --- 1 row x 4 SIMD vectors (the guarding condition is on a line not shown) ---
3420 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3421 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3422 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3423 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
3425 IntrinsicType xmm1( (~C).load(i,j ) );
3426 IntrinsicType xmm2( (~C).load(i,j1) );
3427 IntrinsicType xmm3( (~C).load(i,j2) );
3428 IntrinsicType xmm4( (~C).load(i,j3) );
3430 for(
size_t k=kbegin; k<kend; ++k ) {
3431 const IntrinsicType a1(
set( A(i,k) ) );
3432 xmm1 = xmm1 - a1 * B.load(k,j );
3433 xmm2 = xmm2 - a1 * B.load(k,j1);
3434 xmm3 = xmm3 - a1 * B.load(k,j2);
3435 xmm4 = xmm4 - a1 * B.load(k,j3);
3438 (~C).store( i, j , xmm1 );
3439 (~C).store( i, j1, xmm2 );
3440 (~C).store( i, j2, xmm3 );
3441 (~C).store( i, j3, xmm4 );
// --- 4 rows x 2 SIMD vectors (columns j, j1) ---
3451 for( ; (i+4UL) <= iend; i+=4UL )
3453 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3454 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3455 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3456 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3458 IntrinsicType xmm1( (~C).load(i ,j ) );
3459 IntrinsicType xmm2( (~C).load(i ,j1) );
3460 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3461 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3462 IntrinsicType xmm5( (~C).load(i+2UL,j ) );
3463 IntrinsicType xmm6( (~C).load(i+2UL,j1) );
3464 IntrinsicType xmm7( (~C).load(i+3UL,j ) );
3465 IntrinsicType xmm8( (~C).load(i+3UL,j1) );
3467 for(
size_t k=kbegin; k<kend; ++k ) {
3468 const IntrinsicType a1(
set( A(i ,k) ) );
3469 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3470 const IntrinsicType a3(
set( A(i+2UL,k) ) );
3471 const IntrinsicType a4(
set( A(i+3UL,k) ) );
3472 const IntrinsicType b1( B.load(k,j ) );
3473 const IntrinsicType b2( B.load(k,j1) );
3474 xmm1 = xmm1 - a1 * b1;
3475 xmm2 = xmm2 - a1 * b2;
3476 xmm3 = xmm3 - a2 * b1;
3477 xmm4 = xmm4 - a2 * b2;
3478 xmm5 = xmm5 - a3 * b1;
3479 xmm6 = xmm6 - a3 * b2;
3480 xmm7 = xmm7 - a4 * b1;
3481 xmm8 = xmm8 - a4 * b2;
3484 (~C).store( i , j , xmm1 );
3485 (~C).store( i , j1, xmm2 );
3486 (~C).store( i+1UL, j , xmm3 );
3487 (~C).store( i+1UL, j1, xmm4 );
3488 (~C).store( i+2UL, j , xmm5 );
3489 (~C).store( i+2UL, j1, xmm6 );
3490 (~C).store( i+3UL, j , xmm7 );
3491 (~C).store( i+3UL, j1, xmm8 );
// --- 2 rows x 2 SIMD vectors ---
3494 for( ; (i+2UL) <= iend; i+=2UL )
3496 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3497 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3498 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3499 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3501 IntrinsicType xmm1( (~C).load(i ,j ) );
3502 IntrinsicType xmm2( (~C).load(i ,j1) );
3503 IntrinsicType xmm3( (~C).load(i+1UL,j ) );
3504 IntrinsicType xmm4( (~C).load(i+1UL,j1) );
3506 for(
size_t k=kbegin; k<kend; ++k ) {
3507 const IntrinsicType a1(
set( A(i ,k) ) );
3508 const IntrinsicType a2(
set( A(i+1UL,k) ) );
3509 const IntrinsicType b1( B.load(k,j ) );
3510 const IntrinsicType b2( B.load(k,j1) );
3511 xmm1 = xmm1 - a1 * b1;
3512 xmm2 = xmm2 - a1 * b2;
3513 xmm3 = xmm3 - a2 * b1;
3514 xmm4 = xmm4 - a2 * b2;
3517 (~C).store( i , j , xmm1 );
3518 (~C).store( i , j1, xmm2 );
3519 (~C).store( i+1UL, j , xmm3 );
3520 (~C).store( i+1UL, j1, xmm4 );
// --- 1 row x 2 SIMD vectors (the guarding condition is on a line not shown) ---
3525 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3526 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3527 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3528 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
3530 IntrinsicType xmm1( (~C).load(i,j ) );
3531 IntrinsicType xmm2( (~C).load(i,j1) );
3533 for(
size_t k=kbegin; k<kend; ++k ) {
3534 const IntrinsicType a1(
set( A(i,k) ) );
3535 xmm1 = xmm1 - a1 * B.load(k,j );
3536 xmm2 = xmm2 - a1 * B.load(k,j1);
3539 (~C).store( i, j , xmm1 );
3540 (~C).store( i, j1, xmm2 );
// --- 1 row x 1 SIMD vector, over every row of the tile ---
3546 for(
size_t i=ii; i<iend; ++i )
3548 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3549 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3550 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3551 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
3553 IntrinsicType xmm1( (~C).load(i,j) );
3555 for(
size_t k=kbegin; k<kend; ++k ) {
3556 const IntrinsicType a1(
set( A(i,k) ) );
3557 xmm1 = xmm1 - a1 * B.load(k,j);
3560 (~C).store( i, j, xmm1 );
// --- Scalar remainder: columns in [j, jend) are processed element by element ---
3564 for( ; remainder && j<jend; ++j )
3566 for(
size_t i=ii; i<iend; ++i )
3568 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3569 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3570 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3571 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Scalar accumulator seeded with C(i,j); the write-back is on a line not shown.
3573 ElementType value( (~C)(i,j) );
3575 for(
size_t k=kbegin; k<kend; ++k ) {
3576 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel, overload for a column-major target
// (DenseMatrix<MT3,true>) when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
// It simply reuses the small kernel via selectSmallSubAssignKernel().
3603 template<
typename MT3
3606 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5> >::Type
3607 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3609 selectSmallSubAssignKernel( ~C, A, B );
// Fallback overload of the BLAS subtraction-assignment kernel: when
// UseBlasKernel<MT3,MT4,MT5> does not hold (DisableIf), the call is forwarded to
// selectLargeSubAssignKernel().
3628 template<
typename MT3
3631 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3632 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3634 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction-assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5>
// holds. A triangular operand is handled with trmm() on a temporary that is then
// subtracted via subAssign(); the remaining visible call is gemm() with the factors
// ET(-1) and ET(1).
// NOTE(review): the declarations of `tmp` and `ET` and the final `else` are on lines
// not shown; presumably tmp is initialized with the other operand -- confirm.
3654 template<
typename MT3
3657 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5> >::Type
3658 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Left operand triangular: trmm with A on the left side, lower/upper chosen by IsLower<MT4>.
3662 if( IsTriangular<MT4>::value ) {
3664 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3665 subAssign( C, tmp );
// Right operand triangular: trmm with B on the right side, lower/upper chosen by IsLower<MT5>.
3667 else if( IsTriangular<MT5>::value ) {
3669 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3670 subAssign( C, tmp );
// General case: presumably computes C = (-1)*A*B + 1*C -- confirm against the gemm wrapper.
3673 gemm( C, A, B, ET(-1), ET(1) );
// Friend function enabled when CanExploitSymmetry<MT,MT1,MT2> holds. The visible body
// re-dispatches to subAssign() with the symmetric operand(s) wrapped in trans().
// NOTE(review): the function's own name/parameter line is not shown in this excerpt.
3695 template<
typename MT >
3696 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
// Both operands symmetric: wrap both in trans().
3706 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3707 subAssign( ~lhs,
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) );
// Only the left operand symmetric: wrap the left one.
3708 else if( IsSymmetric<MT1>::value )
3709 subAssign( ~lhs,
trans( rhs.lhs_ ) * rhs.rhs_ );
// Remaining case (the `else` line is not shown): wrap the right operand.
3711 subAssign( ~lhs, rhs.lhs_ *
trans( rhs.rhs_ ) );
// Friend function enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the name/parameter line and most of the body are not shown here.
3743 template<
typename MT
3745 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Special case: the target has no rows or no columns (handling not shown).
3753 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero (handling not shown).
3756 else if( rhs.lhs_.columns() == 0UL ) {
// Friend function enabled when IsEvaluationRequired<MT,MT1,MT2> holds; the visible
// body evaluates the expression into a temporary.
// NOTE(review): the name/parameter line and the use of `tmp` are not shown here.
3791 template<
typename MT
3793 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Temporary type: OppositeType if SO is set, ResultType otherwise.
3798 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
3810 const TmpType tmp( rhs );
// Friend function enabled when CanExploitSymmetry<MT,MT1,MT2> holds; it branches on
// which operands are symmetric.
// NOTE(review): the name/parameter line and the statements of each branch are not shown.
3831 template<
typename MT >
3832 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3842 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3844 else if( IsSymmetric<MT1>::value )
// Friend function enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the name/parameter line and most of the body are not shown here.
3868 template<
typename MT
3870 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Special case: empty target or zero inner dimension (handling not shown).
3878 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function enabled when CanExploitSymmetry<MT,MT1,MT2> holds; it branches on
// which operands are symmetric.
// NOTE(review): the name/parameter line and the statements of each branch are not shown.
3912 template<
typename MT >
3913 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
3923 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
3925 else if( IsSymmetric<MT1>::value )
// Friend function enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the name/parameter line and most of the body are not shown here.
3953 template<
typename MT
3955 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
// Special case: empty target or zero inner dimension (handling not shown).
3963 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function enabled when CanExploitSymmetry<MT,MT1,MT2> holds; it branches on
// which operands are symmetric.
// NOTE(review): the name/parameter line and the statements of each branch are not shown.
3997 template<
typename MT >
3998 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4008 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
4010 else if( IsSymmetric<MT1>::value )
4059 template<
typename MT1
4063 :
public DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2>, ST, false >, false >
4064 ,
private MatScalarMultExpr
4065 ,
private Computation
// Shorthand for the wrapped dense matrix/dense matrix multiplication expression type.
4069 typedef DMatDMatMultExpr<MT1,MT2> MMM;
// Set when the left operand MT1 is a computation or requires evaluation.
4081 enum { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// Set when the right operand MT2 is a computation or requires evaluation.
4086 enum { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time helper: value is set when T1 is a column-major matrix and at least one
// of T2 / T3 is symmetric.
4096 template<
typename T1,
typename T2,
typename T3 >
4097 struct CanExploitSymmetry {
4098 enum { value = IsColumnMajorMatrix<T1>::value &&
4099 ( IsSymmetric<T2>::value || IsSymmetric<T3>::value ) };
// Compile-time helper: value is set when either operand needs evaluation
// (evaluateLeft / evaluateRight) and the symmetry shortcut of CanExploitSymmetry does
// not apply.
4108 template<
typename T1,
typename T2,
typename T3 >
4109 struct IsEvaluationRequired {
4110 enum { value = ( evaluateLeft || evaluateRight ) &&
4111 !CanExploitSymmetry<T1,T2,T3>::value };
// Compile-time helper gating the BLAS kernels for target type T1, operand types T2/T3
// and scalar type T4.
// NOTE(review): the line opening the enum (and any leading condition) is not shown.
4119 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4120 struct UseBlasKernel {
// Mutable data access on the target, const data access on both operands.
4122 HasMutableDataAccess<T1>::value &&
4123 HasConstDataAccess<T2>::value &&
4124 HasConstDataAccess<T3>::value &&
// Neither operand may be diagonal.
4125 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
4126 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// All three element types must be BLAS compatible and identical.
4127 IsBlasCompatible<typename T1::ElementType>::value &&
4128 IsBlasCompatible<typename T2::ElementType>::value &&
4129 IsBlasCompatible<typename T3::ElementType>::value &&
4130 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
4131 IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
// Excludes the combination of a built-in element type with a complex scalar.
4132 !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
// Compile-time helper gating the vectorized default kernels for target type T1,
// operand types T2/T3 and scalar type T4.
// NOTE(review): the line opening the enum (and any leading condition) is not shown.
4140 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4141 struct UseVectorizedDefaultKernel {
// The right-hand side operand must not be diagonal.
4143 !IsDiagonal<T3>::value &&
4144 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Target, both operands and the scalar must share one element type.
4145 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
4146 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
4147 IsSame<typename T1::ElementType,T4>::value &&
// The element type must provide the addition, subtraction and multiplication flags of IntrinsicTrait.
4148 IntrinsicTrait<typename T1::ElementType>::addition &&
4149 IntrinsicTrait<typename T1::ElementType>::subtraction &&
4150 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression specialization.
4156 typedef DMatScalarMultExpr<MMM,ST,false>
This;
// Result type: MultTrait of RES and the scalar type ST (RES is declared on a line not shown).
4157 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
4161 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Type of the wrapped matrix product expression (stored as matrix_).
4166 typedef const DMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Left operand type used by the kernels: const RT1 if evaluateLeft is set, CT1 otherwise.
4172 typedef typename SelectType< evaluateLeft, const RT1, CT1 >::Type
LT;
// Right operand type used by the kernels: const RT2 if evaluateRight is set, CT2 otherwise.
4175 typedef typename SelectType< evaluateRight, const RT2, CT2 >::Type
RT;
// Vectorization flag: requires a non-diagonal right operand, vectorizable operands,
// identical element types ET1/ET2 that also equal the scalar type ST, and the
// addition and multiplication flags of IntrinsicTrait<ET1>.
4180 enum { vectorizable = !IsDiagonal<MT2>::value &&
4181 MT1::vectorizable && MT2::vectorizable &&
4182 IsSame<ET1,ET2>::value &&
4183 IsSame<ET1,ST>::value &&
4184 IntrinsicTrait<ET1>::addition &&
4185 IntrinsicTrait<ET1>::multiplication };
// SMP assignment flag: neither operand needs evaluation and both are SMP-assignable.
4188 enum { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
4189 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix product expression and the scalar factor.
// NOTE(review): the initializer list and body are on lines not shown; presumably they
// initialize matrix_ and scalar_ -- confirm.
4198 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns element (i,j) of the wrapped product multiplied by the scalar.
4211 inline ReturnType
operator()(
size_t i,
size_t j )
const {
4214 return matrix_(i,j) * scalar_;
// Checked element access: tests i against rows() and j against columns() of the
// wrapped expression, then forwards to operator().
// NOTE(review): the statements inside the two checks are on lines not shown.
4226 inline ReturnType
at(
size_t i,
size_t j )
const {
// Row index out of range.
4227 if( i >= matrix_.rows() ) {
// Column index out of range.
4230 if( j >= matrix_.columns() ) {
4233 return (*
this)(i,j);
// Returns the number of rows of the wrapped matrix product expression.
4242 inline size_t rows()
const {
4243 return matrix_.rows();
// Returns the number of columns of the wrapped matrix product expression.
4252 inline size_t columns()
const {
4253 return matrix_.columns();
// Forwards the canAlias() query for the given address to the wrapped expression.
4283 template<
typename T >
4284 inline bool canAlias(
const T* alias )
const {
4285 return matrix_.canAlias( alias );
// Forwards the isAliased() query for the given address to the wrapped expression.
4295 template<
typename T >
4296 inline bool isAliased(
const T* alias )
const {
4297 return matrix_.isAliased( alias );
// Forwards the alignment query to the wrapped expression (the enclosing function
// signature is on a line not shown).
4307 return matrix_.isAligned();
// Fragment of a predicate (signature and the start of the returned expression are not
// shown). The visible part requires rows()*columns() to be below DMATDMATMULT_THRESHOLD
// (as one alternative of a parenthesized condition) and the row count of the left
// operand of the product to exceed SMP_DMATDMATMULT_THRESHOLD.
4317 typename MMM::LeftOperand A( matrix_.leftOperand() );
4319 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
4320 ( A.rows() > SMP_DMATDMATMULT_THRESHOLD );
// The wrapped matrix product expression (left operand of the scaling).
4326 LeftOperand matrix_;
// The scalar factor (right operand of the scaling).
4327 RightOperand scalar_;
// Assignment of the scaled matrix product to a dense matrix; disabled when
// CanExploitSymmetry<MT,MT1,MT2> holds. It extracts the two operands of the product and
// dispatches to selectAssignKernel() together with the scalar.
// NOTE(review): the handling of the special cases and the definitions of A and B are
// on lines not shown.
4342 template<
typename MT
4344 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
4345 assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
4352 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
4353 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns.
4355 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension of the product is zero.
4358 else if( left.columns() == 0UL ) {
4373 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the scaled assignment: the small kernel is used when B is
// diagonal or when C has fewer than DMATDMATMULT_THRESHOLD elements; the other visible
// call goes to the BLAS kernel selection (the `else` line is not shown).
4388 template<
typename MT3
4392 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4394 if( ( IsDiagonal<MT5>::value ) ||
4395 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
4396 selectSmallAssignKernel( C, A, B, scalar );
4398 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) scaled assignment kernel for the case that neither A nor B
// is diagonal. For every row i the k-range and the j-ranges are narrowed according to
// the upper/lower traits of A and B. The first k contribution is written with `=`, the
// following ones are accumulated with `+=`.
// NOTE(review): many lines (the alternative branches of the range expressions, the
// reset statements and the statement applying `scalar`) are not shown in this excerpt.
4416 template<
typename MT3
4420 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
4421 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4423 const size_t M( A.rows() );
4424 const size_t N( B.columns() );
4425 const size_t K( A.columns() );
4427 for(
size_t i=0UL; i<M; ++i )
// k-range for row i, narrowed by the (strictly) upper/lower traits of A.
4429 const size_t kbegin( ( IsUpper<MT4>::value )
4430 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4432 const size_t kend( ( IsLower<MT4>::value )
4433 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row of C is visited (body not shown).
4437 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
4438 for(
size_t j=0UL; j<N; ++j ) {
// j-range for the first k (k == kbegin), narrowed by the traits of B.
4445 const size_t jbegin( ( IsUpper<MT5>::value )
4446 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
4448 const size_t jend( ( IsLower<MT5>::value )
4449 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
// Columns in front of jbegin (loop body not shown).
4453 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
4454 for(
size_t j=0UL; j<jbegin; ++j ) {
4458 else if( IsStrictlyUpper<MT5>::value ) {
// Initialize C(i,j) with the first product term.
4461 for(
size_t j=jbegin; j<jend; ++j ) {
4462 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend on (loop body not shown).
4464 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
4465 for(
size_t j=jend; j<N; ++j ) {
4469 else if( IsStrictlyLower<MT5>::value ) {
4470 reset( C(i,N-1UL) );
// Remaining k values: accumulate into C(i,j).
4474 for(
size_t k=kbegin+1UL; k<kend; ++k )
4476 const size_t jbegin( ( IsUpper<MT5>::value )
4477 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4479 const size_t jend( ( IsLower<MT5>::value )
4480 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
4484 for(
size_t j=jbegin; j<jend; ++j ) {
4485 C(i,j) += A(i,k) * B(k,j);
// For lower B, column jend is assigned (not accumulated) for this k.
4487 if( IsLower<MT5>::value ) {
4488 C(i,jend) = A(i,k) * B(k,jend);
// Final pass over a j-range of row i (loop body not shown; presumably applies `scalar` -- confirm).
4493 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
4494 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
4496 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
4497 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
4501 for(
size_t j=jbegin; j<jend; ++j ) {
// Default scaled assignment kernel for a non-diagonal A and a diagonal B:
// C(i,j) = A(i,j) * B(j,j) * scalar over the j-range of row i given by the
// upper/lower traits of A.
// NOTE(review): the alternative branches of the range expressions and the bodies of
// the loops before jbegin / after jend are not shown.
4523 template<
typename MT3
4527 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
4528 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4532 const size_t M( A.rows() );
4533 const size_t N( B.columns() );
4535 for(
size_t i=0UL; i<M; ++i )
4537 const size_t jbegin( ( IsUpper<MT4>::value )
4538 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4540 const size_t jend( ( IsLower<MT4>::value )
4541 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Upper A: columns in front of jbegin (body not shown).
4545 if( IsUpper<MT4>::value ) {
4546 for(
size_t j=0UL; j<jbegin; ++j ) {
// Only the diagonal element B(j,j) is read from B.
4550 for(
size_t j=jbegin; j<jend; ++j ) {
4551 C(i,j) = A(i,j) * B(j,j) * scalar;
// Lower A: columns from jend on (body not shown).
4553 if( IsLower<MT4>::value ) {
4554 for(
size_t j=jend; j<N; ++j ) {
// Default scaled assignment kernel for a diagonal A and a non-diagonal B:
// C(i,j) = A(i,i) * B(i,j) * scalar over the j-range of row i given by the
// upper/lower traits of B.
// NOTE(review): the alternative branches of the range expressions and the bodies of
// the loops before jbegin / after jend are not shown.
4576 template<
typename MT3
4580 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
4581 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4585 const size_t M( A.rows() );
4586 const size_t N( B.columns() );
4588 for(
size_t i=0UL; i<M; ++i )
4590 const size_t jbegin( ( IsUpper<MT5>::value )
4591 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4593 const size_t jend( ( IsLower<MT5>::value )
4594 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// Upper B: columns in front of jbegin (body not shown).
4598 if( IsUpper<MT5>::value ) {
4599 for(
size_t j=0UL; j<jbegin; ++j ) {
// Only the diagonal element A(i,i) is read from A.
4603 for(
size_t j=jbegin; j<jend; ++j ) {
4604 C(i,j) = A(i,i) * B(i,j) * scalar;
// Lower B: columns from jend on (body not shown).
4606 if( IsLower<MT5>::value ) {
4607 for(
size_t j=jend; j<N; ++j ) {
// Default scaled assignment kernel for two diagonal operands: only the diagonal
// elements C(i,i) = A(i,i) * B(i,i) * scalar are written by the visible loop.
// NOTE(review): statements ahead of the loop are not shown in this excerpt.
4629 template<
typename MT3
4633 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
4634 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4640 for(
size_t i=0UL; i<A.rows(); ++i ) {
4641 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of the small-matrix scaled assignment kernel: when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf), the call is
// forwarded to selectDefaultAssignKernel().
4660 template<
typename MT3
4664 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4665 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4667 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix scaled assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds. Column strips of decreasing width (8, 4, 2, 1 SIMD vectors) are processed; for
// each strip the products set( A(i,k) ) * B.load(k,j) are accumulated and the sum is
// multiplied by `factor` when stored to C. Leftover columns are handled by a scalar loop.
// NOTE(review): the enclosing j-loops, the declarations of i and j and several other
// lines are not visible in this excerpt; the accumulators are declared without an
// initializer, so their start value depends on IntrinsicType's default constructor.
4686 template<
typename MT3
4690 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4691 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
4693 typedef IntrinsicTrait<ElementType> IT;
4695 const size_t M( A.rows() );
4696 const size_t N( B.columns() );
4697 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
4699 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With a remainder, jpos is N masked with -IT::size (rounds down to a multiple of
// IT::size when IT::size is a power of two); otherwise all N columns are vectorized.
4701 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// The scalar passed through set(), used as the multiplier in every vector store.
4704 const IntrinsicType factor(
set( scalar ) );
// --- 1 row x 8 SIMD vectors ---
4709 for(
size_t i=0UL; i<M; ++i )
// k-range narrowed by the upper/lower traits of A and B.
4711 const size_t kbegin( ( IsUpper<MT4>::value )
4712 ?( ( IsLower<MT5>::value )
4713 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4714 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4715 :( IsLower<MT5>::value ? j : 0UL ) );
4716 const size_t kend( ( IsLower<MT4>::value )
4717 ?( ( IsUpper<MT5>::value )
4718 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
4719 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4720 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
4722 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4724 for(
size_t k=kbegin; k<kend; ++k ) {
4725 const IntrinsicType a1(
set( A(i,k) ) );
4726 xmm1 = xmm1 + a1 * B.load(k,j );
4727 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
4728 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
4729 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
4730 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
4731 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
4732 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
4733 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Scale the accumulated sums and write them to C.
4736 (~C).store( i, j , xmm1 * factor );
4737 (~C).store( i, j+
IT::size , xmm2 * factor );
4738 (~C).store( i, j+
IT::size*2UL, xmm3 * factor );
4739 (~C).store( i, j+
IT::size*3UL, xmm4 * factor );
4740 (~C).store( i, j+
IT::size*4UL, xmm5 * factor );
4741 (~C).store( i, j+
IT::size*5UL, xmm6 * factor );
4742 (~C).store( i, j+
IT::size*6UL, xmm7 * factor );
4743 (~C).store( i, j+
IT::size*7UL, xmm8 * factor );
// --- 2 rows x 4 SIMD vectors ---
4751 for( ; (i+2UL) <= M; i+=2UL )
4753 const size_t kbegin( ( IsUpper<MT4>::value )
4754 ?( ( IsLower<MT5>::value )
4755 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4756 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4757 :( IsLower<MT5>::value ? j : 0UL ) );
4758 const size_t kend( ( IsLower<MT4>::value )
4759 ?( ( IsUpper<MT5>::value )
4760 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
4761 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4762 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
4764 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4766 for(
size_t k=kbegin; k<kend; ++k ) {
4767 const IntrinsicType a1(
set( A(i ,k) ) );
4768 const IntrinsicType a2(
set( A(i+1UL,k) ) );
4769 const IntrinsicType b1( B.load(k,j ) );
4770 const IntrinsicType b2( B.load(k,j+
IT::size ) );
4771 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
4772 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
4773 xmm1 = xmm1 + a1 * b1;
4774 xmm2 = xmm2 + a1 * b2;
4775 xmm3 = xmm3 + a1 * b3;
4776 xmm4 = xmm4 + a1 * b4;
4777 xmm5 = xmm5 + a2 * b1;
4778 xmm6 = xmm6 + a2 * b2;
4779 xmm7 = xmm7 + a2 * b3;
4780 xmm8 = xmm8 + a2 * b4;
4783 (~C).store( i , j , xmm1 * factor );
4784 (~C).store( i , j+
IT::size , xmm2 * factor );
4785 (~C).store( i , j+
IT::size*2UL, xmm3 * factor );
4786 (~C).store( i , j+
IT::size*3UL, xmm4 * factor );
4787 (~C).store( i+1UL, j , xmm5 * factor );
4788 (~C).store( i+1UL, j+
IT::size , xmm6 * factor );
4789 (~C).store( i+1UL, j+
IT::size*2UL, xmm7 * factor );
4790 (~C).store( i+1UL, j+
IT::size*3UL, xmm8 * factor );
// --- 1 row x 4 SIMD vectors (the guarding condition is on a line not shown) ---
4795 const size_t kbegin( ( IsUpper<MT4>::value )
4796 ?( ( IsLower<MT5>::value )
4797 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4798 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4799 :( IsLower<MT5>::value ? j : 0UL ) );
4800 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
4802 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4804 for(
size_t k=kbegin; k<kend; ++k ) {
4805 const IntrinsicType a1(
set( A(i,k) ) );
4806 xmm1 = xmm1 + a1 * B.load(k,j );
4807 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
4808 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
4809 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
4812 (~C).store( i, j , xmm1 * factor );
4813 (~C).store( i, j+
IT::size , xmm2 * factor );
4814 (~C).store( i, j+
IT::size*2UL, xmm3 * factor );
4815 (~C).store( i, j+
IT::size*3UL, xmm4 * factor );
// --- 2 rows x 2 SIMD vectors ---
4823 for( ; (i+2UL) <= M; i+=2UL )
4825 const size_t kbegin( ( IsUpper<MT4>::value )
4826 ?( ( IsLower<MT5>::value )
4827 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4828 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4829 :( IsLower<MT5>::value ? j : 0UL ) );
4830 const size_t kend( ( IsLower<MT4>::value )
4831 ?( ( IsUpper<MT5>::value )
4832 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
4833 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4834 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
4836 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4838 for(
size_t k=kbegin; k<kend; ++k ) {
4839 const IntrinsicType a1(
set( A(i ,k) ) );
4840 const IntrinsicType a2(
set( A(i+1UL,k) ) );
4841 const IntrinsicType b1( B.load(k,j ) );
4842 const IntrinsicType b2( B.load(k,j+
IT::size) );
4843 xmm1 = xmm1 + a1 * b1;
4844 xmm2 = xmm2 + a1 * b2;
4845 xmm3 = xmm3 + a2 * b1;
4846 xmm4 = xmm4 + a2 * b2;
4849 (~C).store( i , j , xmm1 * factor );
4850 (~C).store( i , j+
IT::size, xmm2 * factor );
4851 (~C).store( i+1UL, j , xmm3 * factor );
4852 (~C).store( i+1UL, j+
IT::size, xmm4 * factor );
// --- 1 row x 2 SIMD vectors (the guarding condition is on a line not shown) ---
4857 const size_t kbegin( ( IsUpper<MT4>::value )
4858 ?( ( IsLower<MT5>::value )
4859 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4860 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4861 :( IsLower<MT5>::value ? j : 0UL ) );
4862 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
4864 IntrinsicType xmm1, xmm2;
4866 for(
size_t k=kbegin; k<kend; ++k ) {
4867 const IntrinsicType a1(
set( A(i,k) ) );
4868 xmm1 = xmm1 + a1 * B.load(k,j );
4869 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
4872 (~C).store( i, j , xmm1 * factor );
4873 (~C).store( i, j+
IT::size, xmm2 * factor );
// --- 2 rows x 1 SIMD vector ---
4881 for( ; (i+2UL) <= M; i+=2UL )
4883 const size_t kbegin( ( IsUpper<MT4>::value )
4884 ?( ( IsLower<MT5>::value )
4885 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4886 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4887 :( IsLower<MT5>::value ? j : 0UL ) );
4888 const size_t kend( ( IsLower<MT4>::value )
4889 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4892 IntrinsicType xmm1, xmm2;
4894 for(
size_t k=kbegin; k<kend; ++k ) {
4895 const IntrinsicType b1( B.load(k,j) );
4896 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
4897 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
4900 (~C).store( i , j, xmm1 * factor );
4901 (~C).store( i+1UL, j, xmm2 * factor );
// --- 1 row x 1 SIMD vector (guard and accumulator declaration not shown) ---
4906 const size_t kbegin( ( IsUpper<MT4>::value )
4907 ?( ( IsLower<MT5>::value )
4908 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4909 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4910 :( IsLower<MT5>::value ? j : 0UL ) );
4914 for(
size_t k=kbegin; k<K; ++k ) {
4915 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
4918 (~C).store( i, j, xmm1 * factor );
// --- Scalar remainder: leftover columns j < N are processed element by element ---
4922 for( ; remainder && j<N; ++j )
// Two rows at a time (the declarations of value1/value2 are not shown).
4926 for( ; (i+2UL) <= M; i+=2UL )
4928 const size_t kbegin( ( IsUpper<MT4>::value )
4929 ?( ( IsLower<MT5>::value )
4930 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4931 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4932 :( IsLower<MT5>::value ? j : 0UL ) );
4933 const size_t kend( ( IsLower<MT4>::value )
4934 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4940 for(
size_t k=kbegin; k<kend; ++k ) {
4941 value1 += A(i ,k) * B(k,j);
4942 value2 += A(i+1UL,k) * B(k,j);
4945 (~C)(i ,j) = value1 * scalar;
4946 (~C)(i+1UL,j) = value2 * scalar;
// Single leftover row (guard and the declaration of `value` are not shown).
4951 const size_t kbegin( ( IsUpper<MT4>::value )
4952 ?( ( IsLower<MT5>::value )
4953 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4954 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4955 :( IsLower<MT5>::value ? j : 0UL ) );
4959 for(
size_t k=kbegin; k<K; ++k ) {
4960 value += A(i,k) * B(k,j);
4963 (~C)(i,j) = value * scalar;
// Small-matrix scaled assignment kernel, overload for a column-major target
// (DenseMatrix<MT3,true>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds. Every branch forwards to assign() with a scaled product in which one operand
// has been replaced by `tmp`.
// NOTE(review): the declarations of `tmp` are on lines not shown in this excerpt;
// presumably each is a converted copy of the replaced operand -- confirm.
4984 template<
typename MT3
4988 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
4989 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is not resizable but B is: the temporary takes the place of A.
4996 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
4998 assign( ~C, tmp * B * scalar );
// A is resizable but B is not: the temporary takes the place of B.
5000 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
5002 assign( ~C, A * tmp * scalar );
// Same resizability on both sides: replace the operand with fewer (or equal) elements.
5004 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
5006 assign( ~C, tmp * B * scalar );
// Remaining case: B takes the temporary.
5010 assign( ~C, A * tmp * scalar );
// Fallback overload of the large-matrix scaled assignment kernel: when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf), the call is
// forwarded to selectDefaultAssignKernel().
5029 template<
typename MT3
5033 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5034 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5036 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized large-matrix scaled assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2>
// holds. The j, i and k index ranges are tiled with DMATDMATMULT_JBLOCK_SIZE,
// DMATDMATMULT_IBLOCK_SIZE and DMATDMATMULT_KBLOCK_SIZE. For each k-tile the partial
// products are accumulated in local registers, multiplied by `factor` and added to the
// value already stored in C.
// NOTE(review): several body lines (among them the declarations of i, j, j1, j2, j3,
// the enclosing j-loops and the body of the initial i/j loop) are not visible here.
5055 template<
typename MT3
5059 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5060 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5062 typedef IntrinsicTrait<ElementType> IT;
5064 const size_t M( A.rows() );
5065 const size_t N( B.columns() );
5066 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
5068 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// The scalar passed through set(), used as the multiplier in every vector store.
5070 const IntrinsicType factor(
set( scalar ) );
// Outer tiling over the columns of C/B.
5072 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
5074 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With a remainder, jpos is jend masked with -IT::size (rounds down to a multiple of
// IT::size when IT::size is a power of two); otherwise the whole tile is vectorized.
5076 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Tiling over the rows of C/A.
5079 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
5081 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Pass over every element of the current C tile (body not shown; presumably it clears
// the tile before the accumulation below -- confirm).
5083 for(
size_t i=ii; i<iend; ++i ) {
5084 for(
size_t j=jj; j<jend; ++j ) {
// Tiling over the inner (k) dimension; ktmp is the exclusive end of the current k-tile.
5089 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
5091 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// --- 2 rows x 4 SIMD vectors (columns j, j1, j2, j3) ---
5103 for( ; (i+2UL) <= iend; i+=2UL )
// The k-range is clipped to the current k-tile and, depending on the IsUpper/IsLower
// traits of A and B, additionally by i and j.
5105 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5106 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5107 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5108 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
5110 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5112 for(
size_t k=kbegin; k<kend; ++k ) {
5113 const IntrinsicType a1(
set( A(i ,k) ) );
5114 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5115 const IntrinsicType b1( B.load(k,j ) );
5116 const IntrinsicType b2( B.load(k,j1) );
5117 const IntrinsicType b3( B.load(k,j2) );
5118 const IntrinsicType b4( B.load(k,j3) );
5119 xmm1 = xmm1 + a1 * b1;
5120 xmm2 = xmm2 + a1 * b2;
5121 xmm3 = xmm3 + a1 * b3;
5122 xmm4 = xmm4 + a1 * b4;
5123 xmm5 = xmm5 + a2 * b1;
5124 xmm6 = xmm6 + a2 * b2;
5125 xmm7 = xmm7 + a2 * b3;
5126 xmm8 = xmm8 + a2 * b4;
// Add the scaled partial sums of this k-tile to the values already in C.
5129 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5130 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5131 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
5132 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
5133 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5134 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
5135 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
5136 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// --- 1 row x 4 SIMD vectors (the guarding condition is on a line not shown) ---
5141 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5142 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5143 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5144 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
5146 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5148 for(
size_t k=kbegin; k<kend; ++k ) {
5149 const IntrinsicType a1(
set( A(i,k) ) );
5150 xmm1 = xmm1 + a1 * B.load(k,j );
5151 xmm2 = xmm2 + a1 * B.load(k,j1);
5152 xmm3 = xmm3 + a1 * B.load(k,j2);
5153 xmm4 = xmm4 + a1 * B.load(k,j3);
5156 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5157 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
5158 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
5159 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// --- 4 rows x 2 SIMD vectors (columns j, j1) ---
5169 for( ; (i+4UL) <= iend; i+=4UL )
5171 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5172 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5173 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5174 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5176 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5178 for(
size_t k=kbegin; k<kend; ++k ) {
5179 const IntrinsicType a1(
set( A(i ,k) ) );
5180 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5181 const IntrinsicType a3(
set( A(i+2UL,k) ) );
5182 const IntrinsicType a4(
set( A(i+3UL,k) ) );
5183 const IntrinsicType b1( B.load(k,j ) );
5184 const IntrinsicType b2( B.load(k,j1) );
5185 xmm1 = xmm1 + a1 * b1;
5186 xmm2 = xmm2 + a1 * b2;
5187 xmm3 = xmm3 + a2 * b1;
5188 xmm4 = xmm4 + a2 * b2;
5189 xmm5 = xmm5 + a3 * b1;
5190 xmm6 = xmm6 + a3 * b2;
5191 xmm7 = xmm7 + a4 * b1;
5192 xmm8 = xmm8 + a4 * b2;
5195 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5196 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5197 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5198 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
5199 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
5200 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
5201 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
5202 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// --- 2 rows x 2 SIMD vectors ---
5205 for( ; (i+2UL) <= iend; i+=2UL )
5207 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5208 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5209 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5210 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5212 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5214 for(
size_t k=kbegin; k<kend; ++k ) {
5215 const IntrinsicType a1(
set( A(i ,k) ) );
5216 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5217 const IntrinsicType b1( B.load(k,j ) );
5218 const IntrinsicType b2( B.load(k,j1) );
5219 xmm1 = xmm1 + a1 * b1;
5220 xmm2 = xmm2 + a1 * b2;
5221 xmm3 = xmm3 + a2 * b1;
5222 xmm4 = xmm4 + a2 * b2;
5225 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5226 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
5227 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5228 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// --- 1 row x 2 SIMD vectors (the guarding condition is on a line not shown) ---
5233 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5234 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5235 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5236 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
5238 IntrinsicType xmm1, xmm2;
5240 for(
size_t k=kbegin; k<kend; ++k ) {
5241 const IntrinsicType a1(
set( A(i,k) ) );
5242 xmm1 = xmm1 + a1 * B.load(k,j );
5243 xmm2 = xmm2 + a1 * B.load(k,j1);
5246 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5247 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// --- 1 row x 1 SIMD vector, over every row of the tile (accumulator declaration not shown) ---
5253 for(
size_t i=ii; i<iend; ++i )
5255 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5256 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5257 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5258 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
5262 for(
size_t k=kbegin; k<kend; ++k ) {
5263 const IntrinsicType a1(
set( A(i,k) ) );
5264 xmm1 = xmm1 + a1 * B.load(k,j);
5267 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// --- Scalar remainder: columns in [j, jend) are processed element by element ---
5271 for( ; remainder && j<jend; ++j )
5273 for(
size_t i=ii; i<iend; ++i )
5275 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5276 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5277 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5278 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// The declaration of `value` is on a line not shown.
5282 for(
size_t k=kbegin; k<kend; ++k ) {
5283 value += A(i,k) * B(k,j);
// Add the scaled partial sum of this k-tile to C(i,j).
5286 (~C)(i,j) += value * scalar;
// Large-matrix scaled assignment kernel, overload for a column-major target
// (DenseMatrix<MT3,true>) when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// It simply reuses the small kernel via selectSmallAssignKernel().
5309 template<
typename MT3
5313 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5314 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5316 selectSmallAssignKernel( ~C, A, B, scalar );
// BLAS assignment kernel overload that is active when UseBlasKernel<MT3,MT4,MT5,ST2>
// does NOT hold (DisableIf).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
5334 template<
typename MT3
5338 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5339 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The BLAS path is not applicable for these types: fall back to the large-matrix kernel.
5341 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assignment kernel overload that is active when UseBlasKernel<MT3,MT4,MT5,ST2> holds.
// Depending on the operand types it issues either a trmm() or a gemm() call.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor (converted to ET before being handed to the BLAS wrapper).
5360 template<
typename MT3
5364 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
5365 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular left operand: trmm() with A applied from the left; the lower/upper flag follows
// IsLower<MT4>.
5369 if( IsTriangular<MT4>::value ) {
5371 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// Triangular right operand: trmm() with B applied from the right; the lower/upper flag follows
// IsLower<MT5>.
5373 else if( IsTriangular<MT5>::value ) {
5375 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
// General case: gemm() with the scalar as first factor and ET(0) as second factor
// (presumably alpha and beta, i.e. C is overwritten -- confirm against the gemm() wrapper).
5378 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled matrix product to a sparse matrix target. The overload is disabled
// when CanExploitSymmetry<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side sparse matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
5396 template<
typename MT
5398 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5399 assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The temporary uses OppositeType if SO is set and ResultType otherwise.
5403 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// Evaluate the whole expression serially into the temporary, then assign the temporary.
5415 const TmpType tmp(
serial( rhs ) );
5416 assign( ~lhs, tmp );
// Restructuring assignment of the scaled matrix product to a Matrix<MT,true> target. The
// overload is enabled when CanExploitSymmetry<MT,MT1,MT2> holds, i.e. when at least one of the
// two operands is symmetric.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side scaled multiplication expression to be assigned.
5434 template<
typename MT >
5435 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5436 assign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
5445 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5446 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Every symmetric operand is wrapped in trans(). Transposing a symmetric matrix does not change
// its values, so the product is unchanged.
// NOTE(review): presumably done to obtain operands in a more favorable storage order -- confirm.
5448 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
5449 assign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
5450 else if( IsSymmetric<MT1>::value )
5451 assign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case: only the right operand is transposed.
5453 assign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Addition assignment of the scaled matrix product to a dense matrix target. The overload is
// disabled when CanExploitSymmetry<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
5469 template<
typename MT
5471 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
5472 addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
5479 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
5480 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate sizes: the target has no rows or no columns, or the inner dimension is zero.
5482 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand the operands and the scalar factor to the kernel selection.
5496 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Selects the kernel for the scaled addition assignment.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
5511 template<
typename MT3
5515 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used for a diagonal right operand or when the number of elements of C is
// below DMATDMATMULT_THRESHOLD.
5517 if( ( IsDiagonal<MT5>::value ) ||
5518 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5519 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection takes over.
5521 selectBlasAddAssignKernel( C, A, B, scalar );
// Default scaled addition assignment kernel for the case that neither operand is diagonal.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
5539 template<
typename MT3
5543 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
5544 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Evaluate the scaled product serially into a temporary of type ResultType and add that
// temporary to C.
5546 const ResultType tmp(
serial( A * B * scalar ) );
5547 addAssign( C, tmp );
// Default scaled addition assignment kernel for a non-diagonal left operand and a diagonal
// right operand. Only the diagonal elements B(j,j) of the right operand are read.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
5565 template<
typename MT3
5569 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
5570 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5574 const size_t M( A.rows() );
5575 const size_t N( B.columns() );
5577 for(
size_t i=0UL; i<M; ++i )
// The column range of row i is narrowed according to the upper/lower structure of A.
5579 const size_t jbegin( ( IsUpper<MT4>::value )
5580 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
5582 const size_t jend( ( IsLower<MT4>::value )
5583 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part that is processed two columns at a time: the mask size_t(-2)
// clears the lowest bit of jnum, i.e. rounds the column count down to an even number.
5587 const size_t jnum( jend - jbegin );
5588 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
5590 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5591 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
5592 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the single column at index jpos.
5595 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default scaled addition assignment kernel for a diagonal left operand and a non-diagonal
// right operand. Only the diagonal elements A(i,i) of the left operand are read.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
5615 template<
typename MT3
5619 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
5620 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5624 const size_t M( A.rows() );
5625 const size_t N( B.columns() );
5627 for(
size_t i=0UL; i<M; ++i )
// The column range of row i is narrowed according to the upper/lower structure of B.
5629 const size_t jbegin( ( IsUpper<MT5>::value )
5630 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
5632 const size_t jend( ( IsLower<MT5>::value )
5633 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos marks the end of the part that is processed two columns at a time: the mask size_t(-2)
// clears the lowest bit of jnum, i.e. rounds the column count down to an even number.
5637 const size_t jnum( jend - jbegin );
5638 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
5640 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5641 C(i,j ) += A(i,i) * B(i,j ) * scalar;
5642 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the single column at index jpos.
5645 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default scaled addition assignment kernel for two diagonal operands.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
5665 template<
typename MT3
5669 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
5670 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only the diagonal of C is touched: one scaled product per row of A.
5674 for(
size_t i=0UL; i<A.rows(); ++i ) {
5675 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix scaled addition assignment kernel overload that is active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
5694 template<
typename MT3
5698 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5699 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No vectorization possible: use the default kernel.
5701 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for the scaled addition assignment, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and the target is a DenseMatrix<MT3,false>.
// NOTE(review): the 'false' flag presumably denotes row-major storage -- confirm.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// Every tile accumulates products set( A(i,k) ) * B.load(k,...) in intrinsic accumulators,
// multiplies the sums with 'factor' and adds them onto the current values of C via load/store.
// The k-range of each tile is narrowed according to the IsUpper/IsLower/IsStrictlyUpper/
// IsStrictlyLower traits of the two operands.
5720 template<
typename MT3
5724 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
5725 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
5727 typedef IntrinsicTrait<ElementType> IT;
5729 const size_t M( A.rows() );
5730 const size_t N( B.columns() );
5731 const size_t K( A.columns() );
// A scalar remainder loop is required unless both MT3 and MT5 are padded.
5733 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With a remainder, jpos is N masked with size_t(-IT::size); this rounds N down to a multiple of
// IT::size provided IT::size is a power of two (assumption -- TODO confirm). Otherwise jpos is N.
5735 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// The scalar factor as intrinsic value (via set()).
5738 const IntrinsicType factor(
set( scalar ) );
// Tile: one row of A against eight consecutive IT::size-wide column groups of B.
5743 for(
size_t i=0UL; i<M; ++i )
5745 const size_t kbegin( ( IsUpper<MT4>::value )
5746 ?( ( IsLower<MT5>::value )
5747 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5748 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5749 :( IsLower<MT5>::value ? j : 0UL ) );
5750 const size_t kend( ( IsLower<MT4>::value )
5751 ?( ( IsUpper<MT5>::value )
5752 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
5753 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
5754 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
5756 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5758 for(
size_t k=kbegin; k<kend; ++k ) {
5759 const IntrinsicType a1(
set( A(i,k) ) );
5760 xmm1 = xmm1 + a1 * B.load(k,j );
5761 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
5762 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
5763 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
5764 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
5765 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
5766 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
5767 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Add the scaled sums onto the current values of C.
5770 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5772 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) + xmm3 * factor );
5773 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) + xmm4 * factor );
5774 (~C).store( i, j+
IT::size*4UL, (~C).load(i,j+
IT::size*4UL) + xmm5 * factor );
5775 (~C).store( i, j+
IT::size*5UL, (~C).load(i,j+
IT::size*5UL) + xmm6 * factor );
5776 (~C).store( i, j+
IT::size*6UL, (~C).load(i,j+
IT::size*6UL) + xmm7 * factor );
5777 (~C).store( i, j+
IT::size*7UL, (~C).load(i,j+
IT::size*7UL) + xmm8 * factor );
// Tile: two rows of A against four column groups of B (xmm1-4: row i, xmm5-8: row i+1).
5785 for( ; (i+2UL) <= M; i+=2UL )
5787 const size_t kbegin( ( IsUpper<MT4>::value )
5788 ?( ( IsLower<MT5>::value )
5789 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5790 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5791 :( IsLower<MT5>::value ? j : 0UL ) );
5792 const size_t kend( ( IsLower<MT4>::value )
5793 ?( ( IsUpper<MT5>::value )
5794 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
5795 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5796 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
5798 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5800 for(
size_t k=kbegin; k<kend; ++k ) {
5801 const IntrinsicType a1(
set( A(i ,k) ) );
5802 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5803 const IntrinsicType b1( B.load(k,j ) );
5804 const IntrinsicType b2( B.load(k,j+
IT::size ) );
5805 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
5806 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
5807 xmm1 = xmm1 + a1 * b1;
5808 xmm2 = xmm2 + a1 * b2;
5809 xmm3 = xmm3 + a1 * b3;
5810 xmm4 = xmm4 + a1 * b4;
5811 xmm5 = xmm5 + a2 * b1;
5812 xmm6 = xmm6 + a2 * b2;
5813 xmm7 = xmm7 + a2 * b3;
5814 xmm8 = xmm8 + a2 * b4;
5817 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5819 (~C).store( i , j+
IT::size*2UL, (~C).load(i ,j+
IT::size*2UL) + xmm3 * factor );
5820 (~C).store( i , j+
IT::size*3UL, (~C).load(i ,j+
IT::size*3UL) + xmm4 * factor );
5821 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
5822 (~C).store( i+1UL, j+
IT::size , (~C).load(i+1UL,j+
IT::size ) + xmm6 * factor );
5823 (~C).store( i+1UL, j+
IT::size*2UL, (~C).load(i+1UL,j+
IT::size*2UL) + xmm7 * factor );
5824 (~C).store( i+1UL, j+
IT::size*3UL, (~C).load(i+1UL,j+
IT::size*3UL) + xmm8 * factor );
// Tile: a single row of A against four column groups of B.
5829 const size_t kbegin( ( IsUpper<MT4>::value )
5830 ?( ( IsLower<MT5>::value )
5831 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5832 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5833 :( IsLower<MT5>::value ? j : 0UL ) );
5834 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
5836 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5838 for(
size_t k=kbegin; k<kend; ++k ) {
5839 const IntrinsicType a1(
set( A(i,k) ) );
5840 xmm1 = xmm1 + a1 * B.load(k,j );
5841 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
5842 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
5843 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
5846 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
5848 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) + xmm3 * factor );
5849 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) + xmm4 * factor );
// Tile: two rows of A against two column groups of B.
5857 for( ; (i+2UL) <= M; i+=2UL )
5859 const size_t kbegin( ( IsUpper<MT4>::value )
5860 ?( ( IsLower<MT5>::value )
5861 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5862 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5863 :( IsLower<MT5>::value ? j : 0UL ) );
5864 const size_t kend( ( IsLower<MT4>::value )
5865 ?( ( IsUpper<MT5>::value )
5866 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
5867 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
5868 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
5870 IntrinsicType xmm1, xmm2, xmm3, xmm4;
5872 for(
size_t k=kbegin; k<kend; ++k ) {
5873 const IntrinsicType a1(
set( A(i ,k) ) );
5874 const IntrinsicType a2(
set( A(i+1UL,k) ) );
5875 const IntrinsicType b1( B.load(k,j ) );
5876 const IntrinsicType b2( B.load(k,j+
IT::size) );
5877 xmm1 = xmm1 + a1 * b1;
5878 xmm2 = xmm2 + a1 * b2;
5879 xmm3 = xmm3 + a2 * b1;
5880 xmm4 = xmm4 + a2 * b2;
5883 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
5885 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
5886 (~C).store( i+1UL, j+
IT::size, (~C).load(i+1UL,j+
IT::size) + xmm4 * factor );
// Tile: a single row of A against two column groups of B.
5891 const size_t kbegin( ( IsUpper<MT4>::value )
5892 ?( ( IsLower<MT5>::value )
5893 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5894 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5895 :( IsLower<MT5>::value ? j : 0UL ) );
5896 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
5898 IntrinsicType xmm1, xmm2;
5900 for(
size_t k=kbegin; k<kend; ++k ) {
5901 const IntrinsicType a1(
set( A(i,k) ) );
5902 xmm1 = xmm1 + a1 * B.load(k,j );
5903 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
5906 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
// Tile: two rows of A against a single column group of B.
5915 for( ; (i+2UL) <= M; i+=2UL )
5917 const size_t kbegin( ( IsUpper<MT4>::value )
5918 ?( ( IsLower<MT5>::value )
5919 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5920 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5921 :( IsLower<MT5>::value ? j : 0UL ) );
5922 const size_t kend( ( IsLower<MT4>::value )
5923 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5926 IntrinsicType xmm1, xmm2;
5928 for(
size_t k=kbegin; k<kend; ++k ) {
5929 const IntrinsicType b1( B.load(k,j) );
5930 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
5931 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
5934 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
5935 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Tile: a single row of A against a single column group of B; k runs up to K.
5940 const size_t kbegin( ( IsUpper<MT4>::value )
5941 ?( ( IsLower<MT5>::value )
5942 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5943 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5944 :( IsLower<MT5>::value ? j : 0UL ) );
5948 for(
size_t k=kbegin; k<K; ++k ) {
5949 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
5952 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar remainder loop over the remaining columns; only entered when 'remainder' is set.
5956 for( ; remainder && j<N; ++j )
// Two rows per iteration, accumulated in plain scalar values.
5960 for( ; (i+2UL) <= M; i+=2UL )
5962 const size_t kbegin( ( IsUpper<MT4>::value )
5963 ?( ( IsLower<MT5>::value )
5964 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5965 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5966 :( IsLower<MT5>::value ? j : 0UL ) );
5967 const size_t kend( ( IsLower<MT4>::value )
5968 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
5974 for(
size_t k=kbegin; k<kend; ++k ) {
5975 value1 += A(i ,k) * B(k,j);
5976 value2 += A(i+1UL,k) * B(k,j);
5979 (~C)(i ,j) += value1 * scalar;
5980 (~C)(i+1UL,j) += value2 * scalar;
// Single row of the scalar remainder; k runs up to K.
5985 const size_t kbegin( ( IsUpper<MT4>::value )
5986 ?( ( IsLower<MT5>::value )
5987 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
5988 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
5989 :( IsLower<MT5>::value ? j : 0UL ) );
5993 for(
size_t k=kbegin; k<K; ++k ) {
5994 value += A(i,k) * B(k,j);
5997 (~C)(i,j) += value * scalar;
// Vectorized small-matrix scaled addition assignment kernel overload taking the target as
// DenseMatrix<MT3,true>. It is enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the 'true' flag presumably denotes column-major storage -- confirm.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// In every branch one of the two operands is replaced by 'tmp' and the addition assignment is
// re-issued with the modified product.
// NOTE(review): 'tmp' is presumably a converted copy of the replaced operand -- confirm.
6018 template<
typename MT3
6022 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6023 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only MT5 is resizable: 'tmp' takes the place of A.
6030 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
6032 addAssign( ~C, tmp * B * scalar );
// Only MT4 is resizable: 'tmp' takes the place of B.
6034 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
6036 addAssign( ~C, A * tmp * scalar );
// Otherwise decide by element count: A is replaced if it is not larger than B ...
6038 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6040 addAssign( ~C, tmp * B * scalar );
// ... and B is replaced in the remaining case.
6044 addAssign( ~C, A * tmp * scalar );
// Large-matrix scaled addition assignment kernel overload that is active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6063 template<
typename MT3
6067 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6068 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No vectorization possible: use the default kernel.
6070 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized large-matrix kernel for the scaled addition assignment, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and the target is a DenseMatrix<MT3,false>.
// NOTE(review): the 'false' flag presumably denotes row-major storage -- confirm.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// The iteration space is split into blocks of DMATDMATMULT_JBLOCK_SIZE columns,
// DMATDMATMULT_IBLOCK_SIZE rows and DMATDMATMULT_KBLOCK_SIZE inner indices. Within a block the
// tiles accumulate products set( A(i,k) ) * B.load(k,...) in intrinsic accumulators, multiply
// the sums with 'factor' and add them onto the current values of C via load/store.
// NOTE(review): j1, j2 and j3 presumably denote the column offsets j+IT::size, j+IT::size*2UL
// and j+IT::size*3UL -- confirm against their definition.
6089 template<
typename MT3
6093 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6094 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6096 typedef IntrinsicTrait<ElementType> IT;
6098 const size_t M( A.rows() );
6099 const size_t N( B.columns() );
6100 const size_t K( A.columns() );
// A scalar remainder loop is required unless both MT3 and MT5 are padded.
6102 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// The scalar factor as intrinsic value (via set()).
6104 const IntrinsicType factor(
set( scalar ) );
// Column blocks of at most DMATDMATMULT_JBLOCK_SIZE columns.
6106 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
6108 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With a remainder, jpos is jend masked with size_t(-IT::size); this rounds jend down to a
// multiple of IT::size provided IT::size is a power of two (assumption -- TODO confirm).
6110 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Row blocks of at most DMATDMATMULT_IBLOCK_SIZE rows.
6113 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
6115 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Blocks of at most DMATDMATMULT_KBLOCK_SIZE inner indices; ktmp is the end of the current block.
6117 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
6119 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// Tile: two rows of A against four column groups of B (xmm1-4: row i, xmm5-8: row i+1).
6131 for( ; (i+2UL) <= iend; i+=2UL )
6133 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6134 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6135 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6136 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
6138 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6140 for(
size_t k=kbegin; k<kend; ++k ) {
6141 const IntrinsicType a1(
set( A(i ,k) ) );
6142 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6143 const IntrinsicType b1( B.load(k,j ) );
6144 const IntrinsicType b2( B.load(k,j1) );
6145 const IntrinsicType b3( B.load(k,j2) );
6146 const IntrinsicType b4( B.load(k,j3) );
6147 xmm1 = xmm1 + a1 * b1;
6148 xmm2 = xmm2 + a1 * b2;
6149 xmm3 = xmm3 + a1 * b3;
6150 xmm4 = xmm4 + a1 * b4;
6151 xmm5 = xmm5 + a2 * b1;
6152 xmm6 = xmm6 + a2 * b2;
6153 xmm7 = xmm7 + a2 * b3;
6154 xmm8 = xmm8 + a2 * b4;
// Add the scaled sums onto the current values of C.
6157 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6158 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6159 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
6160 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
6161 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
6162 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
6163 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
6164 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Tile: a single row of A against four column groups of B.
6169 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6170 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6171 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6172 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
6174 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6176 for(
size_t k=kbegin; k<kend; ++k ) {
6177 const IntrinsicType a1(
set( A(i,k) ) );
6178 xmm1 = xmm1 + a1 * B.load(k,j );
6179 xmm2 = xmm2 + a1 * B.load(k,j1);
6180 xmm3 = xmm3 + a1 * B.load(k,j2);
6181 xmm4 = xmm4 + a1 * B.load(k,j3);
6184 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6185 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
6186 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
6187 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// Tile: four rows of A against two column groups of B (two accumulators per row).
6197 for( ; (i+4UL) <= iend; i+=4UL )
6199 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6200 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6201 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
6202 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
6204 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6206 for(
size_t k=kbegin; k<kend; ++k ) {
6207 const IntrinsicType a1(
set( A(i ,k) ) );
6208 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6209 const IntrinsicType a3(
set( A(i+2UL,k) ) );
6210 const IntrinsicType a4(
set( A(i+3UL,k) ) );
6211 const IntrinsicType b1( B.load(k,j ) );
6212 const IntrinsicType b2( B.load(k,j1) );
6213 xmm1 = xmm1 + a1 * b1;
6214 xmm2 = xmm2 + a1 * b2;
6215 xmm3 = xmm3 + a2 * b1;
6216 xmm4 = xmm4 + a2 * b2;
6217 xmm5 = xmm5 + a3 * b1;
6218 xmm6 = xmm6 + a3 * b2;
6219 xmm7 = xmm7 + a4 * b1;
6220 xmm8 = xmm8 + a4 * b2;
6223 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6224 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6225 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6226 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
6227 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
6228 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
6229 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
6230 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Tile: two rows of A against two column groups of B.
6233 for( ; (i+2UL) <= iend; i+=2UL )
6235 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6236 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6237 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
6238 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
6240 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6242 for(
size_t k=kbegin; k<kend; ++k ) {
6243 const IntrinsicType a1(
set( A(i ,k) ) );
6244 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6245 const IntrinsicType b1( B.load(k,j ) );
6246 const IntrinsicType b2( B.load(k,j1) );
6247 xmm1 = xmm1 + a1 * b1;
6248 xmm2 = xmm2 + a1 * b2;
6249 xmm3 = xmm3 + a2 * b1;
6250 xmm4 = xmm4 + a2 * b2;
6253 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
6254 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
6255 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
6256 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Tile: a single row of A against two column groups of B.
6261 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6262 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6263 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6264 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
6266 IntrinsicType xmm1, xmm2;
6268 for(
size_t k=kbegin; k<kend; ++k ) {
6269 const IntrinsicType a1(
set( A(i,k) ) );
6270 xmm1 = xmm1 + a1 * B.load(k,j );
6271 xmm2 = xmm2 + a1 * B.load(k,j1);
6274 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
6275 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// Tile: every row of the current row block against a single column group of B.
6281 for(
size_t i=ii; i<iend; ++i )
6283 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6284 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6285 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6286 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
6290 for(
size_t k=kbegin; k<kend; ++k ) {
6291 const IntrinsicType a1(
set( A(i,k) ) );
6292 xmm1 = xmm1 + a1 * B.load(k,j);
6295 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar remainder loop over the remaining columns of the block; only entered when 'remainder'
// is set.
6299 for( ; remainder && j<jend; ++j )
6301 for(
size_t i=ii; i<iend; ++i )
6303 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
6304 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
6305 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
6306 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
6310 for(
size_t k=kbegin; k<kend; ++k ) {
6311 value += A(i,k) * B(k,j);
6314 (~C)(i,j) += value * scalar;
// Large-matrix scaled addition assignment kernel overload taking the target as
// DenseMatrix<MT3,true>. It is enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the 'true' flag presumably denotes column-major storage -- confirm.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6337 template<
typename MT3
6341 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6342 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No dedicated large kernel for this target type: reuse the small-matrix kernel.
6344 selectSmallAddAssignKernel( ~C, A, B, scalar );
// BLAS scaled addition assignment kernel overload that is active when
// UseBlasKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6362 template<
typename MT3
6366 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6367 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The BLAS path is not applicable for these types: fall back to the large-matrix kernel.
6369 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS scaled addition assignment kernel overload that is active when
// UseBlasKernel<MT3,MT4,MT5,ST2> holds.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor (converted to ET before being handed to the BLAS wrapper).
6388 template<
typename MT3
6392 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
6393 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Triangular left operand: trmm() works on the temporary 'tmp' with A applied from the left;
// the temporary is added to C afterwards.
6397 if( IsTriangular<MT4>::value ) {
6399 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6400 addAssign( C, tmp );
// Triangular right operand: trmm() works on the temporary 'tmp' with B applied from the right;
// the temporary is added to C afterwards.
6402 else if( IsTriangular<MT5>::value ) {
6404 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
6405 addAssign( C, tmp );
// General case: gemm() with the scalar as first factor and ET(1) as second factor
// (presumably alpha and beta, i.e. the product is accumulated onto C -- confirm against the
// gemm() wrapper).
6408 gemm( C, A, B, ET(scalar), ET(1) );
// Restructuring addition assignment of the scaled matrix product to a Matrix<MT,true> target.
// The overload is enabled when CanExploitSymmetry<MT,MT1,MT2> holds, i.e. when at least one of
// the two operands is symmetric.
//
// \param lhs The target left-hand side matrix.
// \param rhs The right-hand side scaled multiplication expression to be added.
6428 template<
typename MT >
6429 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6430 addAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
6439 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6440 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Every symmetric operand is wrapped in trans(). Transposing a symmetric matrix does not change
// its values, so the product is unchanged.
// NOTE(review): presumably done to obtain operands in a more favorable storage order -- confirm.
6442 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
6443 addAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
6444 else if( IsSymmetric<MT1>::value )
6445 addAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case: only the right operand is transposed.
6447 addAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// Subtraction assignment of the scaled matrix product to a dense matrix target. The overload is
// disabled when CanExploitSymmetry<MT,MT1,MT2> holds.
//
// \param lhs The target left-hand side dense matrix.
// \param rhs The right-hand side scaled multiplication expression to be subtracted.
6467 template<
typename MT
6469 friend inline typename DisableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
6470 subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped matrix product.
6477 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
6478 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Degenerate sizes: the target has no rows or no columns, or the inner dimension is zero.
6480 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand the operands and the scalar factor to the kernel selection.
6494 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Selects the kernel for the scaled subtraction assignment.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6509 template<
typename MT3
6513 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used for a diagonal right operand or when the number of elements of C is
// below DMATDMATMULT_THRESHOLD.
6515 if( ( IsDiagonal<MT5>::value ) ||
6516 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6517 selectSmallSubAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection takes over.
6519 selectBlasSubAssignKernel( C, A, B, scalar );
// Default scaled subtraction assignment kernel for the case that neither operand is diagonal.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6537 template<
typename MT3
6541 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >::Type
6542 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Evaluate the scaled product serially into a temporary of type ResultType and subtract that
// temporary from C.
6544 const ResultType tmp(
serial( A * B * scalar ) );
6545 subAssign( C, tmp );
// Default scaled subtraction assignment kernel for a non-diagonal left operand and a diagonal
// right operand. Only the diagonal elements B(j,j) of the right operand are read.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
6563 template<
typename MT3
6567 static inline typename EnableIf< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >::Type
6568 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6572 const size_t M( A.rows() );
6573 const size_t N( B.columns() );
6575 for(
size_t i=0UL; i<M; ++i )
// The column range of row i is narrowed according to the upper/lower structure of A.
6577 const size_t jbegin( ( IsUpper<MT4>::value )
6578 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6580 const size_t jend( ( IsLower<MT4>::value )
6581 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// jpos marks the end of the part that is processed two columns at a time: the mask size_t(-2)
// clears the lowest bit of jnum, i.e. rounds the column count down to an even number.
6585 const size_t jnum( jend - jbegin );
6586 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
6588 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6589 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
6590 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the single column at index jpos.
6593 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default scaled subtraction assignment kernel for a diagonal left operand and a non-diagonal
// right operand. Only the diagonal elements A(i,i) of the left operand are read.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6613 template<
typename MT3
6617 static inline typename EnableIf< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >::Type
6618 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6622 const size_t M( A.rows() );
6623 const size_t N( B.columns() );
6625 for(
size_t i=0UL; i<M; ++i )
// The column range of row i is narrowed according to the upper/lower structure of B.
6627 const size_t jbegin( ( IsUpper<MT5>::value )
6628 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6630 const size_t jend( ( IsLower<MT5>::value )
6631 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos marks the end of the part that is processed two columns at a time: the mask size_t(-2)
// clears the lowest bit of jnum, i.e. rounds the column count down to an even number.
6635 const size_t jnum( jend - jbegin );
6636 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
6638 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6639 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
6640 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update of the single column at index jpos.
6643 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default scaled subtraction assignment kernel for two diagonal operands.
//
// \param C The target left-hand side matrix.
// \param A The left-hand side (diagonal) multiplication operand.
// \param B The right-hand side (diagonal) multiplication operand.
// \param scalar The scaling factor.
6663 template<
typename MT3
6667 static inline typename EnableIf< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >::Type
6668 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only the diagonal of C is touched: one scaled product per row of A.
6672 for(
size_t i=0UL; i<A.rows(); ++i ) {
6673 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix scaled subtraction assignment kernel overload that is active when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf).
//
// \param C The target left-hand side matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
6692 template<
typename MT3
6696 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6697 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// No vectorization possible: use the default kernel.
6699 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for the scaled subtraction assignment, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds and the target is a DenseMatrix<MT3,false>.
// NOTE(review): the 'false' flag presumably denotes row-major storage -- confirm.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// Every tile accumulates products set( A(i,k) ) * B.load(k,...) in intrinsic accumulators,
// multiplies the sums with 'factor' and subtracts them from the current values of C via
// load/store. The k-range of each tile is narrowed according to the IsUpper/IsLower/
// IsStrictlyUpper/IsStrictlyLower traits of the two operands.
6718 template<
typename MT3
6722 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
6723 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6725 typedef IntrinsicTrait<ElementType> IT;
6727 const size_t M( A.rows() );
6728 const size_t N( B.columns() );
6729 const size_t K( A.columns() );
// A scalar remainder loop is required unless both MT3 and MT5 are padded.
6731 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With a remainder, jpos is N masked with size_t(-IT::size); this rounds N down to a multiple of
// IT::size provided IT::size is a power of two (assumption -- TODO confirm). Otherwise jpos is N.
6733 const size_t jpos( remainder ? ( N &
size_t(-
IT::size) ) : N );
// The scalar factor as intrinsic value (via set()).
6736 const IntrinsicType factor(
set( scalar ) );
// Tile: one row of A against eight consecutive IT::size-wide column groups of B.
6741 for(
size_t i=0UL; i<M; ++i )
6743 const size_t kbegin( ( IsUpper<MT4>::value )
6744 ?( ( IsLower<MT5>::value )
6745 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6746 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6747 :( IsLower<MT5>::value ? j : 0UL ) );
6748 const size_t kend( ( IsLower<MT4>::value )
6749 ?( ( IsUpper<MT5>::value )
6750 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+
IT::size*8UL, K ) )
6751 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6752 :( IsUpper<MT5>::value ?
min( j+
IT::size*8UL, K ) : K ) );
6754 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6756 for(
size_t k=kbegin; k<kend; ++k ) {
6757 const IntrinsicType a1(
set( A(i,k) ) );
6758 xmm1 = xmm1 + a1 * B.load(k,j );
6759 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
6760 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
6761 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
6762 xmm5 = xmm5 + a1 * B.load(k,j+
IT::size*4UL);
6763 xmm6 = xmm6 + a1 * B.load(k,j+
IT::size*5UL);
6764 xmm7 = xmm7 + a1 * B.load(k,j+
IT::size*6UL);
6765 xmm8 = xmm8 + a1 * B.load(k,j+
IT::size*7UL);
// Subtract the scaled sums from the current values of C.
6768 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6770 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) - xmm3 * factor );
6771 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) - xmm4 * factor );
6772 (~C).store( i, j+
IT::size*4UL, (~C).load(i,j+
IT::size*4UL) - xmm5 * factor );
6773 (~C).store( i, j+
IT::size*5UL, (~C).load(i,j+
IT::size*5UL) - xmm6 * factor );
6774 (~C).store( i, j+
IT::size*6UL, (~C).load(i,j+
IT::size*6UL) - xmm7 * factor );
6775 (~C).store( i, j+
IT::size*7UL, (~C).load(i,j+
IT::size*7UL) - xmm8 * factor );
// Tile: two rows of A against four column groups of B (xmm1-4: row i, xmm5-8: row i+1).
6783 for( ; (i+2UL) <= M; i+=2UL )
6785 const size_t kbegin( ( IsUpper<MT4>::value )
6786 ?( ( IsLower<MT5>::value )
6787 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6788 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6789 :( IsLower<MT5>::value ? j : 0UL ) );
6790 const size_t kend( ( IsLower<MT4>::value )
6791 ?( ( IsUpper<MT5>::value )
6792 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*4UL, K ) )
6793 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6794 :( IsUpper<MT5>::value ?
min( j+
IT::size*4UL, K ) : K ) );
6796 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6798 for(
size_t k=kbegin; k<kend; ++k ) {
6799 const IntrinsicType a1(
set( A(i ,k) ) );
6800 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6801 const IntrinsicType b1( B.load(k,j ) );
6802 const IntrinsicType b2( B.load(k,j+
IT::size ) );
6803 const IntrinsicType b3( B.load(k,j+
IT::size*2UL) );
6804 const IntrinsicType b4( B.load(k,j+
IT::size*3UL) );
6805 xmm1 = xmm1 + a1 * b1;
6806 xmm2 = xmm2 + a1 * b2;
6807 xmm3 = xmm3 + a1 * b3;
6808 xmm4 = xmm4 + a1 * b4;
6809 xmm5 = xmm5 + a2 * b1;
6810 xmm6 = xmm6 + a2 * b2;
6811 xmm7 = xmm7 + a2 * b3;
6812 xmm8 = xmm8 + a2 * b4;
6815 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6817 (~C).store( i , j+
IT::size*2UL, (~C).load(i ,j+
IT::size*2UL) - xmm3 * factor );
6818 (~C).store( i , j+
IT::size*3UL, (~C).load(i ,j+
IT::size*3UL) - xmm4 * factor );
6819 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
6820 (~C).store( i+1UL, j+
IT::size , (~C).load(i+1UL,j+
IT::size ) - xmm6 * factor );
6821 (~C).store( i+1UL, j+
IT::size*2UL, (~C).load(i+1UL,j+
IT::size*2UL) - xmm7 * factor );
6822 (~C).store( i+1UL, j+
IT::size*3UL, (~C).load(i+1UL,j+
IT::size*3UL) - xmm8 * factor );
// Tile: a single row of A against four column groups of B.
6827 const size_t kbegin( ( IsUpper<MT4>::value )
6828 ?( ( IsLower<MT5>::value )
6829 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6830 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6831 :( IsLower<MT5>::value ? j : 0UL ) );
6832 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, K ) ):( K ) );
6834 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6836 for(
size_t k=kbegin; k<kend; ++k ) {
6837 const IntrinsicType a1(
set( A(i,k) ) );
6838 xmm1 = xmm1 + a1 * B.load(k,j );
6839 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size );
6840 xmm3 = xmm3 + a1 * B.load(k,j+
IT::size*2UL);
6841 xmm4 = xmm4 + a1 * B.load(k,j+
IT::size*3UL);
6844 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
6846 (~C).store( i, j+
IT::size*2UL, (~C).load(i,j+
IT::size*2UL) - xmm3 * factor );
6847 (~C).store( i, j+
IT::size*3UL, (~C).load(i,j+
IT::size*3UL) - xmm4 * factor );
// Tile: two rows of A against two column groups of B.
6855 for( ; (i+2UL) <= M; i+=2UL )
6857 const size_t kbegin( ( IsUpper<MT4>::value )
6858 ?( ( IsLower<MT5>::value )
6859 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6860 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6861 :( IsLower<MT5>::value ? j : 0UL ) );
6862 const size_t kend( ( IsLower<MT4>::value )
6863 ?( ( IsUpper<MT5>::value )
6864 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+
IT::size*2UL, K ) )
6865 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6866 :( IsUpper<MT5>::value ?
min( j+
IT::size*2UL, K ) : K ) );
6868 IntrinsicType xmm1, xmm2, xmm3, xmm4;
6870 for(
size_t k=kbegin; k<kend; ++k ) {
6871 const IntrinsicType a1(
set( A(i ,k) ) );
6872 const IntrinsicType a2(
set( A(i+1UL,k) ) );
6873 const IntrinsicType b1( B.load(k,j ) );
6874 const IntrinsicType b2( B.load(k,j+
IT::size) );
6875 xmm1 = xmm1 + a1 * b1;
6876 xmm2 = xmm2 + a1 * b2;
6877 xmm3 = xmm3 + a2 * b1;
6878 xmm4 = xmm4 + a2 * b2;
6881 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
6883 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
6884 (~C).store( i+1UL, j+
IT::size, (~C).load(i+1UL,j+
IT::size) - xmm4 * factor );
// Tile: a single row of A against two column groups of B.
6889 const size_t kbegin( ( IsUpper<MT4>::value )
6890 ?( ( IsLower<MT5>::value )
6891 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6892 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6893 :( IsLower<MT5>::value ? j : 0UL ) );
6894 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, K ) ):( K ) );
6896 IntrinsicType xmm1, xmm2;
6898 for(
size_t k=kbegin; k<kend; ++k ) {
6899 const IntrinsicType a1(
set( A(i,k) ) );
6900 xmm1 = xmm1 + a1 * B.load(k,j );
6901 xmm2 = xmm2 + a1 * B.load(k,j+
IT::size);
6904 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
// Tile: two rows of A against a single column group of B.
6913 for( ; (i+2UL) <= M; i+=2UL )
6915 const size_t kbegin( ( IsUpper<MT4>::value )
6916 ?( ( IsLower<MT5>::value )
6917 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6918 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6919 :( IsLower<MT5>::value ? j : 0UL ) );
6920 const size_t kend( ( IsLower<MT4>::value )
6921 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6924 IntrinsicType xmm1, xmm2;
6926 for(
size_t k=kbegin; k<kend; ++k ) {
6927 const IntrinsicType b1( B.load(k,j) );
6928 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
6929 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
6932 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
6933 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// Tile: a single row of A against a single column group of B; k runs up to K.
6938 const size_t kbegin( ( IsUpper<MT4>::value )
6939 ?( ( IsLower<MT5>::value )
6940 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6941 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6942 :( IsLower<MT5>::value ? j : 0UL ) );
6946 for(
size_t k=kbegin; k<K; ++k ) {
6947 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
6950 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar remainder loop over the remaining columns; only entered when 'remainder' is set.
6954 for( ; remainder && j<N; ++j )
// Two rows per iteration, accumulated in plain scalar values.
6958 for( ; (i+2UL) <= M; i+=2UL )
6960 const size_t kbegin( ( IsUpper<MT4>::value )
6961 ?( ( IsLower<MT5>::value )
6962 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6963 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6964 :( IsLower<MT5>::value ? j : 0UL ) );
6965 const size_t kend( ( IsLower<MT4>::value )
6966 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6972 for(
size_t k=kbegin; k<kend; ++k ) {
6973 value1 += A(i ,k) * B(k,j);
6974 value2 += A(i+1UL,k) * B(k,j);
6977 (~C)(i ,j) -= value1 * scalar;
6978 (~C)(i+1UL,j) -= value2 * scalar;
// Single row of the scalar remainder; k runs up to K.
6983 const size_t kbegin( ( IsUpper<MT4>::value )
6984 ?( ( IsLower<MT5>::value )
6985 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6986 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6987 :( IsLower<MT5>::value ? j : 0UL ) );
6991 for(
size_t k=kbegin; k<K; ++k ) {
6992 value += A(i,k) * B(k,j);
6995 (~C)(i,j) -= value * scalar;
// Vectorized small-matrix scaled subtraction assignment kernel overload taking the target as
// DenseMatrix<MT3,true>. It is enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// NOTE(review): the 'true' flag presumably denotes column-major storage -- confirm.
//
// \param C The target left-hand side dense matrix.
// \param A The left-hand side multiplication operand.
// \param B The right-hand side multiplication operand.
// \param scalar The scaling factor.
//
// In every branch one of the two operands is replaced by 'tmp' and the subtraction assignment
// is re-issued with the modified product.
// NOTE(review): 'tmp' is presumably a converted copy of the replaced operand -- confirm.
7015 template<
typename MT3
7019 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7020 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Only MT5 is resizable: 'tmp' takes the place of A.
7027 if( !IsResizable<MT4>::value && IsResizable<MT5>::value ) {
7029 subAssign( ~C, tmp * B * scalar );
// Only MT4 is resizable: 'tmp' takes the place of B.
7031 else if( IsResizable<MT4>::value && !IsResizable<MT5>::value ) {
7033 subAssign( ~C, A * tmp * scalar );
// Otherwise decide by element count: A is replaced if it is not larger than B ...
7035 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7037 subAssign( ~C, tmp * B * scalar );
// ... and B is replaced in the remaining case.
7041 subAssign( ~C, A * tmp * scalar );
// Fallback overload of the large-matrix subtraction-assignment kernel: selected (via DisableIf) when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all four arguments
// unchanged to selectDefaultSubAssignKernel().
7060 template<
typename MT3
7064 static inline typename DisableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7065 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7067 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized large-matrix subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), SFINAE-enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The visible code accumulates products A(i,k)*B(k,j..) in IntrinsicType registers and then writes
// C(i,j..) = C(i,j..) - accumulator * factor, iterating in blocks over j, i and k.
// NOTE(review): this listing elides braces, the loops over j, and the declarations of i, j, j1, j2,
// j3 and `value` - presumably j1..j3 are j offset by multiples of IT::size; confirm in the full file.
7086 template<
typename MT3
7090 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7091 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7093 typedef IntrinsicTrait<ElementType> IT;
// Problem dimensions: C is M x N, the contraction runs over K.
7095 const size_t M( A.rows() );
7096 const size_t N( B.columns() );
7097 const size_t K( A.columns() );
// A scalar clean-up pass is needed unless both C's type and B's type are padded.
7099 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// The scalar factor broadcast into an intrinsic vector, applied once per store.
7101 const IntrinsicType factor(
set( scalar ) );
// Outermost blocking: column blocks [jj,jend) of width DMATDMATMULT_JBLOCK_SIZE.
7103 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_JBLOCK_SIZE )
7105 const size_t jend(
min( jj+DMATDMATMULT_JBLOCK_SIZE, N ) );
// With a remainder, jpos is jend masked with size_t(-IT::size); this rounds jend down to a
// multiple of IT::size provided IT::size is a power of two (TODO confirm). Otherwise jpos == jend.
7107 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Row blocks [ii,iend) of height DMATDMATMULT_IBLOCK_SIZE.
7110 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_IBLOCK_SIZE )
7112 const size_t iend(
min( ii+DMATDMATMULT_IBLOCK_SIZE, M ) );
// Contraction blocks [kk,ktmp) of depth DMATDMATMULT_KBLOCK_SIZE.
7114 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_KBLOCK_SIZE )
7116 const size_t ktmp(
min( kk+DMATDMATMULT_KBLOCK_SIZE, K ) );
// --- Tile: 2 rows x 4 intrinsic vectors of columns (j, j1, j2, j3) ---
7128 for( ; (i+2UL) <= iend; i+=2UL )
// The k range is clipped to the current block and narrowed for triangular operands:
// it starts at max(i,kk) if MT4 is upper and at max(j,kk) if MT5 is lower; it ends at
// i+2 if MT4 is lower and at min(j+IT::size*4,ktmp) if MT5 is upper.
7130 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7131 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7132 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7133 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
// Eight accumulators: xmm1..4 for row i, xmm5..8 for row i+1.
7135 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7137 for(
size_t k=kbegin; k<kend; ++k ) {
7138 const IntrinsicType a1(
set( A(i ,k) ) );
7139 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7140 const IntrinsicType b1( B.load(k,j ) );
7141 const IntrinsicType b2( B.load(k,j1) );
7142 const IntrinsicType b3( B.load(k,j2) );
7143 const IntrinsicType b4( B.load(k,j3) );
7144 xmm1 = xmm1 + a1 * b1;
7145 xmm2 = xmm2 + a1 * b2;
7146 xmm3 = xmm3 + a1 * b3;
7147 xmm4 = xmm4 + a1 * b4;
7148 xmm5 = xmm5 + a2 * b1;
7149 xmm6 = xmm6 + a2 * b2;
7150 xmm7 = xmm7 + a2 * b3;
7151 xmm8 = xmm8 + a2 * b4;
// Subtract the scaled partial products from the current contents of C.
7154 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7155 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7156 (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
7157 (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
7158 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
7159 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
7160 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
7161 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
// --- Tile: 1 row x 4 intrinsic vectors (the lower-triangular bound becomes i+1) ---
// NOTE(review): the guard selecting this single-row case is elided - presumably a leftover row.
7166 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7167 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7168 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7169 ( IsUpper<MT5>::value )?(
min( j+
IT::size*4UL, ktmp ) ):( ktmp ) ) );
7171 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7173 for(
size_t k=kbegin; k<kend; ++k ) {
7174 const IntrinsicType a1(
set( A(i,k) ) );
7175 xmm1 = xmm1 + a1 * B.load(k,j );
7176 xmm2 = xmm2 + a1 * B.load(k,j1);
7177 xmm3 = xmm3 + a1 * B.load(k,j2);
7178 xmm4 = xmm4 + a1 * B.load(k,j3);
7181 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7182 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
7183 (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
7184 (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
// --- Tile: 4 rows x 2 intrinsic vectors of columns (j, j1) ---
7194 for( ; (i+4UL) <= iend; i+=4UL )
7196 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7197 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7198 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7199 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
// Eight accumulators: two per row for rows i .. i+3.
7201 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7203 for(
size_t k=kbegin; k<kend; ++k ) {
7204 const IntrinsicType a1(
set( A(i ,k) ) );
7205 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7206 const IntrinsicType a3(
set( A(i+2UL,k) ) );
7207 const IntrinsicType a4(
set( A(i+3UL,k) ) );
7208 const IntrinsicType b1( B.load(k,j ) );
7209 const IntrinsicType b2( B.load(k,j1) );
7210 xmm1 = xmm1 + a1 * b1;
7211 xmm2 = xmm2 + a1 * b2;
7212 xmm3 = xmm3 + a2 * b1;
7213 xmm4 = xmm4 + a2 * b2;
7214 xmm5 = xmm5 + a3 * b1;
7215 xmm6 = xmm6 + a3 * b2;
7216 xmm7 = xmm7 + a4 * b1;
7217 xmm8 = xmm8 + a4 * b2;
7220 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7221 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7222 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7223 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
7224 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
7225 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
7226 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
7227 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
// --- Tile: 2 rows x 2 intrinsic vectors ---
7230 for( ; (i+2UL) <= iend; i+=2UL )
7232 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7233 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7234 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7235 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7237 IntrinsicType xmm1, xmm2, xmm3, xmm4;
7239 for(
size_t k=kbegin; k<kend; ++k ) {
7240 const IntrinsicType a1(
set( A(i ,k) ) );
7241 const IntrinsicType a2(
set( A(i+1UL,k) ) );
7242 const IntrinsicType b1( B.load(k,j ) );
7243 const IntrinsicType b2( B.load(k,j1) );
7244 xmm1 = xmm1 + a1 * b1;
7245 xmm2 = xmm2 + a1 * b2;
7246 xmm3 = xmm3 + a2 * b1;
7247 xmm4 = xmm4 + a2 * b2;
7250 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
7251 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
7252 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
7253 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
// --- Tile: 1 row x 2 intrinsic vectors ---
// NOTE(review): the guard selecting this single-row case is elided - presumably a leftover row.
7258 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7259 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7260 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7261 ( IsUpper<MT5>::value )?(
min( j+
IT::size*2UL, ktmp ) ):( ktmp ) ) );
7263 IntrinsicType xmm1, xmm2;
7265 for(
size_t k=kbegin; k<kend; ++k ) {
7266 const IntrinsicType a1(
set( A(i,k) ) );
7267 xmm1 = xmm1 + a1 * B.load(k,j );
7268 xmm2 = xmm2 + a1 * B.load(k,j1);
7271 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
7272 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
// --- Single intrinsic vector of columns: every row of the block handled one at a time ---
7278 for(
size_t i=ii; i<iend; ++i )
7280 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7281 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7282 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7283 ( IsUpper<MT5>::value )?(
min( j+
IT::size, ktmp ) ):( ktmp ) ) );
// NOTE(review): the declaration of xmm1 for this loop is elided in the listing.
7287 for(
size_t k=kbegin; k<kend; ++k ) {
7288 const IntrinsicType a1(
set( A(i,k) ) );
7289 xmm1 = xmm1 + a1 * B.load(k,j);
7292 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// --- Scalar clean-up: only runs when `remainder` is true, one column j at a time up to jend ---
7296 for( ; remainder && j<jend; ++j )
7298 for(
size_t i=ii; i<iend; ++i )
7300 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7301 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7302 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7303 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// Element-wise accumulation into `value` (declaration elided), then C(i,j) -= value * scalar.
7307 for(
size_t k=kbegin; k<kend; ++k ) {
7308 value += A(i,k) * B(k,j);
7311 (~C)(i,j) -= value * scalar;
// Large-matrix subtraction-assignment kernel overload for a column-major target
// (DenseMatrix<MT3,true>), SFINAE-enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// It has no dedicated implementation in the visible code: it unwraps C with ~C and delegates to
// selectSmallSubAssignKernel().
7334 template<
typename MT3
7338 static inline typename EnableIf< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >::Type
7339 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7341 selectSmallSubAssignKernel( ~C, A, B, scalar );
// Fallback overload of the BLAS subtraction-assignment kernel: selected (via DisableIf) when
// UseBlasKernel<MT3,MT4,MT5,ST2> does not hold. It forwards all arguments unchanged to
// selectLargeSubAssignKernel().
7359 template<
typename MT3
7363 static inline typename DisableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7364 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7366 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-backed subtraction-assignment kernel, SFINAE-enabled when UseBlasKernel<MT3,MT4,MT5,ST2>
// holds. Triangular operands go through trmm() on a temporary that is then subtracted from C;
// the general case is a single gemm() call on C itself.
// NOTE(review): the declarations of `tmp` and `ET` are elided in this listing - presumably `tmp`
// is a copy of the non-triangular operand and ET the element type of C; confirm in the full file.
7385 template<
typename MT3
7389 static inline typename EnableIf< UseBlasKernel<MT3,MT4,MT5,ST2> >::Type
7390 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// A is triangular: trmm with A applied from the left (CblasLeft), lower/upper chosen from
// IsLower<MT4>, and the scalar passed as ET(scalar); the result in `tmp` is subtracted from C.
7394 if( IsTriangular<MT4>::value ) {
7396 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7397 subAssign( C, tmp );
// B is triangular: the same scheme with B applied from the right (CblasRight).
7399 else if( IsTriangular<MT5>::value ) {
7401 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7402 subAssign( C, tmp );
// General case: gemm called with ET(-scalar) and ET(1) - presumably alpha and beta, i.e.
// C = -scalar*A*B + C; confirm the argument order of the gemm wrapper.
7405 gemm( C, A, B, ET(-scalar), ET(1) );
// Subtraction assignment of a scaled matrix product to a column-major matrix (Matrix<MT,true>),
// SFINAE-enabled when CanExploitSymmetry<MT,MT1,MT2> holds. Each symmetric operand is wrapped in
// trans() before the product is re-dispatched to subAssign() together with rhs.scalar_.
// NOTE(review): the `MMM` typedef, braces and the final `else` keyword are elided in this listing.
7425 template<
typename MT >
7426 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7427 subAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7436 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7437 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Both operand types symmetric: transpose both.
7439 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7440 subAssign( ~lhs,
trans( left ) *
trans( right ) * rhs.scalar_ );
// Only the left operand type symmetric: transpose the left one.
7441 else if( IsSymmetric<MT1>::value )
7442 subAssign( ~lhs,
trans( left ) * right * rhs.scalar_ );
// Remaining case: transpose the right operand.
7444 subAssign( ~lhs, left *
trans( right ) * rhs.scalar_ );
// SMP assignment of a scaled matrix product to a dense matrix, SFINAE-enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): only the operand fetch and the two empty-size guards are visible here; the guard
// bodies and the main evaluation path are elided in this listing.
7475 template<
typename MT
7477 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7478 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7485 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7486 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Guard 1: the target has no rows or no columns.
7488 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Guard 2: the inner dimension is zero; unlike the add/sub variants this is a separate branch.
7491 else if( left.columns() == 0UL ) {
// SMP assignment of a scaled matrix product to a sparse matrix, SFINAE-enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds. The expression is first materialized into a temporary.
// NOTE(review): the statements that transfer `tmp` into lhs are elided in this listing.
7525 template<
typename MT
7527 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7528 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The temporary's type depends on the target's storage order SO - presumably OppositeType when
// SO is true and ResultType otherwise; confirm against SelectType's argument order.
7532 typedef typename SelectType< SO, OppositeType, ResultType >::Type TmpType;
// Evaluate the whole expression into the temporary.
7544 const TmpType tmp( rhs );
// SMP assignment of a scaled matrix product to a column-major matrix (Matrix<MT,true>),
// SFINAE-enabled when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): the statements inside the branches are elided in this listing - presumably they
// re-dispatch with trans() applied to the symmetric operand(s), as the visible subAssign() does.
7563 template<
typename MT >
7564 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7565 smpAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7574 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7575 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Branch on which operand types are symmetric: both, or only the left one.
7577 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7579 else if( IsSymmetric<MT1>::value )
// SMP addition assignment of a scaled matrix product to a dense matrix, SFINAE-enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the guard body and the main evaluation path are elided in this listing.
7601 template<
typename MT
7603 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7604 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7611 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7612 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Single guard covering an empty target or a zero inner dimension.
7614 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP addition assignment of a scaled matrix product to a column-major matrix (Matrix<MT,true>),
// SFINAE-enabled when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): the statements inside the branches are elided in this listing - presumably they
// re-dispatch with trans() applied to the symmetric operand(s); confirm in the full file.
7646 template<
typename MT >
7647 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7648 smpAddAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7657 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7658 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Branch on which operand types are symmetric: both, or only the left one.
7660 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7662 else if( IsSymmetric<MT1>::value )
// SMP subtraction assignment of a scaled matrix product to a dense matrix, SFINAE-enabled when
// IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the guard body and the main evaluation path are elided in this listing.
7688 template<
typename MT
7690 friend inline typename EnableIf< IsEvaluationRequired<MT,MT1,MT2> >::Type
7691 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7698 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7699 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Single guard covering an empty target or a zero inner dimension.
7701 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled matrix product to a column-major matrix
// (Matrix<MT,true>), SFINAE-enabled when CanExploitSymmetry<MT,MT1,MT2> holds.
// NOTE(review): the statements inside the branches are elided in this listing - presumably they
// re-dispatch with trans() applied to the symmetric operand(s); confirm in the full file.
7733 template<
typename MT >
7734 friend inline typename EnableIf< CanExploitSymmetry<MT,MT1,MT2> >::Type
7735 smpSubAssign( Matrix<MT,true>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped multiplication expression.
7744 typename MMM::LeftOperand left ( rhs.matrix_.leftOperand() );
7745 typename MMM::RightOperand right( rhs.matrix_.rightOperand() );
// Branch on which operand types are symmetric: both, or only the left one.
7747 if( IsSymmetric<MT1>::value && IsSymmetric<MT2>::value )
7749 else if( IsSymmetric<MT1>::value )
// Fragment of a free function template returning `const DMatDMatMultExpr<T1,T2>`.
// NOTE(review): name, parameters and body are elided - presumably the operator* for two row-major
// dense matrices that the listing's cross-reference places at DMatDMatMultExpr.h:7820; confirm.
7817 template<
typename T1
7819 inline const DMatDMatMultExpr<T1,T2>
// NOTE(review): the next two template headers have lost their declarations in this listing;
// nothing can be said about them from here beyond the (MT1, MT2) parameter lists.
7843 template<
typename MT1,
typename MT2 >
7860 template<
typename MT1,
typename MT2 >
// Trait specialization fragment: the value is true exactly when both IsAligned<MT1> and
// IsAligned<MT2> hold.
// NOTE(review): the struct header is elided - presumably IsAligned< DMatDMatMultExpr<MT1,MT2> >.
7877 template<
typename MT1,
typename MT2 >
7879 :
public IsTrue< And< IsAligned<MT1>, IsAligned<MT2> >::value >
// Trait specialization fragment: the value is true exactly when both IsLower<MT1> and
// IsLower<MT2> hold.
// NOTE(review): the struct header is elided - presumably IsLower< DMatDMatMultExpr<MT1,MT2> >.
7895 template<
typename MT1,
typename MT2 >
7897 :
public IsTrue< And< IsLower<MT1>, IsLower<MT2> >::value >
// Trait specialization fragment: the value is true exactly when both IsUniLower<MT1> and
// IsUniLower<MT2> hold.
// NOTE(review): the struct header is elided - presumably IsUniLower< DMatDMatMultExpr<MT1,MT2> >.
7913 template<
typename MT1,
typename MT2 >
7915 :
public IsTrue< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
// Trait specialization fragment: the value is true when one operand type is strictly lower and
// the other is lower, in either order.
// NOTE(review): the struct header is elided - presumably
// IsStrictlyLower< DMatDMatMultExpr<MT1,MT2> >.
7931 template<
typename MT1,
typename MT2 >
7933 :
public IsTrue< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
7934 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
// Trait specialization fragment: the value is true exactly when both IsUpper<MT1> and
// IsUpper<MT2> hold.
// NOTE(review): the struct header is elided - presumably IsUpper< DMatDMatMultExpr<MT1,MT2> >.
7950 template<
typename MT1,
typename MT2 >
7952 :
public IsTrue< And< IsUpper<MT1>, IsUpper<MT2> >::value >
// Trait specialization fragment: the value is true exactly when both IsUniUpper<MT1> and
// IsUniUpper<MT2> hold.
// NOTE(review): the struct header is elided - presumably IsUniUpper< DMatDMatMultExpr<MT1,MT2> >.
7968 template<
typename MT1,
typename MT2 >
7970 :
public IsTrue< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
// Trait specialization fragment: the value is true when one operand type is strictly upper and
// the other is upper, in either order.
// NOTE(review): the struct header is elided - presumably
// IsStrictlyUpper< DMatDMatMultExpr<MT1,MT2> >.
7986 template<
typename MT1,
typename MT2 >
7988 :
public IsTrue< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
7989 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
// Expression-trait fragment over (MT1, MT2, VT). When MT1 and MT2 are dense row-major matrices
// and VT is a dense column vector, Type is DMatDVecMultExprTrait applied to MT1 and the result
// type of DMatDVecMultExprTrait<MT2,VT>; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is elided - presumably the DMatDVecMultExprTrait specialization
// for DMatDMatMultExpr<MT1,MT2>, which would evaluate (A*B)*v as A*(B*v); confirm.
8005 template<
typename MT1,
typename MT2,
typename VT >
8010 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8011 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
8012 IsDenseVector<VT>::value && IsColumnVector<VT>::value
8013 ,
typename DMatDVecMultExprTrait< MT1, typename DMatDVecMultExprTrait<MT2,VT>::Type >::Type
8014 , INVALID_TYPE >::Type Type;
// Expression-trait fragment over (MT1, MT2, VT). When MT1 and MT2 are dense row-major matrices
// and VT is a sparse column vector, Type is DMatDVecMultExprTrait applied to MT1 and the result
// type of DMatSVecMultExprTrait<MT2,VT>; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is elided - presumably the DMatSVecMultExprTrait specialization
// for DMatDMatMultExpr<MT1,MT2>; confirm.
8023 template<
typename MT1,
typename MT2,
typename VT >
8028 typedef typename SelectType< IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8029 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value &&
8030 IsSparseVector<VT>::value && IsColumnVector<VT>::value
8031 ,
typename DMatDVecMultExprTrait< MT1, typename DMatSVecMultExprTrait<MT2,VT>::Type >::Type
8032 , INVALID_TYPE >::Type Type;
// Expression-trait fragment over (VT, MT1, MT2). When VT is a dense row vector and MT1 and MT2
// are dense row-major matrices, Type is TDVecDMatMultExprTrait applied to the result type of
// TDVecDMatMultExprTrait<VT,MT1> and MT2; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is elided - presumably the TDVecDMatMultExprTrait
// specialization for DMatDMatMultExpr<MT1,MT2>, which would evaluate v*(A*B) as (v*A)*B; confirm.
8041 template<
typename VT,
typename MT1,
typename MT2 >
8046 typedef typename SelectType< IsDenseVector<VT>::value && IsRowVector<VT>::value &&
8047 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8048 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
8049 ,
typename TDVecDMatMultExprTrait< typename TDVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8050 , INVALID_TYPE >::Type Type;
// Expression-trait fragment over (VT, MT1, MT2). When VT is a sparse row vector and MT1 and MT2
// are dense row-major matrices, Type is TDVecDMatMultExprTrait applied to the result type of
// TSVecDMatMultExprTrait<VT,MT1> and MT2; otherwise Type is INVALID_TYPE.
// NOTE(review): the struct header is elided - presumably the TSVecDMatMultExprTrait
// specialization for DMatDMatMultExpr<MT1,MT2>; confirm.
8059 template<
typename VT,
typename MT1,
typename MT2 >
8064 typedef typename SelectType< IsSparseVector<VT>::value && IsRowVector<VT>::value &&
8065 IsDenseMatrix<MT1>::value && IsRowMajorMatrix<MT1>::value &&
8066 IsDenseMatrix<MT2>::value && IsRowMajorMatrix<MT2>::value
8067 ,
typename TDVecDMatMultExprTrait< typename TSVecDMatMultExprTrait<VT,MT1>::Type, MT2 >::Type
8068 , INVALID_TYPE >::Type Type;
// Expression-trait fragment over (MT1, MT2, AF). Type is the MultExprTrait result for the
// submatrix types (same flag AF) of const MT1 and const MT2.
// NOTE(review): the struct header is elided - presumably the SubmatrixExprTrait specialization
// for DMatDMatMultExpr<MT1,MT2>, i.e. a submatrix of a product is the product of submatrices.
8077 template<
typename MT1,
typename MT2,
bool AF >
8082 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT1,AF>::Type
8083 ,
typename SubmatrixExprTrait<const MT2,AF>::Type >::Type Type;
// Expression-trait fragment over (MT1, MT2). Type is the MultExprTrait result for the row type
// of const MT1 multiplied with MT2.
// NOTE(review): the struct header is elided - presumably the RowExprTrait specialization for
// DMatDMatMultExpr<MT1,MT2>; confirm.
8092 template<
typename MT1,
typename MT2 >
8097 typedef typename MultExprTrait< typename RowExprTrait<const MT1>::Type, MT2 >::Type Type;
// Expression-trait fragment over (MT1, MT2). Type is the MultExprTrait result for MT1 multiplied
// with the column type of const MT2.
// NOTE(review): the struct header is elided - presumably the ColumnExprTrait specialization for
// DMatDMatMultExpr<MT1,MT2>; confirm.
8106 template<
typename MT1,
typename MT2 >
8111 typedef typename MultExprTrait< MT1, typename ColumnExprTrait<const MT2>::Type >::Type Type;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
const MT::ElementType max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1729
MT2::CompositeType CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:155
BLAZE_ALWAYS_INLINE EnableIf< And< IsIntegral< T >, HasSize< T, 2UL > >, simd_int16_t >::Type set(T value)
Sets all values in the vector to the given 2-byte integral value.
Definition: Set.h:73
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A=B*C$ ).
Definition: DMatDMatMultExpr.h:7820
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:105
Header file for basic type definitions.
Header file for the SparseVector base class.
RT1::ElementType ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:152
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
SelectType< IsExpression< MT2 >::value, const MT2, const MT2 & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:254
size_t rows() const
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:369
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the ColumnExprTrait class template.
DMatDMatMultExpr< MT1, MT2 > This
Type of this DMatDMatMultExpr instance.
Definition: DMatDMatMultExpr.h:241
Header file for the IsSame and IsStrictlySame type traits.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:245
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Header file for the IsRowVector type trait.
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
Header file for the And class template.
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
CompressedMatrix< Type,!SO > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:257
SelectType< IsExpression< MT1 >::value, const MT1, const MT1 & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:251
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications. The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:144
Header file for the IsUniLower type trait.
CompressedMatrix< Type, false > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: CompressedMatrix.h:2584
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
Header file for the IsComplexDouble type trait.
RT2::ElementType ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:153
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:389
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
Header file for the Or class template.
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDMatMultExpr.h:246
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:433
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
const MT::ElementType min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1682
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs)
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:282
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:443
Header file for the IsDenseMatrix type trait.
ResultType::OppositeType OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:243
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:411
Header file for the serial shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:353
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:165
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
RightOperand rightOperand() const
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:399
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:297
System settings for the BLAS mode.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:452
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:65
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the IsSparseVector type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Header file for the MatScalarMultExpr base class.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
MultTrait< RT1, RT2 >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:242
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Utility type for generic codes.
Base template for the MultTrait class.
Definition: MultTrait.h:138
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:248
Constraint on the data type.
Constraints on the storage order of matrix types.
size_t columns() const
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:379
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:244
Header file for the HasMutableDataAccess type trait.
SelectType< evaluateRight, const RT2, CT2 >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:260
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:122
Header file for the IsDenseVector type trait.
Header file for all intrinsic functionality.
SelectType< evaluateLeft, const RT1, CT1 >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:257
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:247
Header file for the IsRowMajorMatrix type trait.
const DMatTransExpr< MT,!SO > trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:944
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the TDVecDMatMultExprTrait class template.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for BLAS general matrix/matrix multiplication functions (gemm)
MT1::ResultType RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:150
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
Header file for the IsTrue value trait.
Header file for the IsComplex type trait.
Header file for the TSVecDMatMultExprTrait class template.
Header file for the complex data type.
MT1::CompositeType CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:154
Header file for the IsUpper type trait.
Header file for exception macros.
MT2::ResultType RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:151
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:453
Header file for the IsResizable type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:423
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.