35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
146 template<
typename MT1
148 class TDMatDMatMultExpr :
public DenseMatrix< TDMatDMatMultExpr<MT1,MT2>, true >
149 ,
private MatMatMultExpr
150 ,
private Computation
// Compile-time helper whose `value` is the logical OR of evaluateLeft and
// evaluateRight (both are defined outside this excerpt).
// The template parameters T1, T2 and T3 do not appear in the visible expression.
178 template<
typename T1,
typename T2,
typename T3 >
179 struct IsEvaluationRequired {
180 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate over a target type T1 and two operand types T2, T3.
// NOTE(review): the line that opens the `value` expression is not visible in this
// excerpt; only the conjunction below can be described.
190 template<
typename T1,
typename T2,
typename T3 >
191 struct UseBlasKernel {
// The target needs mutable data access, both operands need const data access.
193 HasMutableDataAccess<T1>::value &&
194 HasConstDataAccess<T2>::value &&
195 HasConstDataAccess<T3>::value &&
// Neither operand may be diagonal.
196 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
197 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All three element types must be BLAS compatible ...
198 IsBLASCompatible< ElementType_<T1> >::value &&
199 IsBLASCompatible< ElementType_<T2> >::value &&
200 IsBLASCompatible< ElementType_<T3> >::value &&
// ... and must be the same type.
201 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
202 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
// Compile-time predicate over a target type T1 and two operand types T2, T3.
// It is used below (via EnableIf_/DisableIf_) to choose between the vectorized
// and the plain small/large assignment kernels.
// NOTE(review): the line that opens the `value` expression is not visible in this
// excerpt; only the conjunction below can be described.
212 template<
typename T1,
typename T2,
typename T3 >
213 struct UseVectorizedDefaultKernel {
// Excluded: both operands diagonal, diagonal T2 with a column-major T1,
// and diagonal T3 with a row-major T1.
215 !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
216 !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
217 !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Element types must be SIMD-combinable (part of this argument list is not visible).
219 AreSIMDCombinable< ElementType_<T1>
221 , ElementType_<T3> >::value &&
// SIMD addition and multiplication must exist for the two operand element types.
222 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
223 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
255 MT1::simdEnabled && MT2::simdEnabled &&
// Compile-time flag: true only if neither operand requires evaluation
// (evaluateLeft / evaluateRight are both false) and both operand types
// report smpAssignable themselves.
260 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
261 !evaluateRight && MT2::smpAssignable };
316 :(
lhs_.columns() ) ) );
320 const size_t n(
end - begin );
// Element access with index checks.
// The visible code compares the row index i against lhs_.rows() and the column
// index j against rhs_.columns().
// NOTE(review): the statements executed when a check fails, and the final return,
// are not visible in this excerpt.
338 inline ReturnType
at(
size_t i,
size_t j )
const {
// Row index outside the left-hand operand's row count.
339 if( i >=
lhs_.rows() ) {
// Column index outside the right-hand operand's column count.
342 if( j >=
rhs_.columns() ) {
354 inline size_t rows() const noexcept {
365 return rhs_.columns();
// Returns true if either operand reports being aliased with the given address.
// Note that this delegates to isAliased() of both operands, not to canAlias().
395 template<
typename T >
396 inline bool canAlias(
const T* alias )
const noexcept {
397 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns true if the left-hand or the right-hand operand reports being aliased
// with the given address.
407 template<
typename T >
408 inline bool isAliased(
const T* alias )
const noexcept {
409 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
419 return lhs_.isAligned() &&
rhs_.isAligned();
430 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
431 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// Assignment of the product expression `rhs` to the target `lhs`.
// NOTE(review): the signature is not visible in this excerpt; presumably this is
// the assign() overload for dense targets — confirm against the full file.
454 template<
typename MT
// Target has no rows or no columns (the branch body is not visible here).
463 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// The left operand has no columns, i.e. the inner dimension of the product is zero
// (the branch body is not visible here).
466 else if( rhs.lhs_.columns() == 0UL ) {
// Both operands are wrapped in serial() and bound to the local operands A and B.
471 LT A(
serial( rhs.lhs_ ) );
472 RT B(
serial( rhs.rhs_ ) );
// Hand over to the kernel selection.
481 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Chooses the kernel for C = A * B.
// Visible logic: when C.rows() * C.columns() is below TDMATDMATMULT_THRESHOLD the
// small kernel is selected; the other visible call is the BLAS kernel.
// NOTE(review): the first part of the condition and the `else` are not visible here.
497 template<
typename MT3
500 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
503 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
504 selectSmallAssignKernel( C, A, B );
506 selectBlasAssignKernel( C, A, B );
// Default (scalar) kernel for C = A * B with a row-major target
// (DenseMatrix<MT3,false>); enabled only if neither A (MT4) nor B (MT5) is diagonal.
// M = A.rows(), N = B.columns(), K = A.columns().
// NOTE(review): several lines (fallback branches of the conditionals, loop bodies,
// closing braces) are not visible in this excerpt.
525 template<
typename MT3
528 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
529 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
531 const size_t M( A.rows() );
532 const size_t N( B.columns() );
533 const size_t K( A.columns() );
// One row of C per iteration.
535 for(
size_t i=0UL; i<M; ++i )
// k-range for row i, narrowed when A is (strictly) upper or lower triangular.
537 const size_t kbegin( ( IsUpper<MT4>::value )
538 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
540 const size_t kend( ( IsLower<MT4>::value )
541 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row is visited
// (the loop body is not visible here).
545 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
546 for(
size_t j=0UL; j<N; ++j ) {
// First k (= kbegin): column range derived from the triangular structure of B.
553 const size_t jbegin( ( IsUpper<MT5>::value )
554 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
556 const size_t jend( ( IsLower<MT5>::value )
557 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
// Columns before jbegin (loop body not visible here).
561 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
562 for(
size_t j=0UL; j<jbegin; ++j ) {
566 else if( IsStrictlyUpper<MT5>::value ) {
567 reset( (~C)(i,0UL) );
// The first product term is assigned, not accumulated.
569 for(
size_t j=jbegin; j<jend; ++j ) {
570 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards (loop body not visible here).
572 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
573 for(
size_t j=jend; j<N; ++j ) {
577 else if( IsStrictlyLower<MT5>::value ) {
578 reset( (~C)(i,N-1UL) );
// Remaining k values are accumulated into C(i,j).
582 for(
size_t k=kbegin+1UL; k<kend; ++k )
584 const size_t jbegin( ( IsUpper<MT5>::value )
585 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
587 const size_t jend( ( IsLower<MT5>::value )
588 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
592 for(
size_t j=jbegin; j<jend; ++j ) {
593 (~C)(i,j) += A(i,k) * B(k,j);
// For lower B the element at column jend is assigned rather than accumulated.
595 if( IsLower<MT5>::value ) {
596 (~C)(i,jend) = A(i,k) * B(k,jend);
// Default (scalar) kernel for C = A * B with a column-major target
// (DenseMatrix<MT3,true>); enabled only if neither A (MT4) nor B (MT5) is diagonal.
// M = A.rows(), N = B.columns(), K = A.columns().
// NOTE(review): several lines (fallback branches of the conditionals, loop bodies,
// closing braces) are not visible in this excerpt.
618 template<
typename MT3
621 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
622 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
624 const size_t M( A.rows() );
625 const size_t N( B.columns() );
626 const size_t K( A.columns() );
// One column of C per iteration.
628 for(
size_t j=0UL; j<N; ++j )
// k-range for column j, narrowed when B is (strictly) lower or upper triangular.
630 const size_t kbegin( ( IsLower<MT5>::value )
631 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
633 const size_t kend( ( IsUpper<MT5>::value )
634 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Strictly triangular B with an empty k-range: the whole column is visited
// (the loop body is not visible here).
638 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
639 for(
size_t i=0UL; i<M; ++i ) {
// First k (= kbegin): row range derived from the triangular structure of A.
646 const size_t ibegin( ( IsLower<MT4>::value )
647 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
649 const size_t iend( ( IsUpper<MT4>::value )
650 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
// Rows before ibegin (loop body not visible here).
654 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
655 for(
size_t i=0UL; i<ibegin; ++i ) {
659 else if( IsStrictlyLower<MT4>::value ) {
660 reset( (~C)(0UL,j) );
// The first product term is assigned, not accumulated.
662 for(
size_t i=ibegin; i<iend; ++i ) {
663 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend onwards (loop body not visible here).
665 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
666 for(
size_t i=iend; i<M; ++i ) {
670 else if( IsStrictlyUpper<MT4>::value ) {
671 reset( (~C)(M-1UL,j) );
// Remaining k values are accumulated into C(i,j).
675 for(
size_t k=kbegin+1UL; k<kend; ++k )
677 const size_t ibegin( ( IsLower<MT4>::value )
678 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
680 const size_t iend( ( IsUpper<MT4>::value )
681 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
685 for(
size_t i=ibegin; i<iend; ++i ) {
686 (~C)(i,j) += A(i,k) * B(k,j);
// For upper A the element at row iend is assigned rather than accumulated.
688 if( IsUpper<MT4>::value ) {
689 (~C)(iend,j) = A(iend,k) * B(k,j);
// Default kernel for a row-major target when A (MT4) is not diagonal and B (MT5)
// is diagonal. Each visible product is C(i,j) = A(i,j) * B(j,j), i.e. column j of A
// is scaled by the j-th diagonal element of B.
// The traversal is tiled in blocks of BLOCK_SIZE rows and columns.
// NOTE(review): fallback branches of the conditionals and the bodies of the loops
// outside [jbegin, jpos) are not visible in this excerpt.
711 template<
typename MT3
714 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
715 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
717 const size_t M( A.rows() );
718 const size_t N( B.columns() );
720 const size_t block( BLOCK_SIZE );
// Row tiles [ii, iend) and column tiles [jj, jend).
722 for(
size_t ii=0UL; ii<M; ii+=block ) {
723 const size_t iend(
min( M, ii+block ) );
724 for(
size_t jj=0UL; jj<N; jj+=block ) {
725 const size_t jend(
min( N, jj+block ) );
726 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current tile, clipped by the triangular
// structure of A.
728 const size_t jbegin( ( IsUpper<MT4>::value )
729 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
731 const size_t jpos( ( IsLower<MT4>::value )
732 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
735 if( IsUpper<MT4>::value ) {
736 for(
size_t j=jj; j<jbegin; ++j ) {
740 for(
size_t j=jbegin; j<jpos; ++j ) {
741 (~C)(i,j) = A(i,j) * B(j,j);
743 if( IsLower<MT4>::value ) {
744 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for a column-major target when A (MT4) is not diagonal and B (MT5)
// is diagonal. Each visible product is C(i,j) = A(i,j) * B(j,j); the traversal is
// column by column without tiling.
// NOTE(review): fallback branches of the conditionals and the bodies of the loops
// outside [ibegin, iend) are not visible in this excerpt.
769 template<
typename MT3
772 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
773 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
775 const size_t M( A.rows() );
776 const size_t N( B.columns() );
778 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, clipped by the triangular structure of A.
780 const size_t ibegin( ( IsLower<MT4>::value )
781 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
783 const size_t iend( ( IsUpper<MT4>::value )
784 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
788 if( IsLower<MT4>::value ) {
789 for(
size_t i=0UL; i<ibegin; ++i ) {
793 for(
size_t i=ibegin; i<iend; ++i ) {
794 (~C)(i,j) = A(i,j) * B(j,j);
796 if( IsUpper<MT4>::value ) {
797 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for a row-major target when A (MT4) is diagonal and B (MT5) is not.
// Each visible product is C(i,j) = A(i,i) * B(i,j), i.e. row i of B is scaled by the
// i-th diagonal element of A; the traversal is row by row without tiling.
// NOTE(review): fallback branches of the conditionals and the bodies of the loops
// outside [jbegin, jend) are not visible in this excerpt.
820 template<
typename MT3
823 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
824 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
826 const size_t M( A.rows() );
827 const size_t N( B.columns() );
829 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, clipped by the triangular structure of B.
831 const size_t jbegin( ( IsUpper<MT5>::value )
832 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
834 const size_t jend( ( IsLower<MT5>::value )
835 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
839 if( IsUpper<MT5>::value ) {
840 for(
size_t j=0UL; j<jbegin; ++j ) {
844 for(
size_t j=jbegin; j<jend; ++j ) {
845 (~C)(i,j) = A(i,i) * B(i,j);
847 if( IsLower<MT5>::value ) {
848 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for a column-major target when A (MT4) is diagonal and B (MT5) is
// not. Each visible product is C(i,j) = A(i,i) * B(i,j).
// The traversal is tiled in blocks of BLOCK_SIZE columns and rows.
// NOTE(review): fallback branches of the conditionals and the bodies of the loops
// outside [ibegin, ipos) are not visible in this excerpt.
871 template<
typename MT3
874 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
875 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
877 const size_t M( A.rows() );
878 const size_t N( B.columns() );
880 const size_t block( BLOCK_SIZE );
// Column tiles [jj, jend) and row tiles [ii, iend).
882 for(
size_t jj=0UL; jj<N; jj+=block ) {
883 const size_t jend(
min( N, jj+block ) );
884 for(
size_t ii=0UL; ii<M; ii+=block ) {
885 const size_t iend(
min( M, ii+block ) );
886 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current tile, clipped by the triangular
// structure of B.
888 const size_t ibegin( ( IsLower<MT5>::value )
889 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
891 const size_t ipos( ( IsUpper<MT5>::value )
892 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
895 if( IsLower<MT5>::value ) {
896 for(
size_t i=ii; i<ibegin; ++i ) {
900 for(
size_t i=ibegin; i<ipos; ++i ) {
901 (~C)(i,j) = A(i,i) * B(i,j);
903 if( IsUpper<MT5>::value ) {
904 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel when both A (MT4) and B (MT5) are diagonal.
// The visible loop writes only the diagonal: C(i,i) = A(i,i) * B(i,i) for every
// row index of A.
// NOTE(review): lines between the signature and the loop are not visible here, so
// how the off-diagonal elements of C are treated cannot be stated from this excerpt.
929 template<
typename MT3
932 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
933 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
937 for(
size_t i=0UL; i<A.rows(); ++i ) {
938 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not
// hold (DisableIf_): it simply forwards to selectDefaultAssignKernel().
958 template<
typename MT3
961 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
962 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
964 selectDefaultAssignKernel( ~C, A, B );
// Vectorized small-matrix kernel for C = A * B with a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel holds.
// Columns of C are processed in SIMD-wide strips of decreasing width
// (8, 4, 2, 1 vectors of SIMDSIZE elements), followed by a scalar loop for the
// columns at and beyond jpos. Within a strip, A(i,k) is broadcast with set() and
// multiplied with vectors loaded from row k of B.
// In every tier kbegin/kend narrow the k-range according to the IsUpper/IsLower
// traits of A (MT4) and B (MT5).
// NOTE(review): declarations of the loop counters, some loop heads/guards and all
// closing braces are not visible in this excerpt.
984 template<
typename MT3
987 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
988 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
990 const size_t M( A.rows() );
991 const size_t N( B.columns() );
992 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
994 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the vectorized column range; with a remainder, N is masked with
// size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two — TODO confirm).
996 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Tier 1: strips of 8 SIMD vectors, one row of C at a time.
1001 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
1002 for(
size_t i=0UL; i<M; ++i )
1004 const size_t kbegin( ( IsUpper<MT4>::value )
1005 ?( ( IsLower<MT5>::value )
1006 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1007 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1008 :( IsLower<MT5>::value ? j : 0UL ) );
1009 const size_t kend( ( IsLower<MT4>::value )
1010 ?( ( IsUpper<MT5>::value )
1011 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
1012 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
1013 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
1015 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1017 for(
size_t k=kbegin; k<kend; ++k ) {
1018 const SIMDType a1(
set( A(i,k) ) );
1019 xmm1 = xmm1 + a1 * B.load(k,j );
1020 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
1021 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
1022 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
1023 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
1024 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
1025 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
1026 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
1029 (~C).store( i, j , xmm1 );
1030 (~C).store( i, j+SIMDSIZE , xmm2 );
1031 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1032 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
1033 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
1034 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
1035 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
1036 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Tier 2: strips of 4 SIMD vectors, two rows of C at a time.
1040 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1044 for( ; (i+2UL) <= M; i+=2UL )
1046 const size_t kbegin( ( IsUpper<MT4>::value )
1047 ?( ( IsLower<MT5>::value )
1048 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1049 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1050 :( IsLower<MT5>::value ? j : 0UL ) );
1051 const size_t kend( ( IsLower<MT4>::value )
1052 ?( ( IsUpper<MT5>::value )
1053 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
1054 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1055 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
// xmm1-4 accumulate row i, xmm5-8 accumulate row i+1.
1057 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1059 for(
size_t k=kbegin; k<kend; ++k ) {
1060 const SIMDType a1(
set( A(i ,k) ) );
1061 const SIMDType a2(
set( A(i+1UL,k) ) );
1062 const SIMDType b1( B.load(k,j ) );
1063 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
1064 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
1065 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
1066 xmm1 = xmm1 + a1 * b1;
1067 xmm2 = xmm2 + a1 * b2;
1068 xmm3 = xmm3 + a1 * b3;
1069 xmm4 = xmm4 + a1 * b4;
1070 xmm5 = xmm5 + a2 * b1;
1071 xmm6 = xmm6 + a2 * b2;
1072 xmm7 = xmm7 + a2 * b3;
1073 xmm8 = xmm8 + a2 * b4;
1076 (~C).store( i , j , xmm1 );
1077 (~C).store( i , j+SIMDSIZE , xmm2 );
1078 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
1079 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
1080 (~C).store( i+1UL, j , xmm5 );
1081 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
1082 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
1083 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Single-row variant of the 4-vector strip (its guarding condition is not visible
// here; presumably it handles a leftover row — TODO confirm).
1088 const size_t kbegin( ( IsUpper<MT4>::value )
1089 ?( ( IsLower<MT5>::value )
1090 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1091 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1092 :( IsLower<MT5>::value ? j : 0UL ) );
1093 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
1095 SIMDType xmm1, xmm2, xmm3, xmm4;
1097 for(
size_t k=kbegin; k<kend; ++k ) {
1098 const SIMDType a1(
set( A(i,k) ) );
1099 xmm1 = xmm1 + a1 * B.load(k,j );
1100 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
1101 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
1102 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
1105 (~C).store( i, j , xmm1 );
1106 (~C).store( i, j+SIMDSIZE , xmm2 );
1107 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
1108 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Tier 3: strips of 2 SIMD vectors, two rows of C at a time.
1112 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1116 for( ; (i+2UL) <= M; i+=2UL )
1118 const size_t kbegin( ( IsUpper<MT4>::value )
1119 ?( ( IsLower<MT5>::value )
1120 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1121 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1122 :( IsLower<MT5>::value ? j : 0UL ) );
1123 const size_t kend( ( IsLower<MT4>::value )
1124 ?( ( IsUpper<MT5>::value )
1125 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
1126 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
1127 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
// xmm1-2 accumulate row i, xmm3-4 accumulate row i+1.
1129 SIMDType xmm1, xmm2, xmm3, xmm4;
1131 for(
size_t k=kbegin; k<kend; ++k ) {
1132 const SIMDType a1(
set( A(i ,k) ) );
1133 const SIMDType a2(
set( A(i+1UL,k) ) );
1134 const SIMDType b1( B.load(k,j ) );
1135 const SIMDType b2( B.load(k,j+SIMDSIZE) );
1136 xmm1 = xmm1 + a1 * b1;
1137 xmm2 = xmm2 + a1 * b2;
1138 xmm3 = xmm3 + a2 * b1;
1139 xmm4 = xmm4 + a2 * b2;
1142 (~C).store( i , j , xmm1 );
1143 (~C).store( i , j+SIMDSIZE, xmm2 );
1144 (~C).store( i+1UL, j , xmm3 );
1145 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Single-row variant of the 2-vector strip (guarding condition not visible here).
1150 const size_t kbegin( ( IsUpper<MT4>::value )
1151 ?( ( IsLower<MT5>::value )
1152 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1153 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1154 :( IsLower<MT5>::value ? j : 0UL ) );
1155 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
1157 SIMDType xmm1, xmm2;
1159 for(
size_t k=kbegin; k<kend; ++k ) {
1160 const SIMDType a1(
set( A(i,k) ) );
1161 xmm1 = xmm1 + a1 * B.load(k,j );
1162 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
1165 (~C).store( i, j , xmm1 );
1166 (~C).store( i, j+SIMDSIZE, xmm2 );
// Tier 4: single SIMD vector per step, two rows of C at a time.
1170 for( ; j<jpos; j+=SIMDSIZE )
1174 for( ; (i+2UL) <= M; i+=2UL )
1176 const size_t kbegin( ( IsUpper<MT4>::value )
1177 ?( ( IsLower<MT5>::value )
1178 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1179 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1180 :( IsLower<MT5>::value ? j : 0UL ) );
1181 const size_t kend( ( IsLower<MT4>::value )
1182 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1185 SIMDType xmm1, xmm2;
1187 for(
size_t k=kbegin; k<kend; ++k ) {
1188 const SIMDType b1( B.load(k,j) );
1189 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
1190 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
1193 (~C).store( i , j, xmm1 );
1194 (~C).store( i+1UL, j, xmm2 );
// Single-row variant of the 1-vector strip; here k runs up to K
// (guarding condition and the declaration of xmm1 are not visible here).
1199 const size_t kbegin( ( IsUpper<MT4>::value )
1200 ?( ( IsLower<MT5>::value )
1201 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1202 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1203 :( IsLower<MT5>::value ? j : 0UL ) );
1207 for(
size_t k=kbegin; k<K; ++k ) {
1208 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
1211 (~C).store( i, j, xmm1 );
// Scalar remainder: columns at and beyond jpos, only when `remainder` is set.
// Two rows per step accumulate into value1/value2 (declared in lines not visible here).
1215 for( ; remainder && j<N; ++j )
1219 for( ; (i+2UL) <= M; i+=2UL )
1221 const size_t kbegin( ( IsUpper<MT4>::value )
1222 ?( ( IsLower<MT5>::value )
1223 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1224 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1225 :( IsLower<MT5>::value ? j : 0UL ) );
1226 const size_t kend( ( IsLower<MT4>::value )
1227 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
1233 for(
size_t k=kbegin; k<kend; ++k ) {
1234 value1 += A(i ,k) * B(k,j);
1235 value2 += A(i+1UL,k) * B(k,j);
1238 (~C)(i ,j) = value1;
1239 (~C)(i+1UL,j) = value2;
// Single-row scalar variant (guarding condition and the store are not visible here).
1244 const size_t kbegin( ( IsUpper<MT4>::value )
1245 ?( ( IsLower<MT5>::value )
1246 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
1247 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
1248 :( IsLower<MT5>::value ? j : 0UL ) );
1252 for(
size_t k=kbegin; k<K; ++k ) {
1253 value += A(i,k) * B(k,j);
// Vectorized small-matrix kernel for C = A * B with a column-major target
// (DenseMatrix<MT3,true>), enabled when UseVectorizedDefaultKernel holds.
// Rows of C are processed in SIMD-wide strips of decreasing height
// (8, 4, 2, 1 vectors of SIMDSIZE elements), followed by a scalar loop for the
// rows at and beyond ipos. Within a strip, B(k,j) is broadcast with set() and
// multiplied with vectors loaded from column k of A.
// In every tier kbegin/kend narrow the k-range according to the IsUpper/IsLower
// traits of A (MT4) and B (MT5).
// NOTE(review): declarations of the loop counters, some loop heads/guards and all
// closing braces are not visible in this excerpt.
1278 template<
typename MT3
1281 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1282 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1284 const size_t M( A.rows() );
1285 const size_t N( B.columns() );
1286 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and A are padded.
1288 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// End of the vectorized row range; with a remainder, M is masked with
// size_t(-SIMDSIZE) (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a
// power of two — TODO confirm).
1290 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Tier 1: strips of 8 SIMD vectors, one column of C at a time.
1295 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
1296 for(
size_t j=0UL; j<N; ++j )
1298 const size_t kbegin( ( IsLower<MT5>::value )
1299 ?( ( IsUpper<MT4>::value )
1300 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1301 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1302 :( IsUpper<MT4>::value ? i : 0UL ) );
1303 const size_t kend( ( IsUpper<MT5>::value )
1304 ?( ( IsLower<MT4>::value )
1305 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
1306 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
1307 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
1309 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1311 for(
size_t k=kbegin; k<kend; ++k ) {
1312 const SIMDType b1(
set( B(k,j) ) );
1313 xmm1 = xmm1 + A.load(i ,k) * b1;
1314 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
1315 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
1316 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
1317 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
1318 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
1319 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
1320 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
1323 (~C).store( i , j, xmm1 );
1324 (~C).store( i+SIMDSIZE , j, xmm2 );
1325 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1326 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
1327 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
1328 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
1329 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
1330 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Tier 2: strips of 4 SIMD vectors, two columns of C at a time.
1334 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1338 for( ; (j+2UL) <= N; j+=2UL )
1340 const size_t kbegin( ( IsLower<MT5>::value )
1341 ?( ( IsUpper<MT4>::value )
1342 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1343 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1344 :( IsUpper<MT4>::value ? i : 0UL ) );
1345 const size_t kend( ( IsUpper<MT5>::value )
1346 ?( ( IsLower<MT4>::value )
1347 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1348 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1349 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
// xmm1-4 accumulate column j, xmm5-8 accumulate column j+1.
1351 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1353 for(
size_t k=kbegin; k<kend; ++k ) {
1354 const SIMDType a1( A.load(i ,k) );
1355 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
1356 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
1357 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
1358 const SIMDType b1(
set( B(k,j ) ) );
1359 const SIMDType b2(
set( B(k,j+1UL) ) );
1360 xmm1 = xmm1 + a1 * b1;
1361 xmm2 = xmm2 + a2 * b1;
1362 xmm3 = xmm3 + a3 * b1;
1363 xmm4 = xmm4 + a4 * b1;
1364 xmm5 = xmm5 + a1 * b2;
1365 xmm6 = xmm6 + a2 * b2;
1366 xmm7 = xmm7 + a3 * b2;
1367 xmm8 = xmm8 + a4 * b2;
1370 (~C).store( i , j , xmm1 );
1371 (~C).store( i+SIMDSIZE , j , xmm2 );
1372 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
1373 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
1374 (~C).store( i , j+1UL, xmm5 );
1375 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
1376 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
1377 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector strip (its guarding condition is not
// visible here; presumably it handles a leftover column — TODO confirm).
1382 const size_t kbegin( ( IsLower<MT5>::value )
1383 ?( ( IsUpper<MT4>::value )
1384 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1385 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1386 :( IsUpper<MT4>::value ? i : 0UL ) );
1387 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
1389 SIMDType xmm1, xmm2, xmm3, xmm4;
1391 for(
size_t k=kbegin; k<kend; ++k ) {
1392 const SIMDType b1(
set( B(k,j) ) );
1393 xmm1 = xmm1 + A.load(i ,k) * b1;
1394 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
1395 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
1396 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
1399 (~C).store( i , j, xmm1 );
1400 (~C).store( i+SIMDSIZE , j, xmm2 );
1401 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
1402 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Tier 3: strips of 2 SIMD vectors, two columns of C at a time.
1406 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1410 for( ; (j+2UL) <= N; j+=2UL )
1412 const size_t kbegin( ( IsLower<MT5>::value )
1413 ?( ( IsUpper<MT4>::value )
1414 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1415 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1416 :( IsUpper<MT4>::value ? i : 0UL ) );
1417 const size_t kend( ( IsUpper<MT5>::value )
1418 ?( ( IsLower<MT4>::value )
1419 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
1420 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
1421 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
// xmm1-2 accumulate column j, xmm3-4 accumulate column j+1.
1423 SIMDType xmm1, xmm2, xmm3, xmm4;
1425 for(
size_t k=kbegin; k<kend; ++k ) {
1426 const SIMDType a1( A.load(i ,k) );
1427 const SIMDType a2( A.load(i+SIMDSIZE,k) );
1428 const SIMDType b1(
set( B(k,j ) ) );
1429 const SIMDType b2(
set( B(k,j+1UL) ) );
1430 xmm1 = xmm1 + a1 * b1;
1431 xmm2 = xmm2 + a2 * b1;
1432 xmm3 = xmm3 + a1 * b2;
1433 xmm4 = xmm4 + a2 * b2;
1436 (~C).store( i , j , xmm1 );
1437 (~C).store( i+SIMDSIZE, j , xmm2 );
1438 (~C).store( i , j+1UL, xmm3 );
1439 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Single-column variant of the 2-vector strip (guarding condition not visible here).
1444 const size_t kbegin( ( IsLower<MT5>::value )
1445 ?( ( IsUpper<MT4>::value )
1446 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1447 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1448 :( IsUpper<MT4>::value ? i : 0UL ) );
1449 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
1451 SIMDType xmm1, xmm2;
1453 for(
size_t k=kbegin; k<kend; ++k ) {
1454 const SIMDType b1(
set( B(k,j) ) );
1455 xmm1 = xmm1 + A.load(i ,k) * b1;
1456 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
1459 (~C).store( i , j, xmm1 );
1460 (~C).store( i+SIMDSIZE, j, xmm2 );
// Tier 4: single SIMD vector per step, two columns of C at a time.
1464 for( ; i<ipos; i+=SIMDSIZE )
1468 for( ; (j+2UL) <= N; j+=2UL )
1470 const size_t kbegin( ( IsLower<MT5>::value )
1471 ?( ( IsUpper<MT4>::value )
1472 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1473 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1474 :( IsUpper<MT4>::value ? i : 0UL ) );
1475 const size_t kend( ( IsUpper<MT5>::value )
1476 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1479 SIMDType xmm1, xmm2;
1481 for(
size_t k=kbegin; k<kend; ++k ) {
1482 const SIMDType a1( A.load(i,k) );
1483 xmm1 = xmm1 + a1 *
set( B(k,j ) );
1484 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
1487 (~C).store( i, j , xmm1 );
1488 (~C).store( i, j+1UL, xmm2 );
// Single-column variant of the 1-vector strip; here k runs up to K
// (guarding condition and the declaration of xmm1 are not visible here).
1493 const size_t kbegin( ( IsLower<MT5>::value )
1494 ?( ( IsUpper<MT4>::value )
1495 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1496 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1497 :( IsUpper<MT4>::value ? i : 0UL ) );
1501 for(
size_t k=kbegin; k<K; ++k ) {
1502 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
1505 (~C).store( i, j, xmm1 );
// Scalar remainder: rows at and beyond ipos, only when `remainder` is set.
// Two columns per step accumulate into value1/value2 (declared in lines not visible here).
1509 for( ; remainder && i<M; ++i )
1513 for( ; (j+2UL) <= N; j+=2UL )
1515 const size_t kbegin( ( IsLower<MT5>::value )
1516 ?( ( IsUpper<MT4>::value )
1517 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1518 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1519 :( IsUpper<MT4>::value ? i : 0UL ) );
1520 const size_t kend( ( IsUpper<MT5>::value )
1521 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
1527 for(
size_t k=kbegin; k<kend; ++k ) {
1528 value1 += A(i,k) * B(k,j );
1529 value2 += A(i,k) * B(k,j+1UL);
1532 (~C)(i,j ) = value1;
1533 (~C)(i,j+1UL) = value2;
// Single-column scalar variant (guarding condition and the store are not visible here).
1538 const size_t kbegin( ( IsLower<MT5>::value )
1539 ?( ( IsUpper<MT4>::value )
1540 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
1541 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
1542 :( IsUpper<MT4>::value ? i : 0UL ) );
1546 for(
size_t k=kbegin; k<K; ++k ) {
1547 value += A(i,k) * B(k,j);
// Large-matrix kernel used when UseVectorizedDefaultKernel<MT3,MT4,MT5> does not
// hold (DisableIf_): it simply forwards to selectDefaultAssignKernel().
1571 template<
typename MT3
1574 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1575 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1577 selectDefaultAssignKernel( C, A, B );
// Vectorized large-matrix kernel for C = A * B with a row-major target
// (DenseMatrix<MT3,false>), enabled when UseVectorizedDefaultKernel holds.
// The computation is blocked three ways: column blocks [jj, jend), row blocks
// [ii, iend) and k blocks [kk, ktmp), using the DMATDMATMULT_DEFAULT_*BLOCK_SIZE
// constants. For every k block the current values of C are loaded, the partial
// products of that block are added, and the result is stored back.
// Inside a block, columns are handled in strips of 4, 2 and 1 SIMD vectors plus a
// scalar remainder; kbegin/kend clip the k-range to the k block and to the
// triangular structure of A (MT4) and B (MT5).
// NOTE(review): declarations of the loop counters, some guards, the body of the
// initial i/j loop pair and all closing braces are not visible in this excerpt.
1597 template<
typename MT3
1600 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1601 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
1603 const size_t M( A.rows() );
1604 const size_t N( B.columns() );
1605 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
1607 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Column blocks.
1609 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
1611 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// End of the vectorized column range of this block; the assertion below states
// that with a remainder it equals jend rounded down to a multiple of SIMDSIZE.
1613 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1614 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Row blocks.
1616 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
1618 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Pass over every element of the current C block before accumulation starts
// (the loop body is not visible here; presumably it clears the block — TODO confirm).
1620 for(
size_t i=ii; i<iend; ++i ) {
1621 for(
size_t j=jj; j<jend; ++j ) {
// k blocks.
1626 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
1628 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Strips of 4 SIMD vectors (columns j, j1, j2, j3).
1632 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1634 const size_t j1( j+SIMDSIZE );
1635 const size_t j2( j+SIMDSIZE*2UL );
1636 const size_t j3( j+SIMDSIZE*3UL );
// Two rows of C at a time.
1640 for( ; (i+2UL) <= iend; i+=2UL )
1642 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1643 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1644 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1645 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// Start from the values already stored in C (results of earlier k blocks).
1647 SIMDType xmm1( (~C).load(i ,j ) );
1648 SIMDType xmm2( (~C).load(i ,j1) );
1649 SIMDType xmm3( (~C).load(i ,j2) );
1650 SIMDType xmm4( (~C).load(i ,j3) );
1651 SIMDType xmm5( (~C).load(i+1UL,j ) );
1652 SIMDType xmm6( (~C).load(i+1UL,j1) );
1653 SIMDType xmm7( (~C).load(i+1UL,j2) );
1654 SIMDType xmm8( (~C).load(i+1UL,j3) );
1656 for(
size_t k=kbegin; k<kend; ++k ) {
1657 const SIMDType a1(
set( A(i ,k) ) );
1658 const SIMDType a2(
set( A(i+1UL,k) ) );
1659 const SIMDType b1( B.load(k,j ) );
1660 const SIMDType b2( B.load(k,j1) );
1661 const SIMDType b3( B.load(k,j2) );
1662 const SIMDType b4( B.load(k,j3) );
1663 xmm1 = xmm1 + a1 * b1;
1664 xmm2 = xmm2 + a1 * b2;
1665 xmm3 = xmm3 + a1 * b3;
1666 xmm4 = xmm4 + a1 * b4;
1667 xmm5 = xmm5 + a2 * b1;
1668 xmm6 = xmm6 + a2 * b2;
1669 xmm7 = xmm7 + a2 * b3;
1670 xmm8 = xmm8 + a2 * b4;
1673 (~C).store( i , j , xmm1 );
1674 (~C).store( i , j1, xmm2 );
1675 (~C).store( i , j2, xmm3 );
1676 (~C).store( i , j3, xmm4 );
1677 (~C).store( i+1UL, j , xmm5 );
1678 (~C).store( i+1UL, j1, xmm6 );
1679 (~C).store( i+1UL, j2, xmm7 );
1680 (~C).store( i+1UL, j3, xmm8 );
// Single-row variant of the 4-vector strip (guarding condition not visible here).
1685 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1686 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1687 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1688 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
1690 SIMDType xmm1( (~C).load(i,j ) );
1691 SIMDType xmm2( (~C).load(i,j1) );
1692 SIMDType xmm3( (~C).load(i,j2) );
1693 SIMDType xmm4( (~C).load(i,j3) );
1695 for(
size_t k=kbegin; k<kend; ++k ) {
1696 const SIMDType a1(
set( A(i,k) ) );
1697 xmm1 = xmm1 + a1 * B.load(k,j );
1698 xmm2 = xmm2 + a1 * B.load(k,j1);
1699 xmm3 = xmm3 + a1 * B.load(k,j2);
1700 xmm4 = xmm4 + a1 * B.load(k,j3);
1703 (~C).store( i, j , xmm1 );
1704 (~C).store( i, j1, xmm2 );
1705 (~C).store( i, j2, xmm3 );
1706 (~C).store( i, j3, xmm4 );
// Strips of 2 SIMD vectors (columns j, j1).
1710 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1712 const size_t j1( j+SIMDSIZE );
// Four rows of C at a time.
1716 for( ; (i+4UL) <= iend; i+=4UL )
1718 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1719 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1720 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
1721 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1723 SIMDType xmm1( (~C).load(i ,j ) );
1724 SIMDType xmm2( (~C).load(i ,j1) );
1725 SIMDType xmm3( (~C).load(i+1UL,j ) );
1726 SIMDType xmm4( (~C).load(i+1UL,j1) );
1727 SIMDType xmm5( (~C).load(i+2UL,j ) );
1728 SIMDType xmm6( (~C).load(i+2UL,j1) );
1729 SIMDType xmm7( (~C).load(i+3UL,j ) );
1730 SIMDType xmm8( (~C).load(i+3UL,j1) );
1732 for(
size_t k=kbegin; k<kend; ++k ) {
1733 const SIMDType a1(
set( A(i ,k) ) );
1734 const SIMDType a2(
set( A(i+1UL,k) ) );
1735 const SIMDType a3(
set( A(i+2UL,k) ) );
1736 const SIMDType a4(
set( A(i+3UL,k) ) );
1737 const SIMDType b1( B.load(k,j ) );
1738 const SIMDType b2( B.load(k,j1) );
1739 xmm1 = xmm1 + a1 * b1;
1740 xmm2 = xmm2 + a1 * b2;
1741 xmm3 = xmm3 + a2 * b1;
1742 xmm4 = xmm4 + a2 * b2;
1743 xmm5 = xmm5 + a3 * b1;
1744 xmm6 = xmm6 + a3 * b2;
1745 xmm7 = xmm7 + a4 * b1;
1746 xmm8 = xmm8 + a4 * b2;
1749 (~C).store( i , j , xmm1 );
1750 (~C).store( i , j1, xmm2 );
1751 (~C).store( i+1UL, j , xmm3 );
1752 (~C).store( i+1UL, j1, xmm4 );
1753 (~C).store( i+2UL, j , xmm5 );
1754 (~C).store( i+2UL, j1, xmm6 );
1755 (~C).store( i+3UL, j , xmm7 );
1756 (~C).store( i+3UL, j1, xmm8 );
// Two rows of C at a time.
1759 for( ; (i+2UL) <= iend; i+=2UL )
1761 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1762 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1763 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
1764 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1766 SIMDType xmm1( (~C).load(i ,j ) );
1767 SIMDType xmm2( (~C).load(i ,j1) );
1768 SIMDType xmm3( (~C).load(i+1UL,j ) );
1769 SIMDType xmm4( (~C).load(i+1UL,j1) );
1771 for(
size_t k=kbegin; k<kend; ++k ) {
1772 const SIMDType a1(
set( A(i ,k) ) );
1773 const SIMDType a2(
set( A(i+1UL,k) ) );
1774 const SIMDType b1( B.load(k,j ) );
1775 const SIMDType b2( B.load(k,j1) );
1776 xmm1 = xmm1 + a1 * b1;
1777 xmm2 = xmm2 + a1 * b2;
1778 xmm3 = xmm3 + a2 * b1;
1779 xmm4 = xmm4 + a2 * b2;
1782 (~C).store( i , j , xmm1 );
1783 (~C).store( i , j1, xmm2 );
1784 (~C).store( i+1UL, j , xmm3 );
1785 (~C).store( i+1UL, j1, xmm4 );
// Single-row variant of the 2-vector strip (guarding condition not visible here).
1790 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1791 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1792 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1793 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
1795 SIMDType xmm1( (~C).load(i,j ) );
1796 SIMDType xmm2( (~C).load(i,j1) );
1798 for(
size_t k=kbegin; k<kend; ++k ) {
1799 const SIMDType a1(
set( A(i,k) ) );
1800 xmm1 = xmm1 + a1 * B.load(k,j );
1801 xmm2 = xmm2 + a1 * B.load(k,j1);
1804 (~C).store( i, j , xmm1 );
1805 (~C).store( i, j1, xmm2 );
// Single SIMD vector per step, one row of C at a time.
1809 for( ; j<jpos; j+=SIMDSIZE )
1811 for(
size_t i=ii; i<iend; ++i )
1813 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1814 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1815 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1816 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
1818 SIMDType xmm1( (~C).load(i,j) );
1820 for(
size_t k=kbegin; k<kend; ++k ) {
1821 const SIMDType a1(
set( A(i,k) ) );
1822 xmm1 = xmm1 + a1 * B.load(k,j);
1825 (~C).store( i, j, xmm1 );
// Scalar remainder: columns from jpos up to jend, only when `remainder` is set.
// The running value starts from the current C(i,j) (the write-back is not visible here).
1829 for( ; remainder && j<jend; ++j )
1831 for(
size_t i=ii; i<iend; ++i )
1833 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1834 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1835 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
1836 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
1838 ElementType value( (~C)(i,j) );
1840 for(
size_t k=kbegin; k<kend; ++k ) {
1841 value += A(i,k) * B(k,j);
// Vectorized "large" assignment kernel for a target C of type DenseMatrix<MT3,true>.
// Only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is satisfied.
// The product is processed in blocks along i, j and k (block sizes taken from the
// TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE constants). Within a block, SIMD vectors
// along the row index i are loaded from C, updated with A.load(i,k) * set( B(k,j) )
// and stored back to C.
// NOTE(review): this listing omits braces and several lines (the embedded numbering
// skips), e.g. the declarations/resets of the running indices i and j.
1869 template<
typename MT3
1872 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
1873 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
1875 const size_t M( A.rows() );
1876 const size_t N( B.columns() );
1877 const size_t K( A.columns() );
// A scalar remainder loop over i is required unless both C and A are padded.
1879 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
1881 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
1883 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// ipos: end of the vectorized row range; iend rounded down to a multiple of SIMDSIZE
// when a remainder loop is needed (checked by the assertion below).
1885 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1886 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1888 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
1890 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// NOTE(review): the body of this j/i loop nest is not visible in this listing;
// presumably it clears the current block of C before accumulation — confirm.
1892 for(
size_t j=jj; j<jend; ++j ) {
1893 for(
size_t i=ii; i<iend; ++i ) {
1898 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
1900 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Row panel of four SIMD vectors (i, i1, i2, i3).
1904 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1906 const size_t i1( i+SIMDSIZE );
1907 const size_t i2( i+SIMDSIZE*2UL );
1908 const size_t i3( i+SIMDSIZE*3UL );
// Two columns of C per iteration.
1912 for( ; (j+2UL) <= jend; j+=2UL )
// kbegin/kend clip the k range to the current k block [kk,ktmp) and additionally
// to the range allowed by the upper/lower structure of A (MT4) and B (MT5).
1914 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1915 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1916 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1917 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
1919 SIMDType xmm1( (~C).load(i ,j ) );
1920 SIMDType xmm2( (~C).load(i1,j ) );
1921 SIMDType xmm3( (~C).load(i2,j ) );
1922 SIMDType xmm4( (~C).load(i3,j ) );
1923 SIMDType xmm5( (~C).load(i ,j+1UL) );
1924 SIMDType xmm6( (~C).load(i1,j+1UL) );
1925 SIMDType xmm7( (~C).load(i2,j+1UL) );
1926 SIMDType xmm8( (~C).load(i3,j+1UL) );
1928 for(
size_t k=kbegin; k<kend; ++k ) {
1929 const SIMDType a1( A.load(i ,k) );
1930 const SIMDType a2( A.load(i1,k) );
1931 const SIMDType a3( A.load(i2,k) );
1932 const SIMDType a4( A.load(i3,k) );
1933 const SIMDType b1(
set( B(k,j ) ) );
1934 const SIMDType b2(
set( B(k,j+1UL) ) );
1935 xmm1 = xmm1 + a1 * b1;
1936 xmm2 = xmm2 + a2 * b1;
1937 xmm3 = xmm3 + a3 * b1;
1938 xmm4 = xmm4 + a4 * b1;
1939 xmm5 = xmm5 + a1 * b2;
1940 xmm6 = xmm6 + a2 * b2;
1941 xmm7 = xmm7 + a3 * b2;
1942 xmm8 = xmm8 + a4 * b2;
1945 (~C).store( i , j , xmm1 );
1946 (~C).store( i1, j , xmm2 );
1947 (~C).store( i2, j , xmm3 );
1948 (~C).store( i3, j , xmm4 );
1949 (~C).store( i , j+1UL, xmm5 );
1950 (~C).store( i1, j+1UL, xmm6 );
1951 (~C).store( i2, j+1UL, xmm7 );
1952 (~C).store( i3, j+1UL, xmm8 );
// Same four-vector row panel for a single column j (the guarding condition is not
// visible in this listing).
1957 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1958 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1959 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
1960 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
1962 SIMDType xmm1( (~C).load(i ,j) );
1963 SIMDType xmm2( (~C).load(i1,j) );
1964 SIMDType xmm3( (~C).load(i2,j) );
1965 SIMDType xmm4( (~C).load(i3,j) );
1967 for(
size_t k=kbegin; k<kend; ++k ) {
1968 const SIMDType b1(
set( B(k,j) ) );
1969 xmm1 = xmm1 + A.load(i ,k) * b1;
1970 xmm2 = xmm2 + A.load(i1,k) * b1;
1971 xmm3 = xmm3 + A.load(i2,k) * b1;
1972 xmm4 = xmm4 + A.load(i3,k) * b1;
1975 (~C).store( i , j, xmm1 );
1976 (~C).store( i1, j, xmm2 );
1977 (~C).store( i2, j, xmm3 );
1978 (~C).store( i3, j, xmm4 );
// Row panel of two SIMD vectors (i, i1).
1982 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1984 const size_t i1( i+SIMDSIZE );
// Four columns of C per iteration.
1988 for( ; (j+4UL) <= jend; j+=4UL )
1990 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
1991 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
1992 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
1993 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
1995 SIMDType xmm1( (~C).load(i ,j ) );
1996 SIMDType xmm2( (~C).load(i1,j ) );
1997 SIMDType xmm3( (~C).load(i ,j+1UL) );
1998 SIMDType xmm4( (~C).load(i1,j+1UL) );
1999 SIMDType xmm5( (~C).load(i ,j+2UL) );
2000 SIMDType xmm6( (~C).load(i1,j+2UL) );
2001 SIMDType xmm7( (~C).load(i ,j+3UL) );
2002 SIMDType xmm8( (~C).load(i1,j+3UL) );
2004 for(
size_t k=kbegin; k<kend; ++k ) {
2005 const SIMDType a1( A.load(i ,k) );
2006 const SIMDType a2( A.load(i1,k) );
2007 const SIMDType b1(
set( B(k,j ) ) );
2008 const SIMDType b2(
set( B(k,j+1UL) ) );
2009 const SIMDType b3(
set( B(k,j+2UL) ) );
2010 const SIMDType b4(
set( B(k,j+3UL) ) );
2011 xmm1 = xmm1 + a1 * b1;
2012 xmm2 = xmm2 + a2 * b1;
2013 xmm3 = xmm3 + a1 * b2;
2014 xmm4 = xmm4 + a2 * b2;
2015 xmm5 = xmm5 + a1 * b3;
2016 xmm6 = xmm6 + a2 * b3;
2017 xmm7 = xmm7 + a1 * b4;
2018 xmm8 = xmm8 + a2 * b4;
2021 (~C).store( i , j , xmm1 );
2022 (~C).store( i1, j , xmm2 );
2023 (~C).store( i , j+1UL, xmm3 );
2024 (~C).store( i1, j+1UL, xmm4 );
2025 (~C).store( i , j+2UL, xmm5 );
2026 (~C).store( i1, j+2UL, xmm6 );
2027 (~C).store( i , j+3UL, xmm7 );
2028 (~C).store( i1, j+3UL, xmm8 );
// Two columns of C per iteration.
2031 for( ; (j+2UL) <= jend; j+=2UL )
2033 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2034 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2035 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2036 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
2038 SIMDType xmm1( (~C).load(i ,j ) );
2039 SIMDType xmm2( (~C).load(i1,j ) );
2040 SIMDType xmm3( (~C).load(i ,j+1UL) );
2041 SIMDType xmm4( (~C).load(i1,j+1UL) );
2043 for(
size_t k=kbegin; k<kend; ++k ) {
2044 const SIMDType a1( A.load(i ,k) );
2045 const SIMDType a2( A.load(i1,k) );
2046 const SIMDType b1(
set( B(k,j ) ) );
2047 const SIMDType b2(
set( B(k,j+1UL) ) );
2048 xmm1 = xmm1 + a1 * b1;
2049 xmm2 = xmm2 + a2 * b1;
2050 xmm3 = xmm3 + a1 * b2;
2051 xmm4 = xmm4 + a2 * b2;
2054 (~C).store( i , j , xmm1 );
2055 (~C).store( i1, j , xmm2 );
2056 (~C).store( i , j+1UL, xmm3 );
2057 (~C).store( i1, j+1UL, xmm4 );
// Same two-vector row panel for a single column j (guard not visible in this listing).
2062 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2063 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2064 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
2065 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2067 SIMDType xmm1( (~C).load(i ,j) );
2068 SIMDType xmm2( (~C).load(i1,j) );
2070 for(
size_t k=kbegin; k<kend; ++k ) {
2071 const SIMDType b1(
set( B(k,j) ) );
2072 xmm1 = xmm1 + A.load(i ,k) * b1;
2073 xmm2 = xmm2 + A.load(i1,k) * b1;
2076 (~C).store( i , j, xmm1 );
2077 (~C).store( i1, j, xmm2 );
// Row panel of one SIMD vector, visiting every column of the j block.
2081 for( ; i<ipos; i+=SIMDSIZE )
2083 for(
size_t j=jj; j<jend; ++j )
2085 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2086 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2087 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
2088 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2090 SIMDType xmm1( (~C).load(i,j) );
2092 for(
size_t k=kbegin; k<kend; ++k ) {
2093 const SIMDType b1(
set( B(k,j) ) );
2094 xmm1 = xmm1 + A.load(i,k) * b1;
2097 (~C).store( i, j, xmm1 );
// Scalar loop over the rows left between ipos and iend; only runs when 'remainder' is set.
2101 for( ; remainder && i<iend; ++i )
2103 for(
size_t j=jj; j<jend; ++j )
2105 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
2106 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
2107 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
2108 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
2110 ElementType value( (~C)(i,j) );
2112 for(
size_t k=kbegin; k<kend; ++k ) {
2113 value += A(i,k) * B(k,j);
// Fallback overload of selectBlasAssignKernel: selected when UseBlasKernel<MT3,MT4,MT5>
// is NOT satisfied (DisableIf_). It simply forwards C, A and B to selectLargeAssignKernel.
2140 template<
typename MT3
2143 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
2144 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2146 selectLargeAssignKernel( C, A, B );
2152 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based overload of selectBlasAssignKernel: selected when UseBlasKernel<MT3,MT4,MT5>
// is satisfied. A triangular left operand is handled via trmm from the left, a triangular
// right operand via trmm from the right (lower/upper chosen from the operand's type);
// otherwise gemm is called with the scalar factors ET(1) and ET(0).
// NOTE(review): the embedded numbering skips a line before each trmm call and before the
// gemm call; statements (and the final 'else') in those positions are not visible here.
2166 template<
typename MT3
2169 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
2170 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// ET: element type of the target matrix; used for the scalar factors below.
2172 typedef ElementType_<MT3> ET;
2174 if( IsTriangular<MT4>::value ) {
2176 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2178 else if( IsTriangular<MT5>::value ) {
2180 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
2183 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix.
// The expression is evaluated serially into a temporary of type TmpType
// (ResultType if SO is true, OppositeType otherwise), which is then assigned to lhs.
2203 template<
typename MT
2205 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
2209 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
2221 const TmpType tmp(
serial( rhs ) );
2222 assign( ~lhs, tmp );
// Addition assignment of the multiplication expression to a dense matrix.
// Both operands are evaluated serially into A (type LT) and B (type RT) and the
// work is handed to selectAddAssignKernel.
2240 template<
typename MT
2242 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Degenerate case: target has no rows/columns or the inner dimension is zero.
// NOTE(review): the statement executed in this branch is not visible in this listing.
2249 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2253 LT A(
serial( rhs.lhs_ ) );
2254 RT B(
serial( rhs.rhs_ ) );
2263 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for the addition assignment C += A * B.
// The small kernel is chosen when both operands are diagonal or when the number of
// elements of C is below TDMATDMATMULT_THRESHOLD.
// NOTE(review): the line between the two calls is missing from this listing (presumably
// an 'else' that routes all other cases to the BLAS kernel selection — confirm).
2279 template<
typename MT3
2282 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2284 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
2285 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2286 selectSmallAddAssignKernel( C, A, B );
2288 selectBlasAddAssignKernel( C, A, B );
// Default addition-assignment kernel for a DenseMatrix<MT3,false> target when neither
// A (MT4) nor B (MT5) is diagonal.
// For every row i and every k in the range derived from A's upper/lower structure, the
// products A(i,k) * B(k,j) are added to C(i,j) over the j range derived from B's
// structure. The j loop is unrolled by two, followed by an update at index jpos.
// NOTE(review): the non-triangular alternatives of the range conditionals and the guard
// of the trailing jpos update are not visible in this listing.
2307 template<
typename MT3
2310 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2311 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2313 const size_t M( A.rows() );
2314 const size_t N( B.columns() );
2315 const size_t K( A.columns() );
2317 for(
size_t i=0UL; i<M; ++i )
// k range for row i; strictly triangular A excludes the diagonal element.
2319 const size_t kbegin( ( IsUpper<MT4>::value )
2320 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
2322 const size_t kend( ( IsLower<MT4>::value )
2323 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
2327 for(
size_t k=kbegin; k<kend; ++k )
// j range for row k of B; strictly triangular B excludes the diagonal element.
2329 const size_t jbegin( ( IsUpper<MT5>::value )
2330 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
2332 const size_t jend( ( IsLower<MT5>::value )
2333 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos: end of the two-way unrolled part (jnum rounded down to an even count).
2337 const size_t jnum( jend - jbegin );
2338 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2340 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2341 (~C)(i,j ) += A(i,k) * B(k,j );
2342 (~C)(i,j+1UL) += A(i,k) * B(k,j+1UL);
2345 (~C)(i,jpos) += A(i,k) * B(k,jpos);
// Default addition-assignment kernel for a DenseMatrix<MT3,true> target when neither
// A (MT4) nor B (MT5) is diagonal.
// For every column j and every k in the range derived from B's lower/upper structure,
// the products A(i,k) * B(k,j) are added to C(i,j) over the i range derived from A's
// structure. The i loop is unrolled by two, followed by an update at index ipos.
// NOTE(review): the non-triangular alternatives of the range conditionals and the guard
// of the trailing ipos update are not visible in this listing.
2367 template<
typename MT3
2370 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
2371 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2373 const size_t M( A.rows() );
2374 const size_t N( B.columns() );
2375 const size_t K( A.columns() );
2377 for(
size_t j=0UL; j<N; ++j )
// k range for column j; strictly triangular B excludes the diagonal element.
2379 const size_t kbegin( ( IsLower<MT5>::value )
2380 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
2382 const size_t kend( ( IsUpper<MT5>::value )
2383 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
2387 for(
size_t k=kbegin; k<kend; ++k )
// i range for column k of A; strictly triangular A excludes the diagonal element.
2389 const size_t ibegin( ( IsLower<MT4>::value )
2390 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
2392 const size_t iend( ( IsUpper<MT4>::value )
2393 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos: end of the two-way unrolled part (inum rounded down to an even count).
2397 const size_t inum( iend - ibegin );
2398 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2400 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2401 (~C)(i ,j) += A(i ,k) * B(k,j);
2402 (~C)(i+1UL,j) += A(i+1UL,k) * B(k,j);
2405 (~C)(ipos,j) += A(ipos,k) * B(k,j);
// Default addition-assignment kernel for a DenseMatrix<MT3,false> target when A (MT4)
// is not diagonal and B (MT5) is diagonal.
// Only the diagonal of B is used: C(i,j) += A(i,j) * B(j,j). The traversal is tiled
// with square tiles of edge length BLOCK_SIZE, and the j range of each row is clipped
// both to the tile and to the range derived from A's upper/lower structure.
// NOTE(review): the non-triangular alternatives of jbegin/jpos are not visible here.
2427 template<
typename MT3
2430 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2431 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2433 const size_t M( A.rows() );
2434 const size_t N( B.columns() );
2436 const size_t block( BLOCK_SIZE );
2438 for(
size_t ii=0UL; ii<M; ii+=block ) {
2439 const size_t iend(
min( M, ii+block ) );
2440 for(
size_t jj=0UL; jj<N; jj+=block ) {
2441 const size_t jend(
min( N, jj+block ) );
2442 for(
size_t i=ii; i<iend; ++i )
2444 const size_t jbegin( ( IsUpper<MT4>::value )
2445 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
2447 const size_t jpos( ( IsLower<MT4>::value )
2448 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
2451 for(
size_t j=jbegin; j<jpos; ++j ) {
2452 (~C)(i,j) += A(i,j) * B(j,j);
// Default addition-assignment kernel for a DenseMatrix<MT3,true> target when A (MT4)
// is not diagonal and B (MT5) is diagonal.
// Only the diagonal of B is used: C(i,j) += A(i,j) * B(j,j). For each column j the i
// range is derived from A's lower/upper structure; the i loop is unrolled by two,
// followed by an update at index ipos.
// NOTE(review): the non-triangular alternatives of ibegin/iend and the guard of the
// trailing ipos update are not visible in this listing.
2475 template<
typename MT3
2478 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
2479 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2481 const size_t M( A.rows() );
2482 const size_t N( B.columns() );
2484 for(
size_t j=0UL; j<N; ++j )
2486 const size_t ibegin( ( IsLower<MT4>::value )
2487 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
2489 const size_t iend( ( IsUpper<MT4>::value )
2490 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos: end of the two-way unrolled part (inum rounded down to an even count).
2494 const size_t inum( iend - ibegin );
2495 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2497 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2498 (~C)(i ,j) += A(i ,j) * B(j,j);
2499 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j);
2502 (~C)(ipos,j) += A(ipos,j) * B(j,j);
// Default addition-assignment kernel for a DenseMatrix<MT3,false> target when A (MT4)
// is diagonal and B (MT5) is not.
// Only the diagonal of A is used: C(i,j) += A(i,i) * B(i,j). For each row i the j range
// is derived from B's upper/lower structure; the j loop is unrolled by two, followed by
// an update at index jpos.
// NOTE(review): the non-triangular alternatives of jbegin/jend and the guard of the
// trailing jpos update are not visible in this listing.
2523 template<
typename MT3
2526 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2527 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2529 const size_t M( A.rows() );
2530 const size_t N( B.columns() );
2532 for(
size_t i=0UL; i<M; ++i )
2534 const size_t jbegin( ( IsUpper<MT5>::value )
2535 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
2537 const size_t jend( ( IsLower<MT5>::value )
2538 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos: end of the two-way unrolled part (jnum rounded down to an even count).
2542 const size_t jnum( jend - jbegin );
2543 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2545 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2546 (~C)(i,j ) += A(i,i) * B(i,j );
2547 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL);
2550 (~C)(i,jpos) += A(i,i) * B(i,jpos);
// Default addition-assignment kernel for a DenseMatrix<MT3,true> target when A (MT4)
// is diagonal and B (MT5) is not.
// Only the diagonal of A is used: C(i,j) += A(i,i) * B(i,j). The traversal is tiled
// with square tiles of edge length BLOCK_SIZE, and the i range of each column is clipped
// both to the tile and to the range derived from B's lower/upper structure.
// NOTE(review): the non-triangular alternatives of ibegin/ipos are not visible here.
2571 template<
typename MT3
2574 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
2575 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2577 const size_t M( A.rows() );
2578 const size_t N( B.columns() );
2580 const size_t block( BLOCK_SIZE );
2582 for(
size_t jj=0UL; jj<N; jj+=block ) {
2583 const size_t jend(
min( N, jj+block ) );
2584 for(
size_t ii=0UL; ii<M; ii+=block ) {
2585 const size_t iend(
min( M, ii+block ) );
2586 for(
size_t j=jj; j<jend; ++j )
2588 const size_t ibegin( ( IsLower<MT5>::value )
2589 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
2591 const size_t ipos( ( IsUpper<MT5>::value )
2592 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
2595 for(
size_t i=ibegin; i<ipos; ++i ) {
2596 (~C)(i,j) += A(i,i) * B(i,j);
// Default addition-assignment kernel when both A (MT4) and B (MT5) are diagonal.
// Only diagonal elements are touched: C(i,i) += A(i,i) * B(i,i) for every row of A.
2619 template<
typename MT3
2622 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
2623 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2625 for(
size_t i=0UL; i<A.rows(); ++i ) {
2626 C(i,i) += A(i,i) * B(i,i);
// Fallback overload of selectSmallAddAssignKernel: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is NOT satisfied (DisableIf_).
// It forwards C, A and B to selectDefaultAddAssignKernel.
2646 template<
typename MT3
2649 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2650 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2652 selectDefaultAddAssignKernel( C, A, B );
// Vectorized "small" addition-assignment kernel for a DenseMatrix<MT3,false> target.
// Only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is satisfied.
// Columns of C are processed in panels of 8, 4, 2 and 1 SIMD vectors, followed by a
// scalar loop over the remaining columns. Each panel loads the current values of C,
// adds set( A(i,k) ) * B.load(k,j) over the k range and stores the result back to C.
// In all panels kbegin/kend narrow the k range according to the upper/lower structure
// of A (MT4) and B (MT5).
// NOTE(review): this listing omits braces and several lines (the embedded numbering
// skips), e.g. the declarations of the running indices i and j, the guards of the
// single-row tails and some alternatives of the range conditionals.
2672 template<
typename MT3
2675 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2676 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
2678 const size_t M( A.rows() );
2679 const size_t N( B.columns() );
2680 const size_t K( A.columns() );
// A scalar remainder loop over j is required unless both C and B are padded.
2682 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// jpos: end of the vectorized column range (N rounded down to a multiple of SIMDSIZE
// when a remainder loop is needed).
2684 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Column panel of eight SIMD vectors, one row of C at a time.
2689 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
2690 for(
size_t i=0UL; i<M; ++i )
2692 const size_t kbegin( ( IsUpper<MT4>::value )
2693 ?( ( IsLower<MT5>::value )
2694 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2695 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2696 :( IsLower<MT5>::value ? j : 0UL ) );
2697 const size_t kend( ( IsLower<MT4>::value )
2698 ?( ( IsUpper<MT5>::value )
2699 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
2700 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
2701 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
2703 SIMDType xmm1( (~C).load(i,j ) );
2704 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2705 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2706 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2707 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
2708 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
2709 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
2710 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
2712 for(
size_t k=kbegin; k<kend; ++k ) {
2713 const SIMDType a1(
set( A(i,k) ) );
2714 xmm1 = xmm1 + a1 * B.load(k,j );
2715 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
2716 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
2717 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
2718 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
2719 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
2720 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
2721 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
2724 (~C).store( i, j , xmm1 );
2725 (~C).store( i, j+SIMDSIZE , xmm2 );
2726 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2727 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
2728 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
2729 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
2730 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
2731 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Column panel of four SIMD vectors, two rows of C per iteration.
2735 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2739 for( ; (i+2UL) <= M; i+=2UL )
2741 const size_t kbegin( ( IsUpper<MT4>::value )
2742 ?( ( IsLower<MT5>::value )
2743 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2744 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2745 :( IsLower<MT5>::value ? j : 0UL ) );
2746 const size_t kend( ( IsLower<MT4>::value )
2747 ?( ( IsUpper<MT5>::value )
2748 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
2749 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2750 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
2752 SIMDType xmm1( (~C).load(i ,j ) );
2753 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
2754 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
2755 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
2756 SIMDType xmm5( (~C).load(i+1UL,j ) );
2757 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
2758 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
2759 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
2761 for(
size_t k=kbegin; k<kend; ++k ) {
2762 const SIMDType a1(
set( A(i ,k) ) );
2763 const SIMDType a2(
set( A(i+1UL,k) ) );
2764 const SIMDType b1( B.load(k,j ) );
2765 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
2766 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
2767 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
2768 xmm1 = xmm1 + a1 * b1;
2769 xmm2 = xmm2 + a1 * b2;
2770 xmm3 = xmm3 + a1 * b3;
2771 xmm4 = xmm4 + a1 * b4;
2772 xmm5 = xmm5 + a2 * b1;
2773 xmm6 = xmm6 + a2 * b2;
2774 xmm7 = xmm7 + a2 * b3;
2775 xmm8 = xmm8 + a2 * b4;
2778 (~C).store( i , j , xmm1 );
2779 (~C).store( i , j+SIMDSIZE , xmm2 );
2780 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
2781 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
2782 (~C).store( i+1UL, j , xmm5 );
2783 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
2784 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
2785 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// Same four-vector panel for a single row i (guard not visible in this listing).
2790 const size_t kbegin( ( IsUpper<MT4>::value )
2791 ?( ( IsLower<MT5>::value )
2792 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2793 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2794 :( IsLower<MT5>::value ? j : 0UL ) );
2795 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
2797 SIMDType xmm1( (~C).load(i,j ) );
2798 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
2799 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
2800 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
2802 for(
size_t k=kbegin; k<kend; ++k ) {
2803 const SIMDType a1(
set( A(i,k) ) );
2804 xmm1 = xmm1 + a1 * B.load(k,j );
2805 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
2806 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
2807 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
2810 (~C).store( i, j , xmm1 );
2811 (~C).store( i, j+SIMDSIZE , xmm2 );
2812 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
2813 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Column panel of two SIMD vectors, two rows of C per iteration.
2817 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2821 for( ; (i+2UL) <= M; i+=2UL )
2823 const size_t kbegin( ( IsUpper<MT4>::value )
2824 ?( ( IsLower<MT5>::value )
2825 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2826 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2827 :( IsLower<MT5>::value ? j : 0UL ) );
2828 const size_t kend( ( IsLower<MT4>::value )
2829 ?( ( IsUpper<MT5>::value )
2830 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
2831 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
2832 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
2834 SIMDType xmm1( (~C).load(i ,j ) );
2835 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
2836 SIMDType xmm3( (~C).load(i+1UL,j ) );
2837 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
2839 for(
size_t k=kbegin; k<kend; ++k ) {
2840 const SIMDType a1(
set( A(i ,k) ) );
2841 const SIMDType a2(
set( A(i+1UL,k) ) );
2842 const SIMDType b1( B.load(k,j ) );
2843 const SIMDType b2( B.load(k,j+SIMDSIZE) );
2844 xmm1 = xmm1 + a1 * b1;
2845 xmm2 = xmm2 + a1 * b2;
2846 xmm3 = xmm3 + a2 * b1;
2847 xmm4 = xmm4 + a2 * b2;
2850 (~C).store( i , j , xmm1 );
2851 (~C).store( i , j+SIMDSIZE, xmm2 );
2852 (~C).store( i+1UL, j , xmm3 );
2853 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// Same two-vector panel for a single row i (guard not visible in this listing).
2858 const size_t kbegin( ( IsUpper<MT4>::value )
2859 ?( ( IsLower<MT5>::value )
2860 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2861 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2862 :( IsLower<MT5>::value ? j : 0UL ) );
2863 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
2865 SIMDType xmm1( (~C).load(i,j ) );
2866 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
2868 for(
size_t k=kbegin; k<kend; ++k ) {
2869 const SIMDType a1(
set( A(i,k) ) );
2870 xmm1 = xmm1 + a1 * B.load(k,j );
2871 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
2874 (~C).store( i, j , xmm1 );
2875 (~C).store( i, j+SIMDSIZE, xmm2 );
// Column panel of one SIMD vector, two rows of C per iteration.
2879 for( ; j<jpos; j+=SIMDSIZE )
2883 for( ; (i+2UL) <= M; i+=2UL )
2885 const size_t kbegin( ( IsUpper<MT4>::value )
2886 ?( ( IsLower<MT5>::value )
2887 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2888 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2889 :( IsLower<MT5>::value ? j : 0UL ) );
2890 const size_t kend( ( IsLower<MT4>::value )
2891 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2894 SIMDType xmm1( (~C).load(i ,j) );
2895 SIMDType xmm2( (~C).load(i+1UL,j) );
2897 for(
size_t k=kbegin; k<kend; ++k ) {
2898 const SIMDType b1( B.load(k,j) );
2899 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
2900 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
2903 (~C).store( i , j, xmm1 );
2904 (~C).store( i+1UL, j, xmm2 );
// Same single-vector panel for a single row i; k runs up to K here.
2909 const size_t kbegin( ( IsUpper<MT4>::value )
2910 ?( ( IsLower<MT5>::value )
2911 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2912 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2913 :( IsLower<MT5>::value ? j : 0UL ) );
2915 SIMDType xmm1( (~C).load(i,j) );
2917 for(
size_t k=kbegin; k<K; ++k ) {
2918 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
2921 (~C).store( i, j, xmm1 );
// Scalar loop over the columns left between jpos and N; only runs when 'remainder' is set.
2925 for( ; remainder && j<N; ++j )
2929 for( ; (i+2UL) <= M; i+=2UL )
2931 const size_t kbegin( ( IsUpper<MT4>::value )
2932 ?( ( IsLower<MT5>::value )
2933 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2934 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2935 :( IsLower<MT5>::value ? j : 0UL ) );
2936 const size_t kend( ( IsLower<MT4>::value )
2937 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
2940 ElementType value1( (~C)(i ,j) );
2941 ElementType value2( (~C)(i+1UL,j) );;
2943 for(
size_t k=kbegin; k<kend; ++k ) {
2944 value1 += A(i ,k) * B(k,j);
2945 value2 += A(i+1UL,k) * B(k,j);
2948 (~C)(i ,j) = value1;
2949 (~C)(i+1UL,j) = value2;
// Scalar update for a single row i (guard and write-back not visible in this listing).
2954 const size_t kbegin( ( IsUpper<MT4>::value )
2955 ?( ( IsLower<MT5>::value )
2956 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
2957 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
2958 :( IsLower<MT5>::value ? j : 0UL ) );
2960 ElementType value( (~C)(i,j) );
2962 for(
size_t k=kbegin; k<K; ++k ) {
2963 value += A(i,k) * B(k,j);
// Vectorized "small" addition-assignment kernel for a DenseMatrix<MT3,true> target.
// Only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5> is satisfied.
// Rows of C are processed in panels of 8, 4, 2 and 1 SIMD vectors, followed by a
// scalar loop over the remaining rows. Each panel loads the current values of C,
// adds A.load(i,k) * set( B(k,j) ) over the k range and stores the result back to C.
// In all panels kbegin/kend narrow the k range according to the upper/lower structure
// of A (MT4) and B (MT5).
// NOTE(review): this listing omits braces and several lines (the embedded numbering
// skips), e.g. the declarations of the running indices i and j, the guards of the
// single-column tails and some alternatives of the range conditionals.
2988 template<
typename MT3
2991 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
2992 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
2994 const size_t M( A.rows() );
2995 const size_t N( B.columns() );
2996 const size_t K( A.columns() );
// A scalar remainder loop over i is required unless both C and A are padded.
2998 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// ipos: end of the vectorized row range (M rounded down to a multiple of SIMDSIZE
// when a remainder loop is needed).
3000 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Row panel of eight SIMD vectors, one column of C at a time.
3005 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
3006 for(
size_t j=0UL; j<N; ++j )
3008 const size_t kbegin( ( IsLower<MT5>::value )
3009 ?( ( IsUpper<MT4>::value )
3010 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3011 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3012 :( IsUpper<MT4>::value ? i : 0UL ) );
3013 const size_t kend( ( IsUpper<MT5>::value )
3014 ?( ( IsLower<MT4>::value )
3015 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
3016 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
3017 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
3019 SIMDType xmm1( (~C).load(i ,j) );
3020 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3021 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3022 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3023 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
3024 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
3025 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
3026 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
3028 for(
size_t k=kbegin; k<kend; ++k ) {
3029 const SIMDType b1(
set( B(k,j) ) );
3030 xmm1 = xmm1 + A.load(i ,k) * b1;
3031 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
3032 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
3033 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
3034 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
3035 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
3036 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
3037 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
3040 (~C).store( i , j, xmm1 );
3041 (~C).store( i+SIMDSIZE , j, xmm2 );
3042 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3043 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
3044 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
3045 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
3046 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
3047 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Row panel of four SIMD vectors, two columns of C per iteration.
3051 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3055 for( ; (j+2UL) <= N; j+=2UL )
3057 const size_t kbegin( ( IsLower<MT5>::value )
3058 ?( ( IsUpper<MT4>::value )
3059 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3060 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3061 :( IsUpper<MT4>::value ? i : 0UL ) );
3062 const size_t kend( ( IsUpper<MT5>::value )
3063 ?( ( IsLower<MT4>::value )
3064 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3065 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3066 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
3068 SIMDType xmm1( (~C).load(i ,j ) );
3069 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
3070 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
3071 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
3072 SIMDType xmm5( (~C).load(i ,j+1UL) );
3073 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
3074 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
3075 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
3077 for(
size_t k=kbegin; k<kend; ++k ) {
3078 const SIMDType a1( A.load(i ,k) );
3079 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
3080 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
3081 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
3082 const SIMDType b1(
set( B(k,j ) ) );
3083 const SIMDType b2(
set( B(k,j+1UL) ) );
3084 xmm1 = xmm1 + a1 * b1;
3085 xmm2 = xmm2 + a2 * b1;
3086 xmm3 = xmm3 + a3 * b1;
3087 xmm4 = xmm4 + a4 * b1;
3088 xmm5 = xmm5 + a1 * b2;
3089 xmm6 = xmm6 + a2 * b2;
3090 xmm7 = xmm7 + a3 * b2;
3091 xmm8 = xmm8 + a4 * b2;
3094 (~C).store( i , j , xmm1 );
3095 (~C).store( i+SIMDSIZE , j , xmm2 );
3096 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
3097 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
3098 (~C).store( i , j+1UL, xmm5 );
3099 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
3100 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
3101 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Same four-vector panel for a single column j (guard not visible in this listing).
3106 const size_t kbegin( ( IsLower<MT5>::value )
3107 ?( ( IsUpper<MT4>::value )
3108 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3109 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3110 :( IsUpper<MT4>::value ? i : 0UL ) );
3111 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
3113 SIMDType xmm1( (~C).load(i ,j) );
3114 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
3115 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
3116 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
3118 for(
size_t k=kbegin; k<kend; ++k ) {
3119 const SIMDType b1(
set( B(k,j) ) );
3120 xmm1 = xmm1 + A.load(i ,k) * b1;
3121 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
3122 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
3123 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
3126 (~C).store( i , j, xmm1 );
3127 (~C).store( i+SIMDSIZE , j, xmm2 );
3128 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
3129 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Row panel of two SIMD vectors, two columns of C per iteration.
3133 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3137 for( ; (j+2UL) <= N; j+=2UL )
3139 const size_t kbegin( ( IsLower<MT5>::value )
3140 ?( ( IsUpper<MT4>::value )
3141 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3142 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3143 :( IsUpper<MT4>::value ? i : 0UL ) );
3144 const size_t kend( ( IsUpper<MT5>::value )
3145 ?( ( IsLower<MT4>::value )
3146 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
3147 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
3148 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
3150 SIMDType xmm1( (~C).load(i ,j ) );
3151 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
3152 SIMDType xmm3( (~C).load(i ,j+1UL) );
3153 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
3155 for(
size_t k=kbegin; k<kend; ++k ) {
3156 const SIMDType a1( A.load(i ,k) );
3157 const SIMDType a2( A.load(i+SIMDSIZE,k) );
3158 const SIMDType b1(
set( B(k,j ) ) );
3159 const SIMDType b2(
set( B(k,j+1UL) ) );
3160 xmm1 = xmm1 + a1 * b1;
3161 xmm2 = xmm2 + a2 * b1;
3162 xmm3 = xmm3 + a1 * b2;
3163 xmm4 = xmm4 + a2 * b2;
3166 (~C).store( i , j , xmm1 );
3167 (~C).store( i+SIMDSIZE, j , xmm2 );
3168 (~C).store( i , j+1UL, xmm3 );
3169 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Same two-vector panel for a single column j (guard not visible in this listing).
3174 const size_t kbegin( ( IsLower<MT5>::value )
3175 ?( ( IsUpper<MT4>::value )
3176 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3177 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3178 :( IsUpper<MT4>::value ? i : 0UL ) );
3179 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
3181 SIMDType xmm1( (~C).load(i ,j) );
3182 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
3184 for(
size_t k=kbegin; k<kend; ++k ) {
3185 const SIMDType b1(
set( B(k,j) ) );
3186 xmm1 = xmm1 + A.load(i ,k) * b1;
3187 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
3190 (~C).store( i , j, xmm1 );
3191 (~C).store( i+SIMDSIZE, j, xmm2 );
// Row panel of one SIMD vector, two columns of C per iteration.
3195 for( ; i<ipos; i+=SIMDSIZE )
3199 for( ; (j+2UL) <= N; j+=2UL )
3201 const size_t kbegin( ( IsLower<MT5>::value )
3202 ?( ( IsUpper<MT4>::value )
3203 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3204 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3205 :( IsUpper<MT4>::value ? i : 0UL ) );
3206 const size_t kend( ( IsUpper<MT5>::value )
3207 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3210 SIMDType xmm1( (~C).load(i,j ) );
3211 SIMDType xmm2( (~C).load(i,j+1UL) );
3213 for(
size_t k=kbegin; k<kend; ++k ) {
3214 const SIMDType a1( A.load(i,k) );
3215 xmm1 = xmm1 + a1 *
set( B(k,j ) );
3216 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
3219 (~C).store( i, j , xmm1 );
3220 (~C).store( i, j+1UL, xmm2 );
// Same single-vector panel for a single column j; k runs up to K here.
3225 const size_t kbegin( ( IsLower<MT5>::value )
3226 ?( ( IsUpper<MT4>::value )
3227 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3228 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3229 :( IsUpper<MT4>::value ? i : 0UL ) );
3231 SIMDType xmm1( (~C).load(i,j) );
3233 for(
size_t k=kbegin; k<K; ++k ) {
3234 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
3237 (~C).store( i, j, xmm1 );
// Scalar loop over the rows left between ipos and M; only runs when 'remainder' is set.
3241 for( ; remainder && i<M; ++i )
3245 for( ; (j+2UL) <= N; j+=2UL )
3247 const size_t kbegin( ( IsLower<MT5>::value )
3248 ?( ( IsUpper<MT4>::value )
3249 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3250 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3251 :( IsUpper<MT4>::value ? i : 0UL ) );
3252 const size_t kend( ( IsUpper<MT5>::value )
3253 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
3256 ElementType value1( (~C)(i,j ) );
3257 ElementType value2( (~C)(i,j+1UL) );
3259 for(
size_t k=kbegin; k<kend; ++k ) {
3260 value1 += A(i,k) * B(k,j );
3261 value2 += A(i,k) * B(k,j+1UL);
3264 (~C)(i,j ) = value1;
3265 (~C)(i,j+1UL) = value2;
// Scalar update for a single column j (guard and write-back not visible in this listing).
3270 const size_t kbegin( ( IsLower<MT5>::value )
3271 ?( ( IsUpper<MT4>::value )
3272 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
3273 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
3274 :( IsUpper<MT4>::value ? i : 0UL ) );
3276 ElementType value( (~C)(i,j) );
3278 for(
size_t k=kbegin; k<K; ++k ) {
3279 value += A(i,k) * B(k,j);
// Fallback overload of selectLargeAddAssignKernel: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> is NOT satisfied (DisableIf_).
// It forwards C, A and B to selectDefaultAddAssignKernel.
3303 template<
typename MT3
3306 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3307 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3309 selectDefaultAddAssignKernel( C, A, B );
// Vectorized "large" add-assignment kernel for a row-major target: C is taken
// as DenseMatrix<MT3,false> and the overload is guarded by
// EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
// The product A*B is accumulated into C tile by tile: columns of C are tiled
// with DMATDMATMULT_DEFAULT_JBLOCK_SIZE, rows with
// DMATDMATMULT_DEFAULT_IBLOCK_SIZE and the inner dimension with
// DMATDMATMULT_DEFAULT_KBLOCK_SIZE.
//   C  target matrix; every touched element is loaded, updated and stored back
//   A  left operand; single elements A(i,k) are broadcast with set()
//   B  right operand; read with SIMD loads B.load(k,j)
// NOTE(review): this listing omits some original lines (braces, the MT4/MT5
// template parameters, the declarations of the loop indices i and j, and the
// conditions guarding the single-row tails) — consult the full header.
3329 template<
typename MT3
3332 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3333 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
3335 const size_t M( A.rows() );
3336 const size_t N( B.columns() );
3337 const size_t K( A.columns() );
// A scalar clean-up pass over trailing columns is needed unless both C and B
// are padded.
3339 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Tiling over the columns of C.
3341 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
3343 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// jpos bounds the SIMD column loops; with `remainder` set it is jend rounded
// down to a multiple of SIMDSIZE, which the assertion below re-checks.
3345 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3346 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Tiling over the rows of C.
3348 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
3350 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Tiling over the inner dimension; ktmp is the end of the current k tile.
3352 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
3354 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Column panels that are four SIMD vectors wide (SIMDSIZE*4 columns).
3358 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3360 const size_t j1( j+SIMDSIZE );
3361 const size_t j2( j+SIMDSIZE*2UL );
3362 const size_t j3( j+SIMDSIZE*3UL );
// Two rows at a time: a 2x4 grid of SIMD accumulators.
3366 for( ; (i+2UL) <= iend; i+=2UL )
// The k range of the tile is clipped using the IsUpper/IsLower traits of A
// and B, so only part of [kk,ktmp) is visited for triangular operands.
3368 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3369 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3370 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3371 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// The accumulators start from the current contents of C (add-assignment).
3373 SIMDType xmm1( (~C).load(i ,j ) );
3374 SIMDType xmm2( (~C).load(i ,j1) );
3375 SIMDType xmm3( (~C).load(i ,j2) );
3376 SIMDType xmm4( (~C).load(i ,j3) );
3377 SIMDType xmm5( (~C).load(i+1UL,j ) );
3378 SIMDType xmm6( (~C).load(i+1UL,j1) );
3379 SIMDType xmm7( (~C).load(i+1UL,j2) );
3380 SIMDType xmm8( (~C).load(i+1UL,j3) );
3382 for(
size_t k=kbegin; k<kend; ++k ) {
3383 const SIMDType a1(
set( A(i ,k) ) );
3384 const SIMDType a2(
set( A(i+1UL,k) ) );
3385 const SIMDType b1( B.load(k,j ) );
3386 const SIMDType b2( B.load(k,j1) );
3387 const SIMDType b3( B.load(k,j2) );
3388 const SIMDType b4( B.load(k,j3) );
3389 xmm1 = xmm1 + a1 * b1;
3390 xmm2 = xmm2 + a1 * b2;
3391 xmm3 = xmm3 + a1 * b3;
3392 xmm4 = xmm4 + a1 * b4;
3393 xmm5 = xmm5 + a2 * b1;
3394 xmm6 = xmm6 + a2 * b2;
3395 xmm7 = xmm7 + a2 * b3;
3396 xmm8 = xmm8 + a2 * b4;
3399 (~C).store( i , j , xmm1 );
3400 (~C).store( i , j1, xmm2 );
3401 (~C).store( i , j2, xmm3 );
3402 (~C).store( i , j3, xmm4 );
3403 (~C).store( i+1UL, j , xmm5 );
3404 (~C).store( i+1UL, j1, xmm6 );
3405 (~C).store( i+1UL, j2, xmm7 );
3406 (~C).store( i+1UL, j3, xmm8 );
// One row with four accumulators; presumably the leftover row when the tile
// height is odd (the guarding condition is not shown in this listing).
3411 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3412 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3413 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3414 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
3416 SIMDType xmm1( (~C).load(i,j ) );
3417 SIMDType xmm2( (~C).load(i,j1) );
3418 SIMDType xmm3( (~C).load(i,j2) );
3419 SIMDType xmm4( (~C).load(i,j3) );
3421 for(
size_t k=kbegin; k<kend; ++k ) {
3422 const SIMDType a1(
set( A(i,k) ) );
3423 xmm1 = xmm1 + a1 * B.load(k,j );
3424 xmm2 = xmm2 + a1 * B.load(k,j1);
3425 xmm3 = xmm3 + a1 * B.load(k,j2);
3426 xmm4 = xmm4 + a1 * B.load(k,j3);
3429 (~C).store( i, j , xmm1 );
3430 (~C).store( i, j1, xmm2 );
3431 (~C).store( i, j2, xmm3 );
3432 (~C).store( i, j3, xmm4 );
// Column panels that are two SIMD vectors wide (SIMDSIZE*2 columns).
3436 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3438 const size_t j1( j+SIMDSIZE );
// Four rows at a time: a 4x2 grid of SIMD accumulators.
3442 for( ; (i+4UL) <= iend; i+=4UL )
3444 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3445 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3446 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
3447 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3449 SIMDType xmm1( (~C).load(i ,j ) );
3450 SIMDType xmm2( (~C).load(i ,j1) );
3451 SIMDType xmm3( (~C).load(i+1UL,j ) );
3452 SIMDType xmm4( (~C).load(i+1UL,j1) );
3453 SIMDType xmm5( (~C).load(i+2UL,j ) );
3454 SIMDType xmm6( (~C).load(i+2UL,j1) );
3455 SIMDType xmm7( (~C).load(i+3UL,j ) );
3456 SIMDType xmm8( (~C).load(i+3UL,j1) );
3458 for(
size_t k=kbegin; k<kend; ++k ) {
3459 const SIMDType a1(
set( A(i ,k) ) );
3460 const SIMDType a2(
set( A(i+1UL,k) ) );
3461 const SIMDType a3(
set( A(i+2UL,k) ) );
3462 const SIMDType a4(
set( A(i+3UL,k) ) );
3463 const SIMDType b1( B.load(k,j ) );
3464 const SIMDType b2( B.load(k,j1) );
3465 xmm1 = xmm1 + a1 * b1;
3466 xmm2 = xmm2 + a1 * b2;
3467 xmm3 = xmm3 + a2 * b1;
3468 xmm4 = xmm4 + a2 * b2;
3469 xmm5 = xmm5 + a3 * b1;
3470 xmm6 = xmm6 + a3 * b2;
3471 xmm7 = xmm7 + a4 * b1;
3472 xmm8 = xmm8 + a4 * b2;
3475 (~C).store( i , j , xmm1 );
3476 (~C).store( i , j1, xmm2 );
3477 (~C).store( i+1UL, j , xmm3 );
3478 (~C).store( i+1UL, j1, xmm4 );
3479 (~C).store( i+2UL, j , xmm5 );
3480 (~C).store( i+2UL, j1, xmm6 );
3481 (~C).store( i+3UL, j , xmm7 );
3482 (~C).store( i+3UL, j1, xmm8 );
// Two rows at a time for the rows the four-row loop left over.
3485 for( ; (i+2UL) <= iend; i+=2UL )
3487 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3488 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3489 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
3490 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3492 SIMDType xmm1( (~C).load(i ,j ) );
3493 SIMDType xmm2( (~C).load(i ,j1) );
3494 SIMDType xmm3( (~C).load(i+1UL,j ) );
3495 SIMDType xmm4( (~C).load(i+1UL,j1) );
3497 for(
size_t k=kbegin; k<kend; ++k ) {
3498 const SIMDType a1(
set( A(i ,k) ) );
3499 const SIMDType a2(
set( A(i+1UL,k) ) );
3500 const SIMDType b1( B.load(k,j ) );
3501 const SIMDType b2( B.load(k,j1) );
3502 xmm1 = xmm1 + a1 * b1;
3503 xmm2 = xmm2 + a1 * b2;
3504 xmm3 = xmm3 + a2 * b1;
3505 xmm4 = xmm4 + a2 * b2;
3508 (~C).store( i , j , xmm1 );
3509 (~C).store( i , j1, xmm2 );
3510 (~C).store( i+1UL, j , xmm3 );
3511 (~C).store( i+1UL, j1, xmm4 );
// One row with two accumulators (guarding condition not shown in this listing).
3516 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3517 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3518 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3519 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
3521 SIMDType xmm1( (~C).load(i,j ) );
3522 SIMDType xmm2( (~C).load(i,j1) );
3524 for(
size_t k=kbegin; k<kend; ++k ) {
3525 const SIMDType a1(
set( A(i,k) ) );
3526 xmm1 = xmm1 + a1 * B.load(k,j );
3527 xmm2 = xmm2 + a1 * B.load(k,j1);
3530 (~C).store( i, j , xmm1 );
3531 (~C).store( i, j1, xmm2 );
// Remaining single SIMD vectors below jpos, processed one row at a time.
3535 for( ; j<jpos; j+=SIMDSIZE )
3537 for(
size_t i=ii; i<iend; ++i )
3539 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3540 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3541 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3542 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
3544 SIMDType xmm1( (~C).load(i,j) );
3546 for(
size_t k=kbegin; k<kend; ++k ) {
3547 const SIMDType a1(
set( A(i,k) ) );
3548 xmm1 = xmm1 + a1 * B.load(k,j);
3551 (~C).store( i, j, xmm1 );
// Scalar clean-up for the columns left over after the SIMD loops; only runs
// when `remainder` is set.
3555 for( ; remainder && j<jend; ++j )
3557 for(
size_t i=ii; i<iend; ++i )
3559 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3560 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3561 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
3562 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
3564 ElementType value( (~C)(i,j) );
3566 for(
size_t k=kbegin; k<kend; ++k ) {
3567 value += A(i,k) * B(k,j);
// Vectorized "large" add-assignment kernel for a column-major target: C is
// taken as DenseMatrix<MT3,true> and the overload is guarded by
// EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
// It mirrors the row-major variant with the roles of rows and columns swapped:
// rows of C are tiled with TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, columns with
// TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE and the inner dimension with
// TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE.
//   C  target matrix; every touched element is loaded, updated and stored back
//   A  left operand; read with SIMD loads A.load(i,k)
//   B  right operand; single elements B(k,j) are broadcast with set()
// NOTE(review): this listing omits some original lines (braces, the MT4/MT5
// template parameters, the declarations of the loop indices i and j, and the
// conditions guarding the single-column tails) — consult the full header.
3595 template<
typename MT3
3598 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
3599 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
3601 const size_t M( A.rows() );
3602 const size_t N( B.columns() );
3603 const size_t K( A.columns() );
// A scalar clean-up pass over trailing rows is needed unless both C and A
// are padded.
3605 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Tiling over the rows of C.
3607 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
3609 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// ipos bounds the SIMD row loops; with `remainder` set it is iend rounded
// down to a multiple of SIMDSIZE, which the assertion below re-checks.
3611 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3612 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Tiling over the columns of C.
3614 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
3616 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Tiling over the inner dimension; ktmp is the end of the current k tile.
3618 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
3620 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Row panels that are four SIMD vectors tall (SIMDSIZE*4 rows).
3624 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3626 const size_t i1( i+SIMDSIZE );
3627 const size_t i2( i+SIMDSIZE*2UL );
3628 const size_t i3( i+SIMDSIZE*3UL );
// Two columns at a time: a 4x2 grid of SIMD accumulators.
3632 for( ; (j+2UL) <= jend; j+=2UL )
// The k range of the tile is clipped using the IsUpper/IsLower traits of A
// and B, so only part of [kk,ktmp) is visited for triangular operands.
3634 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3635 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3636 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3637 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// The accumulators start from the current contents of C (add-assignment).
3639 SIMDType xmm1( (~C).load(i ,j ) );
3640 SIMDType xmm2( (~C).load(i1,j ) );
3641 SIMDType xmm3( (~C).load(i2,j ) );
3642 SIMDType xmm4( (~C).load(i3,j ) );
3643 SIMDType xmm5( (~C).load(i ,j+1UL) );
3644 SIMDType xmm6( (~C).load(i1,j+1UL) );
3645 SIMDType xmm7( (~C).load(i2,j+1UL) );
3646 SIMDType xmm8( (~C).load(i3,j+1UL) );
3648 for(
size_t k=kbegin; k<kend; ++k ) {
3649 const SIMDType a1( A.load(i ,k) );
3650 const SIMDType a2( A.load(i1,k) );
3651 const SIMDType a3( A.load(i2,k) );
3652 const SIMDType a4( A.load(i3,k) );
3653 const SIMDType b1(
set( B(k,j ) ) );
3654 const SIMDType b2(
set( B(k,j+1UL) ) );
3655 xmm1 = xmm1 + a1 * b1;
3656 xmm2 = xmm2 + a2 * b1;
3657 xmm3 = xmm3 + a3 * b1;
3658 xmm4 = xmm4 + a4 * b1;
3659 xmm5 = xmm5 + a1 * b2;
3660 xmm6 = xmm6 + a2 * b2;
3661 xmm7 = xmm7 + a3 * b2;
3662 xmm8 = xmm8 + a4 * b2;
3665 (~C).store( i , j , xmm1 );
3666 (~C).store( i1, j , xmm2 );
3667 (~C).store( i2, j , xmm3 );
3668 (~C).store( i3, j , xmm4 );
3669 (~C).store( i , j+1UL, xmm5 );
3670 (~C).store( i1, j+1UL, xmm6 );
3671 (~C).store( i2, j+1UL, xmm7 );
3672 (~C).store( i3, j+1UL, xmm8 );
// One column with four accumulators; presumably the leftover column when the
// tile width is odd (the guarding condition is not shown in this listing).
3677 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3678 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3679 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
3680 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3682 SIMDType xmm1( (~C).load(i ,j) );
3683 SIMDType xmm2( (~C).load(i1,j) );
3684 SIMDType xmm3( (~C).load(i2,j) );
3685 SIMDType xmm4( (~C).load(i3,j) );
3687 for(
size_t k=kbegin; k<kend; ++k ) {
3688 const SIMDType b1(
set( B(k,j) ) );
3689 xmm1 = xmm1 + A.load(i ,k) * b1;
3690 xmm2 = xmm2 + A.load(i1,k) * b1;
3691 xmm3 = xmm3 + A.load(i2,k) * b1;
3692 xmm4 = xmm4 + A.load(i3,k) * b1;
3695 (~C).store( i , j, xmm1 );
3696 (~C).store( i1, j, xmm2 );
3697 (~C).store( i2, j, xmm3 );
3698 (~C).store( i3, j, xmm4 );
// Row panels that are two SIMD vectors tall (SIMDSIZE*2 rows).
3702 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3704 const size_t i1( i+SIMDSIZE );
// Four columns at a time: a 2x4 grid of SIMD accumulators.
3708 for( ; (j+4UL) <= jend; j+=4UL )
3710 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3711 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3712 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3713 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
3715 SIMDType xmm1( (~C).load(i ,j ) );
3716 SIMDType xmm2( (~C).load(i1,j ) );
3717 SIMDType xmm3( (~C).load(i ,j+1UL) );
3718 SIMDType xmm4( (~C).load(i1,j+1UL) );
3719 SIMDType xmm5( (~C).load(i ,j+2UL) );
3720 SIMDType xmm6( (~C).load(i1,j+2UL) );
3721 SIMDType xmm7( (~C).load(i ,j+3UL) );
3722 SIMDType xmm8( (~C).load(i1,j+3UL) );
3724 for(
size_t k=kbegin; k<kend; ++k ) {
3725 const SIMDType a1( A.load(i ,k) );
3726 const SIMDType a2( A.load(i1,k) );
3727 const SIMDType b1(
set( B(k,j ) ) );
3728 const SIMDType b2(
set( B(k,j+1UL) ) );
3729 const SIMDType b3(
set( B(k,j+2UL) ) );
3730 const SIMDType b4(
set( B(k,j+3UL) ) );
3731 xmm1 = xmm1 + a1 * b1;
3732 xmm2 = xmm2 + a2 * b1;
3733 xmm3 = xmm3 + a1 * b2;
3734 xmm4 = xmm4 + a2 * b2;
3735 xmm5 = xmm5 + a1 * b3;
3736 xmm6 = xmm6 + a2 * b3;
3737 xmm7 = xmm7 + a1 * b4;
3738 xmm8 = xmm8 + a2 * b4;
3741 (~C).store( i , j , xmm1 );
3742 (~C).store( i1, j , xmm2 );
3743 (~C).store( i , j+1UL, xmm3 );
3744 (~C).store( i1, j+1UL, xmm4 );
3745 (~C).store( i , j+2UL, xmm5 );
3746 (~C).store( i1, j+2UL, xmm6 );
3747 (~C).store( i , j+3UL, xmm7 );
3748 (~C).store( i1, j+3UL, xmm8 );
// Two columns at a time for the columns the four-column loop left over.
3751 for( ; (j+2UL) <= jend; j+=2UL )
3753 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3754 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3755 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3756 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
3758 SIMDType xmm1( (~C).load(i ,j ) );
3759 SIMDType xmm2( (~C).load(i1,j ) );
3760 SIMDType xmm3( (~C).load(i ,j+1UL) );
3761 SIMDType xmm4( (~C).load(i1,j+1UL) );
3763 for(
size_t k=kbegin; k<kend; ++k ) {
3764 const SIMDType a1( A.load(i ,k) );
3765 const SIMDType a2( A.load(i1,k) );
3766 const SIMDType b1(
set( B(k,j ) ) );
3767 const SIMDType b2(
set( B(k,j+1UL) ) );
3768 xmm1 = xmm1 + a1 * b1;
3769 xmm2 = xmm2 + a2 * b1;
3770 xmm3 = xmm3 + a1 * b2;
3771 xmm4 = xmm4 + a2 * b2;
3774 (~C).store( i , j , xmm1 );
3775 (~C).store( i1, j , xmm2 );
3776 (~C).store( i , j+1UL, xmm3 );
3777 (~C).store( i1, j+1UL, xmm4 );
// One column with two accumulators (guarding condition not shown in this listing).
3782 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3783 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3784 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
3785 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3787 SIMDType xmm1( (~C).load(i ,j) );
3788 SIMDType xmm2( (~C).load(i1,j) );
3790 for(
size_t k=kbegin; k<kend; ++k ) {
3791 const SIMDType b1(
set( B(k,j) ) );
3792 xmm1 = xmm1 + A.load(i ,k) * b1;
3793 xmm2 = xmm2 + A.load(i1,k) * b1;
3796 (~C).store( i , j, xmm1 );
3797 (~C).store( i1, j, xmm2 );
// Remaining single SIMD vectors below ipos, processed one column at a time.
3801 for( ; i<ipos; i+=SIMDSIZE )
3803 for(
size_t j=jj; j<jend; ++j )
3805 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3806 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3807 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
3808 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3810 SIMDType xmm1( (~C).load(i,j) );
3812 for(
size_t k=kbegin; k<kend; ++k ) {
3813 const SIMDType b1(
set( B(k,j) ) );
3814 xmm1 = xmm1 + A.load(i,k) * b1;
3817 (~C).store( i, j, xmm1 );
// Scalar clean-up for the rows left over after the SIMD loops; only runs
// when `remainder` is set.
3821 for( ; remainder && i<iend; ++i )
3823 for(
size_t j=jj; j<jend; ++j )
3825 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
3826 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
3827 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
3828 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
3830 ElementType value( (~C)(i,j) );
3832 for(
size_t k=kbegin; k<kend; ++k ) {
3833 value += A(i,k) * B(k,j);
// Fallback overload of the BLAS add-assignment kernel, guarded by
// DisableIf_< UseBlasKernel<MT3,MT4,MT5> > (presumably selected when the BLAS
// kernel cannot be used for these operand types — confirm DisableIf_).
// It does no work of its own: C, A and B are forwarded unchanged to
// selectLargeAddAssignKernel.
3860 template<
typename MT3
3863 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
3864 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3866 selectLargeAddAssignKernel( C, A, B );
3872 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based add-assignment kernel, guarded by
// EnableIf_< UseBlasKernel<MT3,MT4,MT5> > and only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
//   C  target matrix that receives the accumulated product
//   A  left-hand side operand
//   B  right-hand side operand
// Three cases are distinguished at compile time:
//   - A triangular: a temporary of C's result type is initialized from B,
//     handed to trmm() together with A (CblasLeft), then added to C;
//   - B triangular: the temporary is initialized from A and handed to trmm()
//     together with B (CblasRight), then added to C;
//   - otherwise: a single gemm() call with both scalar factors set to ET(1)
//     (presumably the `else` branch — the keyword is not shown in this listing).
// NOTE(review): the exact semantics of the trmm()/gemm() wrappers are not
// visible here — presumably tmp = A*tmp resp. tmp = tmp*B, and C = A*B + C.
3886 template<
typename MT3
3889 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
3890 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3892 typedef ElementType_<MT3> ET;
3894 if( IsTriangular<MT4>::value ) {
3895 ResultType_<MT3> tmp(
serial( B ) );
// The lower/upper flag follows IsLower<MT4>.
3896 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3897 addAssign( C, tmp );
3899 else if( IsTriangular<MT5>::value ) {
3900 ResultType_<MT3> tmp(
serial( A ) );
// The lower/upper flag follows IsLower<MT5>.
3901 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
3902 addAssign( C, tmp );
3905 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of a TDMatDMatMultExpr to a dense matrix.
//   lhs  target dense matrix (storage order SO)
//   rhs  product expression whose operands are rhs.lhs_ and rhs.rhs_
// The operands are evaluated via serial() into A (type LT) and B (type RT) and
// the work is delegated to selectSubAssignKernel.
// NOTE(review): several original lines are missing from this listing (among
// them the statement controlled by the size check below and any assertions).
3929 template<
typename MT
3931 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Degenerate sizes: no rows, no columns, or an empty inner dimension. The
// guarded statement is not shown here — presumably an early return.
3938 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3942 LT A(
serial( rhs.lhs_ ) );
3943 RT B(
serial( rhs.rhs_ ) );
3952 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the subtraction assignment C -= A*B.
//   C  target matrix
//   A  left-hand side operand
//   B  right-hand side operand
// selectSmallSubAssignKernel is used when both operands are diagonal or when
// C.rows() * C.columns() is below TDMATDMATMULT_THRESHOLD; the
// selectBlasSubAssignKernel call is presumably the `else` branch (the keyword
// is not shown in this listing).
3968 template<
typename MT3
3971 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3973 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
3974 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3975 selectSmallSubAssignKernel( C, A, B );
3977 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>), enabled when neither A nor B is diagonal.
//   C  target matrix, updated in place with C(i,j) -= A(i,k) * B(k,j)
//   A  left-hand side operand
//   B  right-hand side operand
// Loop order is i (rows of C), k (inner dimension), j (columns of C). The k
// and j ranges depend on the IsUpper/IsLower/IsStrictly* traits of A and B.
// NOTE(review): the fallback branches of the range ternaries and the
// condition guarding the final single-column update are missing from this
// listing — consult the full header.
3996 template<
typename MT3
3999 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4000 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4002 const size_t M( A.rows() );
4003 const size_t N( B.columns() );
4004 const size_t K( A.columns() );
4006 for(
size_t i=0UL; i<M; ++i )
// k range of row i, derived from the triangular traits of A.
4008 const size_t kbegin( ( IsUpper<MT4>::value )
4009 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
4011 const size_t kend( ( IsLower<MT4>::value )
4012 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
4016 for(
size_t k=kbegin; k<kend; ++k )
// j range of row k of B, derived from the triangular traits of B.
4018 const size_t jbegin( ( IsUpper<MT5>::value )
4019 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
4021 const size_t jend( ( IsLower<MT5>::value )
4022 ?( IsStrictlyLower<MT5>::value ? k : k+1UL )
// jpos = jbegin plus jnum rounded down to an even count, so the loop below can
// handle two columns per iteration.
4026 const size_t jnum( jend - jbegin );
4027 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4029 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4030 (~C)(i,j ) -= A(i,k) * B(k,j );
4031 (~C)(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Single update at column jpos (presumably only when jnum is odd — the guard
// is not shown in this listing).
4034 (~C)(i,jpos) -= A(i,k) * B(k,jpos);
// Default (scalar) subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>), enabled when neither A nor B is diagonal.
//   C  target matrix, updated in place with C(i,j) -= A(i,k) * B(k,j)
//   A  left-hand side operand
//   B  right-hand side operand
// Loop order is j (columns of C), k (inner dimension), i (rows of C). The k
// and i ranges depend on the IsUpper/IsLower/IsStrictly* traits of B and A.
// NOTE(review): the fallback branches of the range ternaries and the
// condition guarding the final single-row update are missing from this
// listing — consult the full header.
4056 template<
typename MT3
4059 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
4060 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4062 const size_t M( A.rows() );
4063 const size_t N( B.columns() );
4064 const size_t K( A.columns() );
4066 for(
size_t j=0UL; j<N; ++j )
// k range of column j, derived from the triangular traits of B.
4068 const size_t kbegin( ( IsLower<MT5>::value )
4069 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
4071 const size_t kend( ( IsUpper<MT5>::value )
4072 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
4076 for(
size_t k=kbegin; k<kend; ++k )
// i range of column k of A, derived from the triangular traits of A.
4078 const size_t ibegin( ( IsLower<MT4>::value )
4079 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
4081 const size_t iend( ( IsUpper<MT4>::value )
4082 ?( IsStrictlyUpper<MT4>::value ? k : k+1UL )
// ipos = ibegin plus inum rounded down to an even count, so the loop below can
// handle two rows per iteration.
4086 const size_t inum( iend - ibegin );
4087 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4089 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4090 (~C)(i ,j) -= A(i ,k) * B(k,j);
4091 (~C)(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Single update at row ipos (presumably only when inum is odd — the guard is
// not shown in this listing).
4094 (~C)(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>) when B is diagonal and A is not.
//   C  target matrix, updated in place with C(i,j) -= A(i,j) * B(j,j)
//   A  left-hand side operand (non-diagonal)
//   B  right-hand side operand (diagonal; only B(j,j) is read)
// C is traversed in square tiles of edge length BLOCK_SIZE; within each tile
// the column range of row i is narrowed by the triangular traits of A.
// NOTE(review): the fallback branches of the jbegin/jpos ternaries are
// missing from this listing — consult the full header.
4116 template<
typename MT3
4119 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4120 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4122 const size_t M( A.rows() );
4123 const size_t N( B.columns() );
4125 const size_t block( BLOCK_SIZE );
// Tiles over rows (ii) and columns (jj), each clipped to the matrix bounds.
4127 for(
size_t ii=0UL; ii<M; ii+=block ) {
4128 const size_t iend(
min( M, ii+block ) );
4129 for(
size_t jj=0UL; jj<N; jj+=block ) {
4130 const size_t jend(
min( N, jj+block ) );
4131 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the tile, derived from the traits of A.
4133 const size_t jbegin( ( IsUpper<MT4>::value )
4134 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
4136 const size_t jpos( ( IsLower<MT4>::value )
4137 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
4140 for(
size_t j=jbegin; j<jpos; ++j ) {
4141 (~C)(i,j) -= A(i,j) * B(j,j);
// Default subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>) when B is diagonal and A is not.
//   C  target matrix, updated in place with C(i,j) -= A(i,j) * B(j,j)
//   A  left-hand side operand (non-diagonal)
//   B  right-hand side operand (diagonal; only B(j,j) is read)
// For each column j the row range is narrowed by the triangular traits of A
// and processed two rows per iteration.
// NOTE(review): the fallback branches of the ibegin/iend ternaries and the
// condition guarding the final single-row update are missing from this
// listing — consult the full header.
4164 template<
typename MT3
4167 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
4168 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4170 const size_t M( A.rows() );
4171 const size_t N( B.columns() );
4173 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, derived from the triangular traits of A.
4175 const size_t ibegin( ( IsLower<MT4>::value )
4176 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
4178 const size_t iend( ( IsUpper<MT4>::value )
4179 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus inum rounded down to an even count.
4183 const size_t inum( iend - ibegin );
4184 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4186 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4187 (~C)(i ,j) -= A(i ,j) * B(j,j);
4188 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Single update at row ipos (presumably only when inum is odd — the guard is
// not shown in this listing).
4191 (~C)(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel for a row-major target
// (DenseMatrix<MT3,false>) when A is diagonal and B is not.
//   C  target matrix, updated in place with C(i,j) -= A(i,i) * B(i,j)
//   A  left-hand side operand (diagonal; only A(i,i) is read)
//   B  right-hand side operand (non-diagonal)
// For each row i the column range is narrowed by the triangular traits of B
// and processed two columns per iteration.
// NOTE(review): the fallback branches of the jbegin/jend ternaries and the
// condition guarding the final single-column update are missing from this
// listing — consult the full header.
4212 template<
typename MT3
4215 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4216 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4218 const size_t M( A.rows() );
4219 const size_t N( B.columns() );
4221 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, derived from the triangular traits of B.
4223 const size_t jbegin( ( IsUpper<MT5>::value )
4224 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
4226 const size_t jend( ( IsLower<MT5>::value )
4227 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin plus jnum rounded down to an even count.
4231 const size_t jnum( jend - jbegin );
4232 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4234 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4235 (~C)(i,j ) -= A(i,i) * B(i,j );
4236 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Single update at column jpos (presumably only when jnum is odd — the guard
// is not shown in this listing).
4239 (~C)(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel for a column-major target
// (DenseMatrix<MT3,true>) when A is diagonal and B is not.
//   C  target matrix, updated in place with C(i,j) -= A(i,i) * B(i,j)
//   A  left-hand side operand (diagonal; only A(i,i) is read)
//   B  right-hand side operand (non-diagonal)
// C is traversed in square tiles of edge length BLOCK_SIZE; within each tile
// the row range of column j is narrowed by the triangular traits of B.
// NOTE(review): the fallback branches of the ibegin/ipos ternaries are
// missing from this listing — consult the full header.
4260 template<
typename MT3
4263 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
4264 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
4266 const size_t M( A.rows() );
4267 const size_t N( B.columns() );
4269 const size_t block( BLOCK_SIZE );
// Tiles over columns (jj) and rows (ii), each clipped to the matrix bounds.
4271 for(
size_t jj=0UL; jj<N; jj+=block ) {
4272 const size_t jend(
min( N, jj+block ) );
4273 for(
size_t ii=0UL; ii<M; ii+=block ) {
4274 const size_t iend(
min( M, ii+block ) );
4275 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the tile, derived from the traits of B.
4277 const size_t ibegin( ( IsLower<MT5>::value )
4278 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
4280 const size_t ipos( ( IsUpper<MT5>::value )
4281 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
4284 for(
size_t i=ibegin; i<ipos; ++i ) {
4285 (~C)(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel for the case that both A and B are
// diagonal. Only the diagonal of C is touched:
// C(i,i) -= A(i,i) * B(i,i) for every i in [0, A.rows()).
//   C  target matrix (taken as MT3&, so either storage order matches)
//   A  left-hand side diagonal operand
//   B  right-hand side diagonal operand
4308 template<
typename MT3
4311 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
4312 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4314 for(
size_t i=0UL; i<A.rows(); ++i ) {
4315 C(i,i) -= A(i,i) * B(i,i);
// Fallback overload of the "small" subtraction-assignment kernel, guarded by
// DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> > (presumably selected
// when the vectorized default kernel is not applicable — confirm DisableIf_).
// It does no work of its own: C, A and B are forwarded unchanged to
// selectDefaultSubAssignKernel.
4335 template<
typename MT3
4338 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4339 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4341 selectDefaultSubAssignKernel( C, A, B );
// Vectorized "small" subtraction-assignment kernel for a row-major target: C
// is taken as DenseMatrix<MT3,false> and the overload is guarded by
// EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >.
// Unlike the "large" kernels there is no cache tiling: the columns of C are
// swept in panels of 8, 4, 2 and 1 SIMD vectors, followed by a scalar pass
// over the leftover columns.
//   C  target matrix; every touched element is loaded, updated and stored back
//   A  left operand; single elements A(i,k) are broadcast with set()
//   B  right operand; read with SIMD loads B.load(k,j)
// In every section the k range is narrowed using the
// IsUpper/IsLower/IsStrictly* traits of A and B.
// NOTE(review): this listing omits some original lines (braces, the MT4/MT5
// template parameters, the declarations of the loop indices i and j, some
// ternary fallback branches, and the conditions guarding the single-row
// tails) — consult the full header.
4361 template<
typename MT3
4364 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4365 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
4367 const size_t M( A.rows() );
4368 const size_t N( B.columns() );
4369 const size_t K( A.columns() );
// A scalar clean-up pass over trailing columns is needed unless both C and B
// are padded.
4371 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// jpos bounds the SIMD column loops: N masked with size_t(-SIMDSIZE) when
// `remainder` is set, N otherwise.
4373 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Panels of eight SIMD vectors (SIMDSIZE*8 columns), one row at a time.
4378 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
4379 for(
size_t i=0UL; i<M; ++i )
4381 const size_t kbegin( ( IsUpper<MT4>::value )
4382 ?( ( IsLower<MT5>::value )
4383 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4384 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4385 :( IsLower<MT5>::value ? j : 0UL ) );
4386 const size_t kend( ( IsLower<MT4>::value )
4387 ?( ( IsUpper<MT5>::value )
4388 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
4389 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
4390 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
// The accumulators start from the current contents of C; the products are
// subtracted from them.
4392 SIMDType xmm1( (~C).load(i,j ) );
4393 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4394 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4395 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4396 SIMDType xmm5( (~C).load(i,j+SIMDSIZE*4UL) );
4397 SIMDType xmm6( (~C).load(i,j+SIMDSIZE*5UL) );
4398 SIMDType xmm7( (~C).load(i,j+SIMDSIZE*6UL) );
4399 SIMDType xmm8( (~C).load(i,j+SIMDSIZE*7UL) );
4401 for(
size_t k=kbegin; k<kend; ++k ) {
4402 const SIMDType a1(
set( A(i,k) ) );
4403 xmm1 = xmm1 - a1 * B.load(k,j );
4404 xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
4405 xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
4406 xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
4407 xmm5 = xmm5 - a1 * B.load(k,j+SIMDSIZE*4UL);
4408 xmm6 = xmm6 - a1 * B.load(k,j+SIMDSIZE*5UL);
4409 xmm7 = xmm7 - a1 * B.load(k,j+SIMDSIZE*6UL);
4410 xmm8 = xmm8 - a1 * B.load(k,j+SIMDSIZE*7UL);
4413 (~C).store( i, j , xmm1 );
4414 (~C).store( i, j+SIMDSIZE , xmm2 );
4415 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4416 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
4417 (~C).store( i, j+SIMDSIZE*4UL, xmm5 );
4418 (~C).store( i, j+SIMDSIZE*5UL, xmm6 );
4419 (~C).store( i, j+SIMDSIZE*6UL, xmm7 );
4420 (~C).store( i, j+SIMDSIZE*7UL, xmm8 );
// Panels of four SIMD vectors (SIMDSIZE*4 columns).
4424 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Two rows at a time: a 2x4 grid of SIMD accumulators.
4428 for( ; (i+2UL) <= M; i+=2UL )
4430 const size_t kbegin( ( IsUpper<MT4>::value )
4431 ?( ( IsLower<MT5>::value )
4432 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4433 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4434 :( IsLower<MT5>::value ? j : 0UL ) );
4435 const size_t kend( ( IsLower<MT4>::value )
4436 ?( ( IsUpper<MT5>::value )
4437 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
4438 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4439 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
4441 SIMDType xmm1( (~C).load(i ,j ) );
4442 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE ) );
4443 SIMDType xmm3( (~C).load(i ,j+SIMDSIZE*2UL) );
4444 SIMDType xmm4( (~C).load(i ,j+SIMDSIZE*3UL) );
4445 SIMDType xmm5( (~C).load(i+1UL,j ) );
4446 SIMDType xmm6( (~C).load(i+1UL,j+SIMDSIZE ) );
4447 SIMDType xmm7( (~C).load(i+1UL,j+SIMDSIZE*2UL) );
4448 SIMDType xmm8( (~C).load(i+1UL,j+SIMDSIZE*3UL) );
4450 for(
size_t k=kbegin; k<kend; ++k ) {
4451 const SIMDType a1(
set( A(i ,k) ) );
4452 const SIMDType a2(
set( A(i+1UL,k) ) );
4453 const SIMDType b1( B.load(k,j ) );
4454 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
4455 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
4456 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
4457 xmm1 = xmm1 - a1 * b1;
4458 xmm2 = xmm2 - a1 * b2;
4459 xmm3 = xmm3 - a1 * b3;
4460 xmm4 = xmm4 - a1 * b4;
4461 xmm5 = xmm5 - a2 * b1;
4462 xmm6 = xmm6 - a2 * b2;
4463 xmm7 = xmm7 - a2 * b3;
4464 xmm8 = xmm8 - a2 * b4;
4467 (~C).store( i , j , xmm1 );
4468 (~C).store( i , j+SIMDSIZE , xmm2 );
4469 (~C).store( i , j+SIMDSIZE*2UL, xmm3 );
4470 (~C).store( i , j+SIMDSIZE*3UL, xmm4 );
4471 (~C).store( i+1UL, j , xmm5 );
4472 (~C).store( i+1UL, j+SIMDSIZE , xmm6 );
4473 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 );
4474 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 );
// One row with four accumulators; presumably the leftover row when M is odd
// (the guarding condition is not shown in this listing). Here kend no longer
// depends on IsLower<MT4>.
4479 const size_t kbegin( ( IsUpper<MT4>::value )
4480 ?( ( IsLower<MT5>::value )
4481 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4482 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4483 :( IsLower<MT5>::value ? j : 0UL ) );
4484 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
4486 SIMDType xmm1( (~C).load(i,j ) );
4487 SIMDType xmm2( (~C).load(i,j+SIMDSIZE ) );
4488 SIMDType xmm3( (~C).load(i,j+SIMDSIZE*2UL) );
4489 SIMDType xmm4( (~C).load(i,j+SIMDSIZE*3UL) );
4491 for(
size_t k=kbegin; k<kend; ++k ) {
4492 const SIMDType a1(
set( A(i,k) ) );
4493 xmm1 = xmm1 - a1 * B.load(k,j );
4494 xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE );
4495 xmm3 = xmm3 - a1 * B.load(k,j+SIMDSIZE*2UL);
4496 xmm4 = xmm4 - a1 * B.load(k,j+SIMDSIZE*3UL);
4499 (~C).store( i, j , xmm1 );
4500 (~C).store( i, j+SIMDSIZE , xmm2 );
4501 (~C).store( i, j+SIMDSIZE*2UL, xmm3 );
4502 (~C).store( i, j+SIMDSIZE*3UL, xmm4 );
// Panels of two SIMD vectors (SIMDSIZE*2 columns).
4506 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Two rows at a time: a 2x2 grid of SIMD accumulators.
4510 for( ; (i+2UL) <= M; i+=2UL )
4512 const size_t kbegin( ( IsUpper<MT4>::value )
4513 ?( ( IsLower<MT5>::value )
4514 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4515 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4516 :( IsLower<MT5>::value ? j : 0UL ) );
4517 const size_t kend( ( IsLower<MT4>::value )
4518 ?( ( IsUpper<MT5>::value )
4519 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
4520 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
4521 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
4523 SIMDType xmm1( (~C).load(i ,j ) );
4524 SIMDType xmm2( (~C).load(i ,j+SIMDSIZE) );
4525 SIMDType xmm3( (~C).load(i+1UL,j ) );
4526 SIMDType xmm4( (~C).load(i+1UL,j+SIMDSIZE) );
4528 for(
size_t k=kbegin; k<kend; ++k ) {
4529 const SIMDType a1(
set( A(i ,k) ) );
4530 const SIMDType a2(
set( A(i+1UL,k) ) );
4531 const SIMDType b1( B.load(k,j ) );
4532 const SIMDType b2( B.load(k,j+SIMDSIZE) );
4533 xmm1 = xmm1 - a1 * b1;
4534 xmm2 = xmm2 - a1 * b2;
4535 xmm3 = xmm3 - a2 * b1;
4536 xmm4 = xmm4 - a2 * b2;
4539 (~C).store( i , j , xmm1 );
4540 (~C).store( i , j+SIMDSIZE, xmm2 );
4541 (~C).store( i+1UL, j , xmm3 );
4542 (~C).store( i+1UL, j+SIMDSIZE, xmm4 );
// One row with two accumulators (guarding condition not shown in this listing).
4547 const size_t kbegin( ( IsUpper<MT4>::value )
4548 ?( ( IsLower<MT5>::value )
4549 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4550 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4551 :( IsLower<MT5>::value ? j : 0UL ) );
4552 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
4554 SIMDType xmm1( (~C).load(i,j ) );
4555 SIMDType xmm2( (~C).load(i,j+SIMDSIZE) );
4557 for(
size_t k=kbegin; k<kend; ++k ) {
4558 const SIMDType a1(
set( A(i,k) ) );
4559 xmm1 = xmm1 - a1 * B.load(k,j );
4560 xmm2 = xmm2 - a1 * B.load(k,j+SIMDSIZE);
4563 (~C).store( i, j , xmm1 );
4564 (~C).store( i, j+SIMDSIZE, xmm2 );
// Remaining single SIMD vectors below jpos.
4568 for( ; j<jpos; j+=SIMDSIZE )
// Two rows at a time, one accumulator per row.
4572 for( ; (i+2UL) <= M; i+=2UL )
4574 const size_t kbegin( ( IsUpper<MT4>::value )
4575 ?( ( IsLower<MT5>::value )
4576 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4577 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4578 :( IsLower<MT5>::value ? j : 0UL ) );
4579 const size_t kend( ( IsLower<MT4>::value )
4580 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4583 SIMDType xmm1( (~C).load(i ,j) );
4584 SIMDType xmm2( (~C).load(i+1UL,j) );
4586 for(
size_t k=kbegin; k<kend; ++k ) {
4587 const SIMDType b1( B.load(k,j) );
4588 xmm1 = xmm1 -
set( A(i ,k) ) * b1;
4589 xmm2 = xmm2 -
set( A(i+1UL,k) ) * b1;
4592 (~C).store( i , j, xmm1 );
4593 (~C).store( i+1UL, j, xmm2 );
// One row with a single accumulator; the k loop runs up to K here.
4598 const size_t kbegin( ( IsUpper<MT4>::value )
4599 ?( ( IsLower<MT5>::value )
4600 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4601 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4602 :( IsLower<MT5>::value ? j : 0UL ) );
4604 SIMDType xmm1( (~C).load(i,j) );
4606 for(
size_t k=kbegin; k<K; ++k ) {
4607 xmm1 = xmm1 -
set( A(i,k) ) * B.load(k,j);
4610 (~C).store( i, j, xmm1 );
// Scalar clean-up for the columns left over after the SIMD loops; only runs
// when `remainder` is set.
4614 for( ; remainder && j<N; ++j )
// Two rows at a time with scalar accumulators.
4618 for( ; (i+2UL) <= M; i+=2UL )
4620 const size_t kbegin( ( IsUpper<MT4>::value )
4621 ?( ( IsLower<MT5>::value )
4622 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4623 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4624 :( IsLower<MT5>::value ? j : 0UL ) );
4625 const size_t kend( ( IsLower<MT4>::value )
4626 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
4629 ElementType value1( (~C)(i ,j) );
4630 ElementType value2( (~C)(i+1UL,j) );
4632 for(
size_t k=kbegin; k<kend; ++k ) {
4633 value1 -= A(i ,k) * B(k,j);
4634 value2 -= A(i+1UL,k) * B(k,j);
4637 (~C)(i ,j) = value1;
4638 (~C)(i+1UL,j) = value2;
// One row with a scalar accumulator; the k loop runs up to K here.
4643 const size_t kbegin( ( IsUpper<MT4>::value )
4644 ?( ( IsLower<MT5>::value )
4645 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
4646 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
4647 :( IsLower<MT5>::value ? j : 0UL ) );
4649 ElementType value( (~C)(i,j) );
4651 for(
size_t k=kbegin; k<K; ++k ) {
4652 value -= A(i,k) * B(k,j);
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B, selected for
// targets passed as DenseMatrix<MT3,true> when UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//   C : target; accessed with load()/store() along the row index i, element-wise in the tail
//   A : left operand; read with A.load(i,k) (SIMD) or A(i,k) (scalar tail)
//   B : right operand; read element-wise as B(k,j) and broadcast with set()
// NOTE(review): this listing omits several original lines (braces, some loop headers, parts
// of multi-line expressions); the comments describe only the statements that are visible.
4677 template<
typename MT3
4680 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4681 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
4683 const size_t M( A.rows() );
4684 const size_t N( B.columns() );
4685 const size_t K( A.columns() );
// A scalar tail loop is needed unless both C (MT3) and A (MT4) are padded.
4687 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// ipos: M masked with size_t(-SIMDSIZE) when the tail loop is needed, otherwise M.
4689 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Rows in chunks of 8 SIMD vectors, one column j per iteration.
4694 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
4695 for(
size_t j=0UL; j<N; ++j )
// The k range is narrowed at compile time by the triangular structure of A (MT4) and B (MT5).
4697 const size_t kbegin( ( IsLower<MT5>::value )
4698 ?( ( IsUpper<MT4>::value )
4699 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4700 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4701 :( IsUpper<MT4>::value ? i : 0UL ) );
4702 const size_t kend( ( IsUpper<MT5>::value )
4703 ?( ( IsLower<MT4>::value )
4704 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
4705 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
4706 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
// Start the accumulators from the current contents of C.
4708 SIMDType xmm1( (~C).load(i ,j) );
4709 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4710 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4711 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4712 SIMDType xmm5( (~C).load(i+SIMDSIZE*4UL,j) );
4713 SIMDType xmm6( (~C).load(i+SIMDSIZE*5UL,j) );
4714 SIMDType xmm7( (~C).load(i+SIMDSIZE*6UL,j) );
4715 SIMDType xmm8( (~C).load(i+SIMDSIZE*7UL,j) );
4717 for(
size_t k=kbegin; k<kend; ++k ) {
4718 const SIMDType b1(
set( B(k,j) ) );
4719 xmm1 = xmm1 - A.load(i ,k) * b1;
4720 xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
4721 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
4722 xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
4723 xmm5 = xmm5 - A.load(i+SIMDSIZE*4UL,k) * b1;
4724 xmm6 = xmm6 - A.load(i+SIMDSIZE*5UL,k) * b1;
4725 xmm7 = xmm7 - A.load(i+SIMDSIZE*6UL,k) * b1;
4726 xmm8 = xmm8 - A.load(i+SIMDSIZE*7UL,k) * b1;
4729 (~C).store( i , j, xmm1 );
4730 (~C).store( i+SIMDSIZE , j, xmm2 );
4731 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4732 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
4733 (~C).store( i+SIMDSIZE*4UL, j, xmm5 );
4734 (~C).store( i+SIMDSIZE*5UL, j, xmm6 );
4735 (~C).store( i+SIMDSIZE*6UL, j, xmm7 );
4736 (~C).store( i+SIMDSIZE*7UL, j, xmm8 );
// Rows in chunks of 4 SIMD vectors; columns two at a time.
4740 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4744 for( ; (j+2UL) <= N; j+=2UL )
4746 const size_t kbegin( ( IsLower<MT5>::value )
4747 ?( ( IsUpper<MT4>::value )
4748 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4749 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4750 :( IsUpper<MT4>::value ? i : 0UL ) );
4751 const size_t kend( ( IsUpper<MT5>::value )
4752 ?( ( IsLower<MT4>::value )
4753 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4754 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4755 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
4757 SIMDType xmm1( (~C).load(i ,j ) );
4758 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j ) );
4759 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j ) );
4760 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j ) );
4761 SIMDType xmm5( (~C).load(i ,j+1UL) );
4762 SIMDType xmm6( (~C).load(i+SIMDSIZE ,j+1UL) );
4763 SIMDType xmm7( (~C).load(i+SIMDSIZE*2UL,j+1UL) );
4764 SIMDType xmm8( (~C).load(i+SIMDSIZE*3UL,j+1UL) );
4766 for(
size_t k=kbegin; k<kend; ++k ) {
4767 const SIMDType a1( A.load(i ,k) );
4768 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
4769 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
4770 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
4771 const SIMDType b1(
set( B(k,j ) ) );
4772 const SIMDType b2(
set( B(k,j+1UL) ) );
4773 xmm1 = xmm1 - a1 * b1;
4774 xmm2 = xmm2 - a2 * b1;
4775 xmm3 = xmm3 - a3 * b1;
4776 xmm4 = xmm4 - a4 * b1;
4777 xmm5 = xmm5 - a1 * b2;
4778 xmm6 = xmm6 - a2 * b2;
4779 xmm7 = xmm7 - a3 * b2;
4780 xmm8 = xmm8 - a4 * b2;
4783 (~C).store( i , j , xmm1 );
4784 (~C).store( i+SIMDSIZE , j , xmm2 );
4785 (~C).store( i+SIMDSIZE*2UL, j , xmm3 );
4786 (~C).store( i+SIMDSIZE*3UL, j , xmm4 );
4787 (~C).store( i , j+1UL, xmm5 );
4788 (~C).store( i+SIMDSIZE , j+1UL, xmm6 );
4789 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 );
4790 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 );
// Same 4-vector row chunk for a single column j (the guarding statement is not visible here).
4795 const size_t kbegin( ( IsLower<MT5>::value )
4796 ?( ( IsUpper<MT4>::value )
4797 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4798 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4799 :( IsUpper<MT4>::value ? i : 0UL ) );
4800 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
4802 SIMDType xmm1( (~C).load(i ,j) );
4803 SIMDType xmm2( (~C).load(i+SIMDSIZE ,j) );
4804 SIMDType xmm3( (~C).load(i+SIMDSIZE*2UL,j) );
4805 SIMDType xmm4( (~C).load(i+SIMDSIZE*3UL,j) );
4807 for(
size_t k=kbegin; k<kend; ++k ) {
4808 const SIMDType b1(
set( B(k,j) ) );
4809 xmm1 = xmm1 - A.load(i ,k) * b1;
4810 xmm2 = xmm2 - A.load(i+SIMDSIZE ,k) * b1;
4811 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,k) * b1;
4812 xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,k) * b1;
4815 (~C).store( i , j, xmm1 );
4816 (~C).store( i+SIMDSIZE , j, xmm2 );
4817 (~C).store( i+SIMDSIZE*2UL, j, xmm3 );
4818 (~C).store( i+SIMDSIZE*3UL, j, xmm4 );
// Rows in chunks of 2 SIMD vectors; columns two at a time.
4822 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4826 for( ; (j+2UL) <= N; j+=2UL )
4828 const size_t kbegin( ( IsLower<MT5>::value )
4829 ?( ( IsUpper<MT4>::value )
4830 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4831 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4832 :( IsUpper<MT4>::value ? i : 0UL ) );
4833 const size_t kend( ( IsUpper<MT5>::value )
4834 ?( ( IsLower<MT4>::value )
4835 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
4836 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
4837 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
4839 SIMDType xmm1( (~C).load(i ,j ) );
4840 SIMDType xmm2( (~C).load(i+SIMDSIZE,j ) );
4841 SIMDType xmm3( (~C).load(i ,j+1UL) );
4842 SIMDType xmm4( (~C).load(i+SIMDSIZE,j+1UL) );
4844 for(
size_t k=kbegin; k<kend; ++k ) {
4845 const SIMDType a1( A.load(i ,k) );
4846 const SIMDType a2( A.load(i+SIMDSIZE,k) );
4847 const SIMDType b1(
set( B(k,j ) ) );
4848 const SIMDType b2(
set( B(k,j+1UL) ) );
4849 xmm1 = xmm1 - a1 * b1;
4850 xmm2 = xmm2 - a2 * b1;
4851 xmm3 = xmm3 - a1 * b2;
4852 xmm4 = xmm4 - a2 * b2;
4855 (~C).store( i , j , xmm1 );
4856 (~C).store( i+SIMDSIZE, j , xmm2 );
4857 (~C).store( i , j+1UL, xmm3 );
4858 (~C).store( i+SIMDSIZE, j+1UL, xmm4 );
// Same 2-vector row chunk for a single column j.
4863 const size_t kbegin( ( IsLower<MT5>::value )
4864 ?( ( IsUpper<MT4>::value )
4865 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4866 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4867 :( IsUpper<MT4>::value ? i : 0UL ) );
4868 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
4870 SIMDType xmm1( (~C).load(i ,j) );
4871 SIMDType xmm2( (~C).load(i+SIMDSIZE,j) );
4873 for(
size_t k=kbegin; k<kend; ++k ) {
4874 const SIMDType b1(
set( B(k,j) ) );
4875 xmm1 = xmm1 - A.load(i ,k) * b1;
4876 xmm2 = xmm2 - A.load(i+SIMDSIZE,k) * b1;
4879 (~C).store( i , j, xmm1 );
4880 (~C).store( i+SIMDSIZE, j, xmm2 );
// One SIMD vector of rows per iteration; columns two at a time.
4884 for( ; i<ipos; i+=SIMDSIZE )
4888 for( ; (j+2UL) <= N; j+=2UL )
4890 const size_t kbegin( ( IsLower<MT5>::value )
4891 ?( ( IsUpper<MT4>::value )
4892 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4893 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4894 :( IsUpper<MT4>::value ? i : 0UL ) );
4895 const size_t kend( ( IsUpper<MT5>::value )
4896 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4899 SIMDType xmm1( (~C).load(i,j ) );
4900 SIMDType xmm2( (~C).load(i,j+1UL) );
4902 for(
size_t k=kbegin; k<kend; ++k ) {
4903 const SIMDType a1( A.load(i,k) );
4904 xmm1 = xmm1 - a1 *
set( B(k,j ) );
4905 xmm2 = xmm2 - a1 *
set( B(k,j+1UL) );
4908 (~C).store( i, j , xmm1 );
4909 (~C).store( i, j+1UL, xmm2 );
// Single column j for one SIMD vector of rows; k runs up to K.
4914 const size_t kbegin( ( IsLower<MT5>::value )
4915 ?( ( IsUpper<MT4>::value )
4916 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4917 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4918 :( IsUpper<MT4>::value ? i : 0UL ) );
4920 SIMDType xmm1( (~C).load(i,j) );
4922 for(
size_t k=kbegin; k<K; ++k ) {
4923 xmm1 = xmm1 - A.load(i,k) *
set( B(k,j) );
4926 (~C).store( i, j, xmm1 );
// Scalar tail: remaining rows i < M, executed only when `remainder` is set.
4930 for( ; remainder && i<M; ++i )
4934 for( ; (j+2UL) <= N; j+=2UL )
4936 const size_t kbegin( ( IsLower<MT5>::value )
4937 ?( ( IsUpper<MT4>::value )
4938 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4939 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4940 :( IsUpper<MT4>::value ? i : 0UL ) );
4941 const size_t kend( ( IsUpper<MT5>::value )
4942 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
4945 ElementType value1( (~C)(i,j ) );
4946 ElementType value2( (~C)(i,j+1UL) );
4948 for(
size_t k=kbegin; k<kend; ++k ) {
4949 value1 -= A(i,k) * B(k,j );
4950 value2 -= A(i,k) * B(k,j+1UL);
4953 (~C)(i,j ) = value1;
4954 (~C)(i,j+1UL) = value2;
// Scalar tail for a single column j; k runs up to K.
4959 const size_t kbegin( ( IsLower<MT5>::value )
4960 ?( ( IsUpper<MT4>::value )
4961 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
4962 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
4963 :( IsUpper<MT4>::value ? i : 0UL ) );
4965 ElementType value( (~C)(i,j) );
4967 for(
size_t k=kbegin; k<K; ++k ) {
4968 value -= A(i,k) * B(k,j);
// Fallback overload of the large subtraction-assignment kernel, selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> does NOT hold: it simply forwards C, A and B
// to selectDefaultSubAssignKernel().
4992 template<
typename MT3
4995 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
4996 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4998 selectDefaultSubAssignKernel( C, A, B );
// Vectorized, cache-blocked large-matrix kernel for the subtraction assignment C -= A * B,
// selected for targets passed as DenseMatrix<MT3,false> when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//   C : target; accessed with load()/store() along the column index j, element-wise in the tail
//   A : left operand; read element-wise as A(i,k) and broadcast with set()
//   B : right operand; read with B.load(k,j) (SIMD) or B(k,j) (scalar tail)
// The iteration space is tiled with DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE.
// NOTE(review): this listing omits several original lines (braces, some loop headers and
// guards); the comments describe only the statements that are visible.
5018 template<
typename MT3
5021 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
5022 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B )
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
5024 const size_t M( A.rows() );
5025 const size_t N( B.columns() );
5026 const size_t K( A.columns() );
// A scalar tail loop is needed unless both C (MT3) and B (MT5) are padded.
5028 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Outer tiling over columns (jj), then rows (ii), then the inner dimension (kk).
5030 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
5032 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// jpos: end of the SIMD-processed column range inside the current tile.
5034 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
5035 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
5037 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
5039 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
5041 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
// ktmp: upper k bound of the current k tile.
5043 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Columns in chunks of 4 SIMD vectors; rows two at a time.
5047 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
5049 const size_t j1( j+SIMDSIZE );
5050 const size_t j2( j+SIMDSIZE*2UL );
5051 const size_t j3( j+SIMDSIZE*3UL );
5055 for( ; (i+2UL) <= iend; i+=2UL )
// The k range is clamped to the current tile and narrowed by the triangular structure
// of A (MT4) and B (MT5).
5057 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5058 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5059 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5060 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// Start the accumulators from the current contents of C.
5062 SIMDType xmm1( (~C).load(i ,j ) );
5063 SIMDType xmm2( (~C).load(i ,j1) );
5064 SIMDType xmm3( (~C).load(i ,j2) );
5065 SIMDType xmm4( (~C).load(i ,j3) );
5066 SIMDType xmm5( (~C).load(i+1UL,j ) );
5067 SIMDType xmm6( (~C).load(i+1UL,j1) );
5068 SIMDType xmm7( (~C).load(i+1UL,j2) );
5069 SIMDType xmm8( (~C).load(i+1UL,j3) );
5071 for(
size_t k=kbegin; k<kend; ++k ) {
5072 const SIMDType a1(
set( A(i ,k) ) );
5073 const SIMDType a2(
set( A(i+1UL,k) ) );
5074 const SIMDType b1( B.load(k,j ) );
5075 const SIMDType b2( B.load(k,j1) );
5076 const SIMDType b3( B.load(k,j2) );
5077 const SIMDType b4( B.load(k,j3) );
5078 xmm1 = xmm1 - a1 * b1;
5079 xmm2 = xmm2 - a1 * b2;
5080 xmm3 = xmm3 - a1 * b3;
5081 xmm4 = xmm4 - a1 * b4;
5082 xmm5 = xmm5 - a2 * b1;
5083 xmm6 = xmm6 - a2 * b2;
5084 xmm7 = xmm7 - a2 * b3;
5085 xmm8 = xmm8 - a2 * b4;
5088 (~C).store( i , j , xmm1 );
5089 (~C).store( i , j1, xmm2 );
5090 (~C).store( i , j2, xmm3 );
5091 (~C).store( i , j3, xmm4 );
5092 (~C).store( i+1UL, j , xmm5 );
5093 (~C).store( i+1UL, j1, xmm6 );
5094 (~C).store( i+1UL, j2, xmm7 );
5095 (~C).store( i+1UL, j3, xmm8 );
// Same 4-vector column chunk for a single row i (the guarding statement is not visible here).
5100 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5101 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5102 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5103 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
5105 SIMDType xmm1( (~C).load(i,j ) );
5106 SIMDType xmm2( (~C).load(i,j1) );
5107 SIMDType xmm3( (~C).load(i,j2) );
5108 SIMDType xmm4( (~C).load(i,j3) );
5110 for(
size_t k=kbegin; k<kend; ++k ) {
5111 const SIMDType a1(
set( A(i,k) ) );
5112 xmm1 = xmm1 - a1 * B.load(k,j );
5113 xmm2 = xmm2 - a1 * B.load(k,j1);
5114 xmm3 = xmm3 - a1 * B.load(k,j2);
5115 xmm4 = xmm4 - a1 * B.load(k,j3);
5118 (~C).store( i, j , xmm1 );
5119 (~C).store( i, j1, xmm2 );
5120 (~C).store( i, j2, xmm3 );
5121 (~C).store( i, j3, xmm4 );
// Columns in chunks of 2 SIMD vectors; rows four at a time, then two, then one.
5125 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
5127 const size_t j1( j+SIMDSIZE );
5131 for( ; (i+4UL) <= iend; i+=4UL )
5133 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5134 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5135 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
5136 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5138 SIMDType xmm1( (~C).load(i ,j ) );
5139 SIMDType xmm2( (~C).load(i ,j1) );
5140 SIMDType xmm3( (~C).load(i+1UL,j ) );
5141 SIMDType xmm4( (~C).load(i+1UL,j1) );
5142 SIMDType xmm5( (~C).load(i+2UL,j ) );
5143 SIMDType xmm6( (~C).load(i+2UL,j1) );
5144 SIMDType xmm7( (~C).load(i+3UL,j ) );
5145 SIMDType xmm8( (~C).load(i+3UL,j1) );
5147 for(
size_t k=kbegin; k<kend; ++k ) {
5148 const SIMDType a1(
set( A(i ,k) ) );
5149 const SIMDType a2(
set( A(i+1UL,k) ) );
5150 const SIMDType a3(
set( A(i+2UL,k) ) );
5151 const SIMDType a4(
set( A(i+3UL,k) ) );
5152 const SIMDType b1( B.load(k,j ) );
5153 const SIMDType b2( B.load(k,j1) );
5154 xmm1 = xmm1 - a1 * b1;
5155 xmm2 = xmm2 - a1 * b2;
5156 xmm3 = xmm3 - a2 * b1;
5157 xmm4 = xmm4 - a2 * b2;
5158 xmm5 = xmm5 - a3 * b1;
5159 xmm6 = xmm6 - a3 * b2;
5160 xmm7 = xmm7 - a4 * b1;
5161 xmm8 = xmm8 - a4 * b2;
5164 (~C).store( i , j , xmm1 );
5165 (~C).store( i , j1, xmm2 );
5166 (~C).store( i+1UL, j , xmm3 );
5167 (~C).store( i+1UL, j1, xmm4 );
5168 (~C).store( i+2UL, j , xmm5 );
5169 (~C).store( i+2UL, j1, xmm6 );
5170 (~C).store( i+3UL, j , xmm7 );
5171 (~C).store( i+3UL, j1, xmm8 );
5174 for( ; (i+2UL) <= iend; i+=2UL )
5176 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5177 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5178 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
5179 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5181 SIMDType xmm1( (~C).load(i ,j ) );
5182 SIMDType xmm2( (~C).load(i ,j1) );
5183 SIMDType xmm3( (~C).load(i+1UL,j ) );
5184 SIMDType xmm4( (~C).load(i+1UL,j1) );
5186 for(
size_t k=kbegin; k<kend; ++k ) {
5187 const SIMDType a1(
set( A(i ,k) ) );
5188 const SIMDType a2(
set( A(i+1UL,k) ) );
5189 const SIMDType b1( B.load(k,j ) );
5190 const SIMDType b2( B.load(k,j1) );
5191 xmm1 = xmm1 - a1 * b1;
5192 xmm2 = xmm2 - a1 * b2;
5193 xmm3 = xmm3 - a2 * b1;
5194 xmm4 = xmm4 - a2 * b2;
5197 (~C).store( i , j , xmm1 );
5198 (~C).store( i , j1, xmm2 );
5199 (~C).store( i+1UL, j , xmm3 );
5200 (~C).store( i+1UL, j1, xmm4 );
// Same 2-vector column chunk for a single row i.
5205 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5206 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5207 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5208 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
5210 SIMDType xmm1( (~C).load(i,j ) );
5211 SIMDType xmm2( (~C).load(i,j1) );
5213 for(
size_t k=kbegin; k<kend; ++k ) {
5214 const SIMDType a1(
set( A(i,k) ) );
5215 xmm1 = xmm1 - a1 * B.load(k,j );
5216 xmm2 = xmm2 - a1 * B.load(k,j1);
5219 (~C).store( i, j , xmm1 );
5220 (~C).store( i, j1, xmm2 );
// One SIMD vector of columns per iteration, one row at a time.
5224 for( ; j<jpos; j+=SIMDSIZE )
5226 for(
size_t i=ii; i<iend; ++i )
5228 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5229 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5230 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5231 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
5233 SIMDType xmm1( (~C).load(i,j) );
5235 for(
size_t k=kbegin; k<kend; ++k ) {
5236 const SIMDType a1(
set( A(i,k) ) );
5237 xmm1 = xmm1 - a1 * B.load(k,j);
5240 (~C).store( i, j, xmm1 );
// Scalar tail: remaining columns j < jend, executed only when `remainder` is set.
5244 for( ; remainder && j<jend; ++j )
5246 for(
size_t i=ii; i<iend; ++i )
5248 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5249 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5250 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
5251 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
5253 ElementType value( (~C)(i,j) );
5255 for(
size_t k=kbegin; k<kend; ++k ) {
5256 value -= A(i,k) * B(k,j);
// Vectorized, cache-blocked large-matrix kernel for the subtraction assignment C -= A * B,
// selected for targets passed as DenseMatrix<MT3,true> when
// UseVectorizedDefaultKernel<MT3,MT4,MT5> holds.
//   C : target; accessed with load()/store() along the row index i, element-wise in the tail
//   A : left operand; read with A.load(i,k) (SIMD) or A(i,k) (scalar tail)
//   B : right operand; read element-wise as B(k,j) and broadcast with set()
// The iteration space is tiled with TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE.
// NOTE(review): this listing omits several original lines (braces, some loop headers and
// guards); the comments describe only the statements that are visible.
5284 template<
typename MT3
5287 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5> >
5288 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B )
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
5290 const size_t M( A.rows() );
5291 const size_t N( B.columns() );
5292 const size_t K( A.columns() );
// NOTE(review): every SIMD load in this kernel is on C or A (A.load(i,k)); B is only read
// element-wise. The padding test below nevertheless inspects MT5, whereas the
// DenseMatrix<MT3,true> overload of selectSmallSubAssignKernel tests MT4. Confirm whether
// MT4 is intended here.
5294 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Outer tiling over rows (ii), then columns (jj), then the inner dimension (kk).
5296 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
5298 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// ipos: end of the SIMD-processed row range inside the current tile.
5300 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
5301 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
5303 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
5305 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
5307 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
// ktmp: upper k bound of the current k tile.
5309 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Rows in chunks of 4 SIMD vectors; columns two at a time.
5313 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
5315 const size_t i1( i+SIMDSIZE );
5316 const size_t i2( i+SIMDSIZE*2UL );
5317 const size_t i3( i+SIMDSIZE*3UL );
5321 for( ; (j+2UL) <= jend; j+=2UL )
// The k range is clamped to the current tile and narrowed by the triangular structure
// of A (MT4) and B (MT5).
5323 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5324 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5325 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5326 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// Start the accumulators from the current contents of C.
5328 SIMDType xmm1( (~C).load(i ,j ) );
5329 SIMDType xmm2( (~C).load(i1,j ) );
5330 SIMDType xmm3( (~C).load(i2,j ) );
5331 SIMDType xmm4( (~C).load(i3,j ) );
5332 SIMDType xmm5( (~C).load(i ,j+1UL) );
5333 SIMDType xmm6( (~C).load(i1,j+1UL) );
5334 SIMDType xmm7( (~C).load(i2,j+1UL) );
5335 SIMDType xmm8( (~C).load(i3,j+1UL) );
5337 for(
size_t k=kbegin; k<kend; ++k ) {
5338 const SIMDType a1( A.load(i ,k) );
5339 const SIMDType a2( A.load(i1,k) );
5340 const SIMDType a3( A.load(i2,k) );
5341 const SIMDType a4( A.load(i3,k) );
5342 const SIMDType b1(
set( B(k,j ) ) );
5343 const SIMDType b2(
set( B(k,j+1UL) ) );
5344 xmm1 = xmm1 - a1 * b1;
5345 xmm2 = xmm2 - a2 * b1;
5346 xmm3 = xmm3 - a3 * b1;
5347 xmm4 = xmm4 - a4 * b1;
5348 xmm5 = xmm5 - a1 * b2;
5349 xmm6 = xmm6 - a2 * b2;
5350 xmm7 = xmm7 - a3 * b2;
5351 xmm8 = xmm8 - a4 * b2;
5354 (~C).store( i , j , xmm1 );
5355 (~C).store( i1, j , xmm2 );
5356 (~C).store( i2, j , xmm3 );
5357 (~C).store( i3, j , xmm4 );
5358 (~C).store( i , j+1UL, xmm5 );
5359 (~C).store( i1, j+1UL, xmm6 );
5360 (~C).store( i2, j+1UL, xmm7 );
5361 (~C).store( i3, j+1UL, xmm8 );
// Same 4-vector row chunk for a single column j (the guarding statement is not visible here).
5366 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5367 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5368 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
5369 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5371 SIMDType xmm1( (~C).load(i ,j) );
5372 SIMDType xmm2( (~C).load(i1,j) );
5373 SIMDType xmm3( (~C).load(i2,j) );
5374 SIMDType xmm4( (~C).load(i3,j) );
5376 for(
size_t k=kbegin; k<kend; ++k ) {
5377 const SIMDType b1(
set( B(k,j) ) );
5378 xmm1 = xmm1 - A.load(i ,k) * b1;
5379 xmm2 = xmm2 - A.load(i1,k) * b1;
5380 xmm3 = xmm3 - A.load(i2,k) * b1;
5381 xmm4 = xmm4 - A.load(i3,k) * b1;
5384 (~C).store( i , j, xmm1 );
5385 (~C).store( i1, j, xmm2 );
5386 (~C).store( i2, j, xmm3 );
5387 (~C).store( i3, j, xmm4 );
// Rows in chunks of 2 SIMD vectors; columns four at a time, then two, then one.
5391 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
5393 const size_t i1( i+SIMDSIZE );
5397 for( ; (j+4UL) <= jend; j+=4UL )
5399 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5400 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5401 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5402 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
5404 SIMDType xmm1( (~C).load(i ,j ) );
5405 SIMDType xmm2( (~C).load(i1,j ) );
5406 SIMDType xmm3( (~C).load(i ,j+1UL) );
5407 SIMDType xmm4( (~C).load(i1,j+1UL) );
5408 SIMDType xmm5( (~C).load(i ,j+2UL) );
5409 SIMDType xmm6( (~C).load(i1,j+2UL) );
5410 SIMDType xmm7( (~C).load(i ,j+3UL) );
5411 SIMDType xmm8( (~C).load(i1,j+3UL) );
5413 for(
size_t k=kbegin; k<kend; ++k ) {
5414 const SIMDType a1( A.load(i ,k) );
5415 const SIMDType a2( A.load(i1,k) );
5416 const SIMDType b1(
set( B(k,j ) ) );
5417 const SIMDType b2(
set( B(k,j+1UL) ) );
5418 const SIMDType b3(
set( B(k,j+2UL) ) );
5419 const SIMDType b4(
set( B(k,j+3UL) ) );
5420 xmm1 = xmm1 - a1 * b1;
5421 xmm2 = xmm2 - a2 * b1;
5422 xmm3 = xmm3 - a1 * b2;
5423 xmm4 = xmm4 - a2 * b2;
5424 xmm5 = xmm5 - a1 * b3;
5425 xmm6 = xmm6 - a2 * b3;
5426 xmm7 = xmm7 - a1 * b4;
5427 xmm8 = xmm8 - a2 * b4;
5430 (~C).store( i , j , xmm1 );
5431 (~C).store( i1, j , xmm2 );
5432 (~C).store( i , j+1UL, xmm3 );
5433 (~C).store( i1, j+1UL, xmm4 );
5434 (~C).store( i , j+2UL, xmm5 );
5435 (~C).store( i1, j+2UL, xmm6 );
5436 (~C).store( i , j+3UL, xmm7 );
5437 (~C).store( i1, j+3UL, xmm8 );
5440 for( ; (j+2UL) <= jend; j+=2UL )
5442 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5443 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5444 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5445 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
5447 SIMDType xmm1( (~C).load(i ,j ) );
5448 SIMDType xmm2( (~C).load(i1,j ) );
5449 SIMDType xmm3( (~C).load(i ,j+1UL) );
5450 SIMDType xmm4( (~C).load(i1,j+1UL) );
5452 for(
size_t k=kbegin; k<kend; ++k ) {
5453 const SIMDType a1( A.load(i ,k) );
5454 const SIMDType a2( A.load(i1,k) );
5455 const SIMDType b1(
set( B(k,j ) ) );
5456 const SIMDType b2(
set( B(k,j+1UL) ) );
5457 xmm1 = xmm1 - a1 * b1;
5458 xmm2 = xmm2 - a2 * b1;
5459 xmm3 = xmm3 - a1 * b2;
5460 xmm4 = xmm4 - a2 * b2;
5463 (~C).store( i , j , xmm1 );
5464 (~C).store( i1, j , xmm2 );
5465 (~C).store( i , j+1UL, xmm3 );
5466 (~C).store( i1, j+1UL, xmm4 );
// Same 2-vector row chunk for a single column j.
5471 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5472 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5473 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
5474 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5476 SIMDType xmm1( (~C).load(i ,j) );
5477 SIMDType xmm2( (~C).load(i1,j) );
5479 for(
size_t k=kbegin; k<kend; ++k ) {
5480 const SIMDType b1(
set( B(k,j) ) );
5481 xmm1 = xmm1 - A.load(i ,k) * b1;
5482 xmm2 = xmm2 - A.load(i1,k) * b1;
5485 (~C).store( i , j, xmm1 );
5486 (~C).store( i1, j, xmm2 );
// One SIMD vector of rows per iteration, one column at a time.
5490 for( ; i<ipos; i+=SIMDSIZE )
5492 for(
size_t j=jj; j<jend; ++j )
5494 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5495 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5496 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
5497 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5499 SIMDType xmm1( (~C).load(i,j) );
5501 for(
size_t k=kbegin; k<kend; ++k ) {
5502 const SIMDType b1(
set( B(k,j) ) );
5503 xmm1 = xmm1 - A.load(i,k) * b1;
5506 (~C).store( i, j, xmm1 );
// Scalar tail: remaining rows i < iend, executed only when `remainder` is set.
5510 for( ; remainder && i<iend; ++i )
5512 for(
size_t j=jj; j<jend; ++j )
5514 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
5515 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
5516 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
5517 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
5519 ElementType value( (~C)(i,j) );
5521 for(
size_t k=kbegin; k<kend; ++k ) {
5522 value -= A(i,k) * B(k,j);
// Fallback overload of the BLAS subtraction-assignment kernel, selected when
// UseBlasKernel<MT3,MT4,MT5> does NOT hold: it forwards C, A and B to
// selectLargeSubAssignKernel().
5549 template<
typename MT3
5552 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5> >
5553 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5555 selectLargeSubAssignKernel( C, A, B );
5561 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-backed subtraction-assignment kernel (C -= A * B), selected when
// UseBlasKernel<MT3,MT4,MT5> holds. Three visible paths:
//   * A triangular: copy B into a temporary, multiply it by A from the left via trmm(),
//     then subtract the temporary from C with subAssign().
//   * B triangular: copy A into a temporary, multiply it by B from the right via trmm(),
//     then subtract the temporary from C with subAssign().
//   * otherwise: a single gemm() call with the scalars ET(-1) and ET(1)
//     (presumably alpha = -1 and beta = 1, i.e. C = -A*B + C; confirm against gemm()).
// NOTE(review): braces and the final `else` are on lines missing from this listing.
5575 template<
typename MT3
5578 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5> >
5579 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Element type of the target; used to build the scalar factors below.
5581 typedef ElementType_<MT3> ET;
5583 if( IsTriangular<MT4>::value ) {
5584 ResultType_<MT3> tmp(
serial( B ) );
5585 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5586 subAssign( C, tmp );
5588 else if( IsTriangular<MT5>::value ) {
5589 ResultType_<MT3> tmp(
serial( A ) );
5590 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(1) );
5591 subAssign( C, tmp );
5594 gemm( C, A, B, ET(-1), ET(1) );
// Friend function template that is only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines missing from this listing.
// Visible logic: a branch for an empty target (zero rows or zero columns) and a separate
// branch for a zero inner dimension (rhs.lhs_.columns() == 0); the branch bodies are not visible.
5629 template<
typename MT
5631 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5639 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
5642 else if( rhs.lhs_.columns() == 0UL ) {
// Friend function template that is only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines missing from this listing.
// Visible logic: picks a temporary type by storage order SO (ResultType if SO is true,
// OppositeType otherwise) and evaluates `rhs` into a const temporary of that type.
5678 template<
typename MT
5680 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5685 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
5697 const TmpType tmp( rhs );
// Friend function template that is only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines missing from this listing.
// Visible logic: a single guard that fires when the target has zero rows or columns, or when
// the inner dimension rhs.lhs_.columns() is zero; the guarded body is not visible.
5719 template<
typename MT
5721 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5729 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Friend function template that is only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
// NOTE(review): the function name and parameter list are on lines missing from this listing.
// Visible logic: a single guard that fires when the target has zero rows or columns, or when
// the inner dimension rhs.lhs_.columns() is zero; the guarded body is not visible.
5768 template<
typename MT
5770 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
5778 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Head of the DMatScalarMultExpr specialization wrapping a TDMatDMatMultExpr<MT1,MT2> scaled
// by a scalar of type ST. It derives publicly from DenseMatrix<..., true> and privately from
// the MatScalarMultExpr and Computation tag types.
// NOTE(review): the `class DMatScalarMultExpr<...>` line itself is missing from this listing.
5838 template<
typename MT1
5842 :
public DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2>, ST, true >, true >
5843 ,
private MatScalarMultExpr
5844 ,
private Computation
// Shorthand aliases: the wrapped multiplication expression, its result type, and the
// result/element/composite types of the two matrix operands.
5848 typedef TDMatDMatMultExpr<MT1,MT2> MMM;
5849 typedef ResultType_<MMM> RES;
5850 typedef ResultType_<MT1>
RT1;
5851 typedef ResultType_<MT2>
RT2;
5852 typedef ElementType_<RT1>
ET1;
5853 typedef ElementType_<RT2>
ET2;
5854 typedef CompositeType_<MT1>
CT1;
5855 typedef CompositeType_<MT2>
CT2;
// evaluateLeft: the left operand is a computation or requires evaluation.
5860 enum :
bool { evaluateLeft = IsComputation<MT1>::value || RequiresEvaluation<MT1>::value };
// evaluateRight: the right operand is a computation or requires evaluation.
5865 enum :
bool { evaluateRight = IsComputation<MT2>::value || RequiresEvaluation<MT2>::value };
// Compile-time helper: `value` is true when at least one of the two operands has to be
// evaluated (evaluateLeft || evaluateRight). The template parameters T1..T3 do not appear
// in the visible expression.
5873 template<
typename T1,
typename T2,
typename T3 >
5874 struct IsEvaluationRequired {
5875 enum :
bool { value = ( evaluateLeft || evaluateRight ) };
// Compile-time predicate deciding whether a BLAS kernel may be used for target type T1,
// operand types T2/T3 and scalar type T4.
// NOTE(review): the line opening the enum (and any leading condition on it) is missing from
// this listing. The conditions that are visible require:
//   * mutable data access on T1 and const data access on T2 and T3,
//   * neither T2 nor T3 diagonal,
//   * simdEnabled on all three matrix types,
//   * BLAS-compatible element types that are identical across T1, T2 and T3,
//   * not the combination "builtin element type of T1 with a complex T4".
5883 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5884 struct UseBlasKernel {
5886 HasMutableDataAccess<T1>::value &&
5887 HasConstDataAccess<T2>::value &&
5888 HasConstDataAccess<T3>::value &&
5889 !IsDiagonal<T2>::value && !IsDiagonal<T3>::value &&
5890 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5891 IsBLASCompatible< ElementType_<T1> >::value &&
5892 IsBLASCompatible< ElementType_<T2> >::value &&
5893 IsBLASCompatible< ElementType_<T3> >::value &&
5894 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
5895 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
5896 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Compile-time predicate deciding whether the vectorized default kernels may be used for
// target type T1, operand types T2/T3 and scalar type T4.
// NOTE(review): the line opening the enum and part of the AreSIMDCombinable argument list
// are missing from this listing. The conditions that are visible require:
//   * not both operands diagonal,
//   * not (T2 diagonal with a column-major T1) and not (T3 diagonal with a row-major T1),
//   * simdEnabled on all three matrix types,
//   * SIMD-combinable element types, and SIMD addition and multiplication available for
//     the element types of T2 and T3.
5904 template<
typename T1,
typename T2,
typename T3,
typename T4 >
5905 struct UseVectorizedDefaultKernel {
5907 !( IsDiagonal<T2>::value && IsDiagonal<T3>::value ) &&
5908 !( IsDiagonal<T2>::value && IsColumnMajorMatrix<T1>::value ) &&
5909 !( IsDiagonal<T3>::value && IsRowMajorMatrix<T1>::value ) &&
5910 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
5911 AreSIMDCombinable< ElementType_<T1>
5915 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
5916 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Type of this expression object.
5922 typedef DMatScalarMultExpr<MMM,ST,true>
This;
// SIMD vector type associated with ElementType.
5927 typedef SIMDTrait_<ElementType>
SIMDType;
// Type of the wrapped matrix/matrix multiplication expression (stored as matrix_).
5932 typedef const TDMatDMatMultExpr<MT1,MT2>
LeftOperand;
// Operand types used for evaluation: the plain result type when the operand has to be
// evaluated first, otherwise its composite type.
5938 typedef IfTrue_< evaluateLeft, const RT1, CT1 >
LT;
5941 typedef IfTrue_< evaluateRight, const RT2, CT2 >
RT;
// simdEnabled: not both operands diagonal, both operands SIMD-enabled, element and scalar
// types SIMD-combinable, and SIMD addition/multiplication available for ET1 and ET2.
5946 enum :
bool { simdEnabled = !( IsDiagonal<MT1>::value && IsDiagonal<MT2>::value ) &&
5947 MT1::simdEnabled && MT2::simdEnabled &&
5948 AreSIMDCombinable<ET1,ET2,ST>::value &&
5949 HasSIMDAdd<ET1,ET2>::value &&
5950 HasSIMDMult<ET1,ET2>::value };
// smpAssignable: neither operand needs evaluation and both are themselves SMP-assignable.
5953 enum :
bool { smpAssignable = !evaluateLeft && MT1::smpAssignable &&
5954 !evaluateRight && MT2::smpAssignable };
// Constructor taking the matrix/matrix multiplication expression and the scalar factor.
// NOTE(review): the initializer list and body are on lines missing from this listing.
5968 explicit inline DMatScalarMultExpr(
const MMM& matrix, ST scalar )
// Element access: returns the (i,j) element of the wrapped product multiplied by the scalar.
// NOTE(review): the declared return type here is ResultType, while at() is declared to
// return ReturnType. Confirm whether ReturnType was intended for this element access.
5981 inline ResultType
operator()(
size_t i,
size_t j )
const {
5984 return matrix_(i,j) * scalar_;
// Checked element access: tests i against matrix_.rows() and j against matrix_.columns(),
// then forwards to operator()(i,j).
// NOTE(review): the statements executed when a check fails are on lines missing from this
// listing (presumably an exception is thrown; confirm).
5996 inline ReturnType
at(
size_t i,
size_t j )
const {
5997 if( i >= matrix_.rows() ) {
6000 if( j >= matrix_.columns() ) {
6003 return (*
this)(i,j);
// Returns the number of rows of the wrapped multiplication expression.
6012 inline size_t rows()
const {
6013 return matrix_.rows();
// Returns the number of columns of the wrapped multiplication expression.
6022 inline size_t columns()
const {
6023 return matrix_.columns();
// Aliasing query: delegates to matrix_.canAlias() with the given address.
6053 template<
typename T >
6054 inline bool canAlias(
const T* alias )
const {
6055 return matrix_.canAlias( alias );
// Aliasing query: delegates to matrix_.isAliased() with the given address.
6065 template<
typename T >
6066 inline bool isAliased(
const T* alias )
const {
6067 return matrix_.isAliased( alias );
// NOTE(review): the headers of the two member functions these statements belong to are on
// lines missing from this listing.
// First fragment: forwards the alignment query to the wrapped expression.
6077 return matrix_.isAligned();
// Second fragment (tail of a boolean expression): the element count rows()*columns() is
// compared against TDMATDMATMULT_THRESHOLD and must be at least SMP_TDMATDMATMULT_THRESHOLD.
6088 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
6089 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// The wrapped matrix/matrix multiplication expression.
6095 LeftOperand matrix_;
// The scalar factor applied to every element of matrix_ (see operator()).
6096 RightOperand scalar_;
// Assignment of the scaled product to a dense matrix `lhs`.
// Visible logic: fetch both operands of the wrapped multiplication, branch on an empty target
// (zero rows or columns) and on a zero inner dimension (left.columns() == 0), and otherwise
// dispatch to selectAssignKernel() with the target, the operands A and B, and the scalar.
// NOTE(review): the branch bodies and the definitions of A and B are on lines missing from
// this listing.
6111 template<
typename MT
6113 friend inline void assign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6120 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
6121 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
6123 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
6126 else if( left.columns() == 0UL ) {
6141 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for C = scalar * (A * B).
// The small kernel is chosen when both operands are diagonal or when the target holds fewer
// than TDMATDMATMULT_THRESHOLD elements; otherwise the BLAS kernel selector is called.
// NOTE(review): the `else` between the two calls is on a line missing from this listing.
6156 template<
typename MT3
6160 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6162 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
6163 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
6164 selectSmallAssignKernel( C, A, B, scalar );
6166 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for C = scalar * (A * B), for targets passed as
// DenseMatrix<MT3,false>; enabled only when neither A (MT4) nor B (MT5) is diagonal.
// Visible structure, per row i:
//   1. derive the k range from the triangular structure of A,
//   2. initialize C(i,j) from the first k (k = kbegin) with plain assignment,
//   3. accumulate the remaining k values with +=,
//   4. multiply the affected part of the row by `scalar`.
// NOTE(review): this listing omits several original lines (braces, the bodies of the reset
// loops, the fallback arms of the range expressions); comments cover only what is visible.
6184 template<
typename MT3
6188 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6189 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
6191 const size_t M( A.rows() );
6192 const size_t N( B.columns() );
6193 const size_t K( A.columns() );
6195 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed when A is (strictly) upper or lower triangular.
6197 const size_t kbegin( ( IsUpper<MT4>::value )
6198 ?( IsStrictlyUpper<MT4>::value ? i+1UL : i )
6200 const size_t kend( ( IsLower<MT4>::value )
6201 ?( IsStrictlyLower<MT4>::value ? i : i+1UL )
// Empty k range for a strictly triangular A: the row is handled by a loop over all columns
// (the loop body is not visible here).
6205 if( IsStrictlyTriangular<MT4>::value && kbegin == kend ) {
6206 for(
size_t j=0UL; j<N; ++j ) {
// Column range for the first k (k = kbegin), narrowed by the triangular structure of B.
6213 const size_t jbegin( ( IsUpper<MT5>::value )
6214 ?( IsStrictlyUpper<MT5>::value ? kbegin+1UL : kbegin )
6216 const size_t jend( ( IsLower<MT5>::value )
6217 ?( IsStrictlyLower<MT5>::value ? kbegin : kbegin+1UL )
6221 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6222 for(
size_t j=0UL; j<jbegin; ++j ) {
6226 else if( IsStrictlyUpper<MT5>::value ) {
6227 reset( (~C)(i,0UL) );
// First contribution is written with plain assignment.
6229 for(
size_t j=jbegin; j<jend; ++j ) {
6230 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6232 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6233 for(
size_t j=jend; j<N; ++j ) {
6237 else if( IsStrictlyLower<MT5>::value ) {
6238 reset( (~C)(i,N-1UL) );
// Remaining k values are accumulated.
6242 for(
size_t k=kbegin+1UL; k<kend; ++k )
6244 const size_t jbegin( ( IsUpper<MT5>::value )
6245 ?( IsStrictlyUpper<MT5>::value ? k+1UL : k )
6247 const size_t jend( ( IsLower<MT5>::value )
6248 ?( IsStrictlyLower<MT5>::value ? k-1UL : k )
6252 for(
size_t j=jbegin; j<jend; ++j ) {
6253 (~C)(i,j) += A(i,k) * B(k,j);
// For a lower B the element at jend is written with plain assignment rather than +=.
6255 if( IsLower<MT5>::value ) {
6256 (~C)(i,jend) = A(i,k) * B(k,jend);
// Column range of row i that is scaled by `scalar`, narrowed when both operands are
// upper (or both lower) triangular.
6261 const size_t jbegin( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6262 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? i+1UL : i )
6264 const size_t jend( ( IsLower<MT4>::value && IsLower<MT5>::value )
6265 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? i : i+1UL )
6269 for(
size_t j=jbegin; j<jend; ++j ) {
6270 (~C)(i,j) *= scalar;
// Default (non-vectorized) kernel for C = scalar * (A * B), for targets passed as
// DenseMatrix<MT3,true>; enabled only when neither A (MT4) nor B (MT5) is diagonal.
// Visible structure, per column j:
//   1. derive the k range from the triangular structure of B,
//   2. initialize C(i,j) from the first k (k = kbegin) with plain assignment,
//   3. accumulate the remaining k values with +=,
//   4. multiply the affected part of the column by `scalar`.
// NOTE(review): this listing omits several original lines (braces, the bodies of the reset
// loops, the fallback arms of the range expressions); comments cover only what is visible.
6291 template<
typename MT3
6295 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
6296 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// M = rows of A, N = columns of B, K = columns of A (the inner dimension).
6298 const size_t M( A.rows() );
6299 const size_t N( B.columns() );
6300 const size_t K( A.columns() );
6302 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed when B is (strictly) lower or upper triangular.
6304 const size_t kbegin( ( IsLower<MT5>::value )
6305 ?( IsStrictlyLower<MT5>::value ? j+1UL : j )
6307 const size_t kend( ( IsUpper<MT5>::value )
6308 ?( IsStrictlyUpper<MT5>::value ? j : j+1UL )
// Empty k range for a strictly triangular B: the column is handled by a loop over all rows
// (the loop body is not visible here).
6312 if( IsStrictlyTriangular<MT5>::value && kbegin == kend ) {
6313 for(
size_t i=0UL; i<M; ++i ) {
// Row range for the first k (k = kbegin), narrowed by the triangular structure of A.
6320 const size_t ibegin( ( IsLower<MT4>::value )
6321 ?( IsStrictlyLower<MT4>::value ? kbegin+1UL : kbegin )
6323 const size_t iend( ( IsUpper<MT4>::value )
6324 ?( IsStrictlyUpper<MT4>::value ? kbegin : kbegin+1UL )
6328 if( IsLower<MT4>::value && IsLower<MT5>::value ) {
6329 for(
size_t i=0UL; i<ibegin; ++i ) {
6333 else if( IsStrictlyLower<MT4>::value ) {
6334 reset( (~C)(0UL,j) );
// First contribution is written with plain assignment.
6336 for(
size_t i=ibegin; i<iend; ++i ) {
6337 (~C)(i,j) = A(i,kbegin) * B(kbegin,j);
6339 if( IsUpper<MT4>::value && IsUpper<MT5>::value ) {
6340 for(
size_t i=iend; i<M; ++i ) {
6344 else if( IsStrictlyUpper<MT4>::value ) {
6345 reset( (~C)(M-1UL,j) );
// Remaining k values are accumulated.
6349 for(
size_t k=kbegin+1UL; k<kend; ++k )
6351 const size_t ibegin( ( IsLower<MT4>::value )
6352 ?( IsStrictlyLower<MT4>::value ? k+1UL : k )
6354 const size_t iend( ( IsUpper<MT4>::value )
6355 ?( IsStrictlyUpper<MT4>::value ? k-1UL : k )
6359 for(
size_t i=ibegin; i<iend; ++i ) {
6360 (~C)(i,j) += A(i,k) * B(k,j);
// For an upper A the element at iend is written with plain assignment rather than +=.
6362 if( IsUpper<MT4>::value ) {
6363 (~C)(iend,j) = A(iend,k) * B(k,j);
// Row range of column j that is scaled by `scalar`, narrowed when both operands are
// lower (or both upper) triangular.
6368 const size_t ibegin( ( IsLower<MT4>::value && IsLower<MT5>::value )
6369 ?( IsStrictlyLower<MT4>::value || IsStrictlyLower<MT5>::value ? j+1UL : j )
6371 const size_t iend( ( IsUpper<MT4>::value && IsUpper<MT5>::value )
6372 ?( IsStrictlyUpper<MT4>::value || IsStrictlyUpper<MT5>::value ? j : j+1UL )
6376 for(
size_t i=ibegin; i<iend; ++i ) {
6377 (~C)(i,j) *= scalar;
// Default scaled assignment kernel, enabled when A (MT4) is not diagonal and
// B (MT5) is diagonal, for a target of type DenseMatrix<MT3,false>
// (storage-order flag 'false'; presumably row-major -- confirm).
// Since only B(j,j) is read, each target element is A(i,j) * B(j,j) * scalar.
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, ':' branches and some loop bodies are not visible in this listing.
6398 template<
typename MT3
6402 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6403 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6405 const size_t M( A.rows() );
6406 const size_t N( B.columns() );
// The target is traversed in square tiles of edge length BLOCK_SIZE.
6408 const size_t block( BLOCK_SIZE );
6410 for(
size_t ii=0UL; ii<M; ii+=block ) {
6411 const size_t iend(
min( M, ii+block ) );
6412 for(
size_t jj=0UL; jj<N; jj+=block ) {
6413 const size_t jend(
min( N, jj+block ) );
6414 for(
size_t i=ii; i<iend; ++i )
// Column range [jbegin,jpos) of row i inside the current tile, narrowed by the
// triangular traits of A and clamped to the tile bounds jj / jend.
6416 const size_t jbegin( ( IsUpper<MT4>::value )
6417 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
6419 const size_t jpos( ( IsLower<MT4>::value )
6420 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
// Columns of the tile before jbegin (upper A): loop body not visible here.
6423 if( IsUpper<MT4>::value ) {
6424 for(
size_t j=jj; j<jbegin; ++j ) {
// Scaled product with the diagonal element of B.
6428 for(
size_t j=jbegin; j<jpos; ++j ) {
6429 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Columns of the tile from jpos on (lower A): loop body not visible here.
6431 if( IsLower<MT4>::value ) {
6432 for(
size_t j=jpos; j<jend; ++j ) {
// Default scaled assignment kernel, enabled when A (MT4) is not diagonal and
// B (MT5) is diagonal, for a target of type DenseMatrix<MT3,true>
// (storage-order flag 'true'; presumably column-major -- confirm).
// Each target element is computed as A(i,j) * B(j,j) * scalar, column by column.
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, ':' branches and some loop bodies are not visible in this listing.
6456 template<
typename MT3
6460 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
6461 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6463 const size_t M( A.rows() );
6464 const size_t N( B.columns() );
6466 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j, narrowed by the triangular traits of A.
6468 const size_t ibegin( ( IsLower<MT4>::value )
6469 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
6471 const size_t iend( ( IsUpper<MT4>::value )
6472 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// Rows before ibegin (lower A): loop body not visible here.
6476 if( IsLower<MT4>::value ) {
6477 for(
size_t i=0UL; i<ibegin; ++i ) {
// Scaled product with the diagonal element of B.
6481 for(
size_t i=ibegin; i<iend; ++i ) {
6482 (~C)(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend on (upper A): loop body not visible here.
6484 if( IsUpper<MT4>::value ) {
6485 for(
size_t i=iend; i<M; ++i ) {
// Default scaled assignment kernel, enabled when A (MT4) is diagonal and
// B (MT5) is not, for a target of type DenseMatrix<MT3,false>
// (storage-order flag 'false'; presumably row-major -- confirm).
// Each target element is computed as A(i,i) * B(i,j) * scalar, row by row.
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, ':' branches and some loop bodies are not visible in this listing.
6507 template<
typename MT3
6511 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6512 selectDefaultAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6514 const size_t M( A.rows() );
6515 const size_t N( B.columns() );
6517 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i, narrowed by the triangular traits of B.
6519 const size_t jbegin( ( IsUpper<MT5>::value )
6520 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
6522 const size_t jend( ( IsLower<MT5>::value )
6523 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// Columns before jbegin (upper B): loop body not visible here.
6527 if( IsUpper<MT5>::value ) {
6528 for(
size_t j=0UL; j<jbegin; ++j ) {
// Scaled product with the diagonal element of A.
6532 for(
size_t j=jbegin; j<jend; ++j ) {
6533 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Columns from jend on (lower B): loop body not visible here.
6535 if( IsLower<MT5>::value ) {
6536 for(
size_t j=jend; j<N; ++j ) {
// Default scaled assignment kernel, enabled when A (MT4) is diagonal and
// B (MT5) is not, for a target of type DenseMatrix<MT3,true>
// (storage-order flag 'true'; presumably column-major -- confirm).
// Each target element is computed as A(i,i) * B(i,j) * scalar.
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, ':' branches and some loop bodies are not visible in this listing.
6558 template<
typename MT3
6562 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
6563 selectDefaultAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6565 const size_t M( A.rows() );
6566 const size_t N( B.columns() );
// The target is traversed in square tiles of edge length BLOCK_SIZE,
// column tiles in the outer loop and row tiles in the inner loop.
6568 const size_t block( BLOCK_SIZE );
6570 for(
size_t jj=0UL; jj<N; jj+=block ) {
6571 const size_t jend(
min( N, jj+block ) );
6572 for(
size_t ii=0UL; ii<M; ii+=block ) {
6573 const size_t iend(
min( M, ii+block ) );
6574 for(
size_t j=jj; j<jend; ++j )
// Row range [ibegin,ipos) of column j inside the current tile, narrowed by the
// triangular traits of B and clamped to the tile bounds ii / iend.
6576 const size_t ibegin( ( IsLower<MT5>::value )
6577 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
6579 const size_t ipos( ( IsUpper<MT5>::value )
6580 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
// Rows of the tile before ibegin (lower B): loop body not visible here.
6583 if( IsLower<MT5>::value ) {
6584 for(
size_t i=ii; i<ibegin; ++i ) {
// Scaled product with the diagonal element of A.
6588 for(
size_t i=ibegin; i<ipos; ++i ) {
6589 (~C)(i,j) = A(i,i) * B(i,j) * scalar;
// Rows of the tile from ipos on (upper B): loop body not visible here.
6591 if( IsUpper<MT5>::value ) {
6592 for(
size_t i=ipos; i<iend; ++i ) {
// Default scaled assignment kernel, enabled when both A (MT4) and B (MT5) are
// diagonal. Only the diagonal of the target is written: C(i,i) is set to
// A(i,i) * B(i,i) * scalar for every row index i of A.
// NOTE(review): the numeric statement prefixes jump from 6621 to 6625, so any
// statements preceding the loop (e.g. clearing the off-diagonal part of C) are
// not visible in this listing -- confirm against the full header.
6616 template<
typename MT3
6620 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
6621 selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6625 for(
size_t i=0UL; i<A.rows(); ++i ) {
6626 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix scaled assignment kernel for the non-vectorized case: this overload
// is enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// simply forwards all arguments to selectDefaultAssignKernel().
// NOTE(review): the remaining template parameters and the braces are not visible
// in this listing (gaps in the numeric statement prefixes).
6645 template<
typename MT3
6649 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6650 selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6652 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix scaled assignment kernel, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds, for a target of type
// DenseMatrix<MT3,false> (storage-order flag 'false'; presumably row-major -- confirm).
// The columns of C are processed in panels of 8, 4, 2 and 1 SIMD vectors
// (SIMDSIZE elements each), followed by a scalar loop over the leftover columns.
// Within a panel the product is accumulated in SIMD registers over k and then
// stored multiplied by the broadcast scalar ('factor').
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, the declarations of j / i / xmm1 / value*, and the ':' branches of some
// conditional expressions are not visible in this listing.
6671 template<
typename MT3
6675 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6676 selectSmallAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6678 const size_t M( A.rows() );
6679 const size_t N( B.columns() );
6680 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
6682 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// jpos: N rounded down to a multiple of SIMDSIZE when a remainder loop is used
// (the mask trick assumes SIMDSIZE is a power of two -- confirm), otherwise N.
6684 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Scalar broadcast into a SIMD vector, applied once per store.
6687 const SIMDType factor(
set( scalar ) );
// --- Panels of 8 SIMD vectors, one row of C at a time ---
6691 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
6692 for(
size_t i=0UL; i<M; ++i )
// Inner range [kbegin,kend), narrowed by the triangular traits of A (row i)
// and of B (column panel starting at j).
6694 const size_t kbegin( ( IsUpper<MT4>::value )
6695 ?( ( IsLower<MT5>::value )
6696 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6697 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6698 :( IsLower<MT5>::value ? j : 0UL ) );
6699 const size_t kend( ( IsLower<MT4>::value )
6700 ?( ( IsUpper<MT5>::value )
6701 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
6702 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
6703 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
6705 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Broadcast A(i,k) and multiply it with eight consecutive SIMD vectors of row k of B.
6707 for(
size_t k=kbegin; k<kend; ++k ) {
6708 const SIMDType a1(
set( A(i,k) ) );
6709 xmm1 = xmm1 + a1 * B.load(k,j );
6710 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6711 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6712 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6713 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
6714 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
6715 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
6716 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
6719 (~C).store( i, j , xmm1 * factor );
6720 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6721 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6722 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
6723 (~C).store( i, j+SIMDSIZE*4UL, xmm5 * factor );
6724 (~C).store( i, j+SIMDSIZE*5UL, xmm6 * factor );
6725 (~C).store( i, j+SIMDSIZE*6UL, xmm7 * factor );
6726 (~C).store( i, j+SIMDSIZE*7UL, xmm8 * factor );
// --- Panels of 4 SIMD vectors, two rows of C at a time ---
6730 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
6734 for( ; (i+2UL) <= M; i+=2UL )
6736 const size_t kbegin( ( IsUpper<MT4>::value )
6737 ?( ( IsLower<MT5>::value )
6738 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6739 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6740 :( IsLower<MT5>::value ? j : 0UL ) );
// kend covers both rows i and i+1, hence the i+1 / i+2 bounds for a lower A.
6741 const size_t kend( ( IsLower<MT4>::value )
6742 ?( ( IsUpper<MT5>::value )
6743 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
6744 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6745 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
6747 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..4 accumulate row i, xmm5..8 accumulate row i+1.
6749 for(
size_t k=kbegin; k<kend; ++k ) {
6750 const SIMDType a1(
set( A(i ,k) ) );
6751 const SIMDType a2(
set( A(i+1UL,k) ) );
6752 const SIMDType b1( B.load(k,j ) );
6753 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
6754 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
6755 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
6756 xmm1 = xmm1 + a1 * b1;
6757 xmm2 = xmm2 + a1 * b2;
6758 xmm3 = xmm3 + a1 * b3;
6759 xmm4 = xmm4 + a1 * b4;
6760 xmm5 = xmm5 + a2 * b1;
6761 xmm6 = xmm6 + a2 * b2;
6762 xmm7 = xmm7 + a2 * b3;
6763 xmm8 = xmm8 + a2 * b4;
6766 (~C).store( i , j , xmm1 * factor );
6767 (~C).store( i , j+SIMDSIZE , xmm2 * factor );
6768 (~C).store( i , j+SIMDSIZE*2UL, xmm3 * factor );
6769 (~C).store( i , j+SIMDSIZE*3UL, xmm4 * factor );
6770 (~C).store( i+1UL, j , xmm5 * factor );
6771 (~C).store( i+1UL, j+SIMDSIZE , xmm6 * factor );
6772 (~C).store( i+1UL, j+SIMDSIZE*2UL, xmm7 * factor );
6773 (~C).store( i+1UL, j+SIMDSIZE*3UL, xmm8 * factor );
// Single leftover row of the 4-vector panel (the guarding condition is not visible here).
6778 const size_t kbegin( ( IsUpper<MT4>::value )
6779 ?( ( IsLower<MT5>::value )
6780 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6781 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6782 :( IsLower<MT5>::value ? j : 0UL ) );
6783 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
6785 SIMDType xmm1, xmm2, xmm3, xmm4;
6787 for(
size_t k=kbegin; k<kend; ++k ) {
6788 const SIMDType a1(
set( A(i,k) ) );
6789 xmm1 = xmm1 + a1 * B.load(k,j );
6790 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
6791 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
6792 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
6795 (~C).store( i, j , xmm1 * factor );
6796 (~C).store( i, j+SIMDSIZE , xmm2 * factor );
6797 (~C).store( i, j+SIMDSIZE*2UL, xmm3 * factor );
6798 (~C).store( i, j+SIMDSIZE*3UL, xmm4 * factor );
// --- Panels of 2 SIMD vectors, two rows of C at a time ---
6802 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
6806 for( ; (i+2UL) <= M; i+=2UL )
6808 const size_t kbegin( ( IsUpper<MT4>::value )
6809 ?( ( IsLower<MT5>::value )
6810 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6811 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6812 :( IsLower<MT5>::value ? j : 0UL ) );
6813 const size_t kend( ( IsLower<MT4>::value )
6814 ?( ( IsUpper<MT5>::value )
6815 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
6816 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
6817 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
6819 SIMDType xmm1, xmm2, xmm3, xmm4;
// xmm1/xmm2 accumulate row i, xmm3/xmm4 accumulate row i+1.
6821 for(
size_t k=kbegin; k<kend; ++k ) {
6822 const SIMDType a1(
set( A(i ,k) ) );
6823 const SIMDType a2(
set( A(i+1UL,k) ) );
6824 const SIMDType b1( B.load(k,j ) );
6825 const SIMDType b2( B.load(k,j+SIMDSIZE) );
6826 xmm1 = xmm1 + a1 * b1;
6827 xmm2 = xmm2 + a1 * b2;
6828 xmm3 = xmm3 + a2 * b1;
6829 xmm4 = xmm4 + a2 * b2;
6832 (~C).store( i , j , xmm1 * factor );
6833 (~C).store( i , j+SIMDSIZE, xmm2 * factor );
6834 (~C).store( i+1UL, j , xmm3 * factor );
6835 (~C).store( i+1UL, j+SIMDSIZE, xmm4 * factor );
// Single leftover row of the 2-vector panel (the guarding condition is not visible here).
6840 const size_t kbegin( ( IsUpper<MT4>::value )
6841 ?( ( IsLower<MT5>::value )
6842 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6843 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6844 :( IsLower<MT5>::value ? j : 0UL ) );
6845 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
6847 SIMDType xmm1, xmm2;
6849 for(
size_t k=kbegin; k<kend; ++k ) {
6850 const SIMDType a1(
set( A(i,k) ) );
6851 xmm1 = xmm1 + a1 * B.load(k,j );
6852 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
6855 (~C).store( i, j , xmm1 * factor );
6856 (~C).store( i, j+SIMDSIZE, xmm2 * factor );
// --- Panels of a single SIMD vector, two rows of C at a time ---
6860 for( ; j<jpos; j+=SIMDSIZE )
6864 for( ; (i+2UL) <= M; i+=2UL )
6866 const size_t kbegin( ( IsUpper<MT4>::value )
6867 ?( ( IsLower<MT5>::value )
6868 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6869 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6870 :( IsLower<MT5>::value ? j : 0UL ) );
6871 const size_t kend( ( IsLower<MT4>::value )
6872 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6875 SIMDType xmm1, xmm2;
6877 for(
size_t k=kbegin; k<kend; ++k ) {
6878 const SIMDType b1( B.load(k,j) );
6879 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
6880 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
6883 (~C).store( i , j, xmm1 * factor );
6884 (~C).store( i+1UL, j, xmm2 * factor );
// Single leftover row of the 1-vector panel; the k loop runs up to K here.
6889 const size_t kbegin( ( IsUpper<MT4>::value )
6890 ?( ( IsLower<MT5>::value )
6891 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6892 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6893 :( IsLower<MT5>::value ? j : 0UL ) );
6897 for(
size_t k=kbegin; k<K; ++k ) {
6898 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
6901 (~C).store( i, j, xmm1 * factor );
// --- Scalar remainder: leftover columns j in [jpos,N), two rows at a time ---
6905 for( ; remainder && j<N; ++j )
6909 for( ; (i+2UL) <= M; i+=2UL )
6911 const size_t kbegin( ( IsUpper<MT4>::value )
6912 ?( ( IsLower<MT5>::value )
6913 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6914 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6915 :( IsLower<MT5>::value ? j : 0UL ) );
6916 const size_t kend( ( IsLower<MT4>::value )
6917 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
6923 for(
size_t k=kbegin; k<kend; ++k ) {
6924 value1 += A(i ,k) * B(k,j);
6925 value2 += A(i+1UL,k) * B(k,j);
6928 (~C)(i ,j) = value1 * scalar;
6929 (~C)(i+1UL,j) = value2 * scalar;
// Single leftover row of the scalar remainder.
6934 const size_t kbegin( ( IsUpper<MT4>::value )
6935 ?( ( IsLower<MT5>::value )
6936 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
6937 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
6938 :( IsLower<MT5>::value ? j : 0UL ) );
6942 for(
size_t k=kbegin; k<K; ++k ) {
6943 value += A(i,k) * B(k,j);
6946 (~C)(i,j) = value * scalar;
// Vectorized small-matrix scaled assignment kernel, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds, for a target of type
// DenseMatrix<MT3,true> (storage-order flag 'true'; presumably column-major -- confirm).
// The rows of C are processed in panels of 8, 4, 2 and 1 SIMD vectors
// (SIMDSIZE elements each), followed by a scalar loop over the leftover rows.
// SIMD vectors are loaded from A, elements of B are broadcast, and the accumulated
// result is stored multiplied by the broadcast scalar ('factor').
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, the declarations of i / j / xmm1 / value*, and the ':' branches of some
// conditional expressions are not visible in this listing.
6967 template<
typename MT3
6971 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
6972 selectSmallAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
6974 const size_t M( A.rows() );
6975 const size_t N( B.columns() );
6976 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and A are padded.
6978 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// ipos: M rounded down to a multiple of SIMDSIZE when a remainder loop is used
// (the mask trick assumes SIMDSIZE is a power of two -- confirm), otherwise M.
6980 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Scalar broadcast into a SIMD vector, applied once per store.
6983 const SIMDType factor(
set( scalar ) );
// --- Panels of 8 SIMD vectors, one column of C at a time ---
6987 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
6988 for(
size_t j=0UL; j<N; ++j )
// Inner range [kbegin,kend), narrowed by the triangular traits of B (column j)
// and of A (row panel starting at i).
6990 const size_t kbegin( ( IsLower<MT5>::value )
6991 ?( ( IsUpper<MT4>::value )
6992 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
6993 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
6994 :( IsUpper<MT4>::value ? i : 0UL ) );
6995 const size_t kend( ( IsUpper<MT5>::value )
6996 ?( ( IsLower<MT4>::value )
6997 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
6998 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
6999 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
7001 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Broadcast B(k,j) and multiply it with eight consecutive SIMD vectors of column k of A.
7003 for(
size_t k=kbegin; k<kend; ++k ) {
7004 const SIMDType b1(
set( B(k,j) ) );
7005 xmm1 = xmm1 + A.load(i ,k) * b1;
7006 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
7007 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
7008 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
7009 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
7010 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
7011 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
7012 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
7015 (~C).store( i , j, xmm1 * factor );
7016 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7017 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7018 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
7019 (~C).store( i+SIMDSIZE*4UL, j, xmm5 * factor );
7020 (~C).store( i+SIMDSIZE*5UL, j, xmm6 * factor );
7021 (~C).store( i+SIMDSIZE*6UL, j, xmm7 * factor );
7022 (~C).store( i+SIMDSIZE*7UL, j, xmm8 * factor );
// --- Panels of 4 SIMD vectors, two columns of C at a time ---
7026 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7030 for( ; (j+2UL) <= N; j+=2UL )
7032 const size_t kbegin( ( IsLower<MT5>::value )
7033 ?( ( IsUpper<MT4>::value )
7034 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7035 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7036 :( IsUpper<MT4>::value ? i : 0UL ) );
// kend covers both columns j and j+1, hence the j+1 / j+2 bounds for an upper B.
7037 const size_t kend( ( IsUpper<MT5>::value )
7038 ?( ( IsLower<MT4>::value )
7039 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7040 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7041 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
7043 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..4 accumulate column j, xmm5..8 accumulate column j+1.
7045 for(
size_t k=kbegin; k<kend; ++k ) {
7046 const SIMDType a1( A.load(i ,k) );
7047 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
7048 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
7049 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
7050 const SIMDType b1(
set( B(k,j ) ) );
7051 const SIMDType b2(
set( B(k,j+1UL) ) );
7052 xmm1 = xmm1 + a1 * b1;
7053 xmm2 = xmm2 + a2 * b1;
7054 xmm3 = xmm3 + a3 * b1;
7055 xmm4 = xmm4 + a4 * b1;
7056 xmm5 = xmm5 + a1 * b2;
7057 xmm6 = xmm6 + a2 * b2;
7058 xmm7 = xmm7 + a3 * b2;
7059 xmm8 = xmm8 + a4 * b2;
7062 (~C).store( i , j , xmm1 * factor );
7063 (~C).store( i+SIMDSIZE , j , xmm2 * factor );
7064 (~C).store( i+SIMDSIZE*2UL, j , xmm3 * factor );
7065 (~C).store( i+SIMDSIZE*3UL, j , xmm4 * factor );
7066 (~C).store( i , j+1UL, xmm5 * factor );
7067 (~C).store( i+SIMDSIZE , j+1UL, xmm6 * factor );
7068 (~C).store( i+SIMDSIZE*2UL, j+1UL, xmm7 * factor );
7069 (~C).store( i+SIMDSIZE*3UL, j+1UL, xmm8 * factor );
// Single leftover column of the 4-vector panel (the guarding condition is not visible here).
7074 const size_t kbegin( ( IsLower<MT5>::value )
7075 ?( ( IsUpper<MT4>::value )
7076 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7077 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7078 :( IsUpper<MT4>::value ? i : 0UL ) );
7079 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
7081 SIMDType xmm1, xmm2, xmm3, xmm4;
7083 for(
size_t k=kbegin; k<kend; ++k ) {
7084 const SIMDType b1(
set( B(k,j) ) );
7085 xmm1 = xmm1 + A.load(i ,k) * b1;
7086 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
7087 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
7088 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
7091 (~C).store( i , j, xmm1 * factor );
7092 (~C).store( i+SIMDSIZE , j, xmm2 * factor );
7093 (~C).store( i+SIMDSIZE*2UL, j, xmm3 * factor );
7094 (~C).store( i+SIMDSIZE*3UL, j, xmm4 * factor );
// --- Panels of 2 SIMD vectors, two columns of C at a time ---
7098 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7102 for( ; (j+2UL) <= N; j+=2UL )
7104 const size_t kbegin( ( IsLower<MT5>::value )
7105 ?( ( IsUpper<MT4>::value )
7106 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7107 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7108 :( IsUpper<MT4>::value ? i : 0UL ) );
7109 const size_t kend( ( IsUpper<MT5>::value )
7110 ?( ( IsLower<MT4>::value )
7111 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
7112 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
7113 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
7115 SIMDType xmm1, xmm2, xmm3, xmm4;
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
7117 for(
size_t k=kbegin; k<kend; ++k ) {
7118 const SIMDType a1( A.load(i ,k) );
7119 const SIMDType a2( A.load(i+SIMDSIZE,k) );
7120 const SIMDType b1(
set( B(k,j ) ) );
7121 const SIMDType b2(
set( B(k,j+1UL) ) );
7122 xmm1 = xmm1 + a1 * b1;
7123 xmm2 = xmm2 + a2 * b1;
7124 xmm3 = xmm3 + a1 * b2;
7125 xmm4 = xmm4 + a2 * b2;
7128 (~C).store( i , j , xmm1 * factor );
7129 (~C).store( i+SIMDSIZE, j , xmm2 * factor );
7130 (~C).store( i , j+1UL, xmm3 * factor );
7131 (~C).store( i+SIMDSIZE, j+1UL, xmm4 * factor );
// Single leftover column of the 2-vector panel (the guarding condition is not visible here).
7136 const size_t kbegin( ( IsLower<MT5>::value )
7137 ?( ( IsUpper<MT4>::value )
7138 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7139 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7140 :( IsUpper<MT4>::value ? i : 0UL ) );
7141 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
7143 SIMDType xmm1, xmm2;
7145 for(
size_t k=kbegin; k<kend; ++k ) {
7146 const SIMDType b1(
set( B(k,j) ) );
7147 xmm1 = xmm1 + A.load(i ,k) * b1;
7148 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
7151 (~C).store( i , j, xmm1 * factor );
7152 (~C).store( i+SIMDSIZE, j, xmm2 * factor );
// --- Panels of a single SIMD vector, two columns of C at a time ---
7156 for( ; i<ipos; i+=SIMDSIZE )
7160 for( ; (j+2UL) <= N; j+=2UL )
7162 const size_t kbegin( ( IsLower<MT5>::value )
7163 ?( ( IsUpper<MT4>::value )
7164 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7165 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7166 :( IsUpper<MT4>::value ? i : 0UL ) );
7167 const size_t kend( ( IsUpper<MT5>::value )
7168 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7171 SIMDType xmm1, xmm2;
7173 for(
size_t k=kbegin; k<kend; ++k ) {
7174 const SIMDType a1( A.load(i,k) );
7175 xmm1 = xmm1 + a1 *
set( B(k,j ) );
7176 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
7179 (~C).store( i, j , xmm1 * factor );
7180 (~C).store( i, j+1UL, xmm2 * factor );
// Single leftover column of the 1-vector panel; the k loop runs up to K here.
7185 const size_t kbegin( ( IsLower<MT5>::value )
7186 ?( ( IsUpper<MT4>::value )
7187 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7188 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7189 :( IsUpper<MT4>::value ? i : 0UL ) );
7193 for(
size_t k=kbegin; k<K; ++k ) {
7194 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
7197 (~C).store( i, j, xmm1 * factor );
// --- Scalar remainder: leftover rows i in [ipos,M), two columns at a time ---
7201 for( ; remainder && i<M; ++i )
7205 for( ; (j+2UL) <= N; j+=2UL )
7207 const size_t kbegin( ( IsLower<MT5>::value )
7208 ?( ( IsUpper<MT4>::value )
7209 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7210 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7211 :( IsUpper<MT4>::value ? i : 0UL ) );
7212 const size_t kend( ( IsUpper<MT5>::value )
7213 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
7219 for(
size_t k=kbegin; k<kend; ++k ) {
7220 value1 += A(i,k) * B(k,j );
7221 value2 += A(i,k) * B(k,j+1UL);
7224 (~C)(i,j ) = value1 * scalar;
7225 (~C)(i,j+1UL) = value2 * scalar;
// Single leftover column of the scalar remainder.
7230 const size_t kbegin( ( IsLower<MT5>::value )
7231 ?( ( IsUpper<MT4>::value )
7232 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
7233 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
7234 :( IsUpper<MT4>::value ? i : 0UL ) );
7238 for(
size_t k=kbegin; k<K; ++k ) {
7239 value += A(i,k) * B(k,j);
7242 (~C)(i,j) = value * scalar;
// Large-matrix scaled assignment kernel for the non-vectorized case: this overload
// is enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold and
// simply forwards all arguments to selectDefaultAssignKernel().
// NOTE(review): the remaining template parameters and the braces are not visible
// in this listing (gaps in the numeric statement prefixes).
7262 template<
typename MT3
7266 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7267 selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7269 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized large-matrix scaled assignment kernel, enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds, for a target of type
// DenseMatrix<MT3,false> (storage-order flag 'false'; presumably row-major -- confirm).
// The iteration space is tiled with DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE. For
// every k-tile the partial product is accumulated in SIMD registers and then
// ADDED to the current content of C (load + store), scaled by 'factor'.
// NOTE(review): the numeric statement prefixes have gaps; template parameters,
// braces, the declarations of j / i / xmm1 / value, and the body of the
// per-tile i/j loop at 7314-7315 are not visible in this listing.
7288 template<
typename MT3
7292 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7293 selectLargeAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7295 const size_t M( A.rows() );
7296 const size_t N( B.columns() );
7297 const size_t K( A.columns() );
// A scalar remainder loop is needed unless both C and B are padded.
7299 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Scalar broadcast into a SIMD vector, applied once per store.
7301 const SIMDType factor(
set( scalar ) );
// Column tiles [jj,jend).
7303 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
7305 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// jpos: end of the vectorizable part of the column tile (jend rounded down to a
// multiple of SIMDSIZE when a remainder loop is used; checked by the assert below).
7307 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
7308 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Row tiles [ii,iend).
7310 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
7312 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Pass over all elements of the current C tile; the loop body is not visible here
// (presumably clears the tile before accumulation -- confirm).
7314 for(
size_t i=ii; i<iend; ++i ) {
7315 for(
size_t j=jj; j<jend; ++j ) {
// Inner-dimension tiles [kk,ktmp).
7320 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
7322 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// --- Panels of 4 SIMD vectors, two rows at a time ---
7326 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
7328 const size_t j1( j+SIMDSIZE );
7329 const size_t j2( j+SIMDSIZE*2UL );
7330 const size_t j3( j+SIMDSIZE*3UL );
7334 for( ; (i+2UL) <= iend; i+=2UL )
// [kbegin,kend): the k-tile intersected with the ranges implied by the
// triangular traits of A and B.
7336 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7337 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7338 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7339 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7341 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..4 accumulate row i, xmm5..8 accumulate row i+1.
7343 for(
size_t k=kbegin; k<kend; ++k ) {
7344 const SIMDType a1(
set( A(i ,k) ) );
7345 const SIMDType a2(
set( A(i+1UL,k) ) );
7346 const SIMDType b1( B.load(k,j ) );
7347 const SIMDType b2( B.load(k,j1) );
7348 const SIMDType b3( B.load(k,j2) );
7349 const SIMDType b4( B.load(k,j3) );
7350 xmm1 = xmm1 + a1 * b1;
7351 xmm2 = xmm2 + a1 * b2;
7352 xmm3 = xmm3 + a1 * b3;
7353 xmm4 = xmm4 + a1 * b4;
7354 xmm5 = xmm5 + a2 * b1;
7355 xmm6 = xmm6 + a2 * b2;
7356 xmm7 = xmm7 + a2 * b3;
7357 xmm8 = xmm8 + a2 * b4;
// Add the scaled partial sums of this k-tile to the values already stored in C.
7360 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7361 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7362 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
7363 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
7364 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
7365 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
7366 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
7367 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Single leftover row of the 4-vector panel (the guarding condition is not visible here).
7372 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7373 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7374 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7375 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
7377 SIMDType xmm1, xmm2, xmm3, xmm4;
7379 for(
size_t k=kbegin; k<kend; ++k ) {
7380 const SIMDType a1(
set( A(i,k) ) );
7381 xmm1 = xmm1 + a1 * B.load(k,j );
7382 xmm2 = xmm2 + a1 * B.load(k,j1);
7383 xmm3 = xmm3 + a1 * B.load(k,j2);
7384 xmm4 = xmm4 + a1 * B.load(k,j3);
7387 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7388 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
7389 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
7390 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// --- Panels of 2 SIMD vectors: four rows, then two rows, then one row at a time ---
7394 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
7396 const size_t j1( j+SIMDSIZE );
7400 for( ; (i+4UL) <= iend; i+=4UL )
7402 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7403 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7404 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
7405 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7407 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Register pairs (xmm1,xmm2) ... (xmm7,xmm8) accumulate rows i ... i+3.
7409 for(
size_t k=kbegin; k<kend; ++k ) {
7410 const SIMDType a1(
set( A(i ,k) ) );
7411 const SIMDType a2(
set( A(i+1UL,k) ) );
7412 const SIMDType a3(
set( A(i+2UL,k) ) );
7413 const SIMDType a4(
set( A(i+3UL,k) ) );
7414 const SIMDType b1( B.load(k,j ) );
7415 const SIMDType b2( B.load(k,j1) );
7416 xmm1 = xmm1 + a1 * b1;
7417 xmm2 = xmm2 + a1 * b2;
7418 xmm3 = xmm3 + a2 * b1;
7419 xmm4 = xmm4 + a2 * b2;
7420 xmm5 = xmm5 + a3 * b1;
7421 xmm6 = xmm6 + a3 * b2;
7422 xmm7 = xmm7 + a4 * b1;
7423 xmm8 = xmm8 + a4 * b2;
7426 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7427 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7428 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7429 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
7430 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
7431 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
7432 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
7433 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Two rows at a time.
7436 for( ; (i+2UL) <= iend; i+=2UL )
7438 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7439 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7440 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
7441 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7443 SIMDType xmm1, xmm2, xmm3, xmm4;
7445 for(
size_t k=kbegin; k<kend; ++k ) {
7446 const SIMDType a1(
set( A(i ,k) ) );
7447 const SIMDType a2(
set( A(i+1UL,k) ) );
7448 const SIMDType b1( B.load(k,j ) );
7449 const SIMDType b2( B.load(k,j1) );
7450 xmm1 = xmm1 + a1 * b1;
7451 xmm2 = xmm2 + a1 * b2;
7452 xmm3 = xmm3 + a2 * b1;
7453 xmm4 = xmm4 + a2 * b2;
7456 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7457 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
7458 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
7459 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Single leftover row of the 2-vector panel (the guarding condition is not visible here).
7464 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7465 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7466 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7467 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
7469 SIMDType xmm1, xmm2;
7471 for(
size_t k=kbegin; k<kend; ++k ) {
7472 const SIMDType a1(
set( A(i,k) ) );
7473 xmm1 = xmm1 + a1 * B.load(k,j );
7474 xmm2 = xmm2 + a1 * B.load(k,j1);
7477 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
7478 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// --- Panels of a single SIMD vector, one row at a time ---
7482 for( ; j<jpos; j+=SIMDSIZE )
7484 for(
size_t i=ii; i<iend; ++i )
7486 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7487 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7488 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7489 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
7493 for(
size_t k=kbegin; k<kend; ++k ) {
7494 const SIMDType a1(
set( A(i,k) ) );
7495 xmm1 = xmm1 + a1 * B.load(k,j);
7498 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// --- Scalar remainder: leftover columns j in [jpos,jend) of the tile ---
7502 for( ; remainder && j<jend; ++j )
7504 for(
size_t i=ii; i<iend; ++i )
7506 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7507 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7508 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
7509 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
7513 for(
size_t k=kbegin; k<kend; ++k ) {
7514 value += A(i,k) * B(k,j);
// Accumulate the scaled partial sum of this k-tile into C.
7517 (~C)(i,j) += value * scalar;
// Vectorized "large matrix" kernel for a column-major target (DenseMatrix<MT3,true>).
// It is only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// The work is tiled over rows (ii), columns (jj) and the inner dimension (kk). For every
// tile the SIMD sums of A(i,k)*B(k,j) are multiplied by 'factor' (the broadcast scalar)
// and added to the values loaded from C before being stored back.
//   C      : target matrix (accessed through ~C)
//   A, B   : left and right operand of the product
//   scalar : scaling factor applied to every accumulated sum
// NOTE(review): this excerpt omits some original lines (braces, a few declarations and
// guards); the comments below describe only the statements that are visible.
7541 template<
typename MT3
7545 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
7546 selectLargeAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7548 const size_t M( A.rows() );
7549 const size_t N( B.columns() );
7550 const size_t K( A.columns() );
// Element-wise clean-up is needed unless both C and A are padded.
7552 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Scalar broadcast into a SIMD value, used to scale every accumulated sum.
7554 const SIMDType factor(
set( scalar ) );
// Row tiles of TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE rows.
7556 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
7558 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// End of the part of the row tile handled with SIMD load/store; the assertion below
// states that, with remainder set, this equals iend rounded down to a multiple of SIMDSIZE.
7560 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
7561 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Column tiles of TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE columns.
7563 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
7565 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// NOTE(review): the statement inside this loop nest is not visible in this excerpt;
// presumably it clears the current tile of C before accumulation - confirm.
7567 for(
size_t j=jj; j<jend; ++j ) {
7568 for(
size_t i=ii; i<iend; ++i ) {
// Inner-dimension tiles; ktmp is the exclusive end of the current k tile.
7573 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
7575 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Row panels of four SIMD vectors (starting rows i, i1, i2, i3).
7579 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
7581 const size_t i1( i+SIMDSIZE );
7582 const size_t i2( i+SIMDSIZE*2UL );
7583 const size_t i3( i+SIMDSIZE*3UL );
// Two columns per step: eight accumulators (4 row vectors x 2 columns).
7587 for( ; (j+2UL) <= jend; j+=2UL )
// The k range is the current k tile [kk,ktmp), narrowed further according to the
// IsUpper/IsLower traits of A (MT4) and B (MT5).
7589 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7590 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7591 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7592 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7594 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7596 for(
size_t k=kbegin; k<kend; ++k ) {
7597 const SIMDType a1( A.load(i ,k) );
7598 const SIMDType a2( A.load(i1,k) );
7599 const SIMDType a3( A.load(i2,k) );
7600 const SIMDType a4( A.load(i3,k) );
7601 const SIMDType b1(
set( B(k,j ) ) );
7602 const SIMDType b2(
set( B(k,j+1UL) ) );
7603 xmm1 = xmm1 + a1 * b1;
7604 xmm2 = xmm2 + a2 * b1;
7605 xmm3 = xmm3 + a3 * b1;
7606 xmm4 = xmm4 + a4 * b1;
7607 xmm5 = xmm5 + a1 * b2;
7608 xmm6 = xmm6 + a2 * b2;
7609 xmm7 = xmm7 + a3 * b2;
7610 xmm8 = xmm8 + a4 * b2;
// Add the scaled partial sums of this k tile to the current content of C.
7613 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7614 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7615 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
7616 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
7617 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
7618 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
7619 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
7620 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// Single-column variant of the four-vector panel (kend uses j+1UL, four accumulators).
7625 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7626 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7627 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
7628 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7630 SIMDType xmm1, xmm2, xmm3, xmm4;
7632 for(
size_t k=kbegin; k<kend; ++k ) {
7633 const SIMDType b1(
set( B(k,j) ) );
7634 xmm1 = xmm1 + A.load(i ,k) * b1;
7635 xmm2 = xmm2 + A.load(i1,k) * b1;
7636 xmm3 = xmm3 + A.load(i2,k) * b1;
7637 xmm4 = xmm4 + A.load(i3,k) * b1;
7640 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7641 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
7642 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
7643 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// Row panels of two SIMD vectors (starting rows i and i1).
7647 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
7649 const size_t i1( i+SIMDSIZE );
// Four columns per step: eight accumulators (2 row vectors x 4 columns).
7653 for( ; (j+4UL) <= jend; j+=4UL )
7655 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7656 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7657 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7658 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
7660 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7662 for(
size_t k=kbegin; k<kend; ++k ) {
7663 const SIMDType a1( A.load(i ,k) );
7664 const SIMDType a2( A.load(i1,k) );
7665 const SIMDType b1(
set( B(k,j ) ) );
7666 const SIMDType b2(
set( B(k,j+1UL) ) );
7667 const SIMDType b3(
set( B(k,j+2UL) ) );
7668 const SIMDType b4(
set( B(k,j+3UL) ) );
7669 xmm1 = xmm1 + a1 * b1;
7670 xmm2 = xmm2 + a2 * b1;
7671 xmm3 = xmm3 + a1 * b2;
7672 xmm4 = xmm4 + a2 * b2;
7673 xmm5 = xmm5 + a1 * b3;
7674 xmm6 = xmm6 + a2 * b3;
7675 xmm7 = xmm7 + a1 * b4;
7676 xmm8 = xmm8 + a2 * b4;
7679 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7680 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7681 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7682 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
7683 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
7684 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
7685 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
7686 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// Two columns per step for the two-vector panel.
7689 for( ; (j+2UL) <= jend; j+=2UL )
7691 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7692 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7693 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7694 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
7696 SIMDType xmm1, xmm2, xmm3, xmm4;
7698 for(
size_t k=kbegin; k<kend; ++k ) {
7699 const SIMDType a1( A.load(i ,k) );
7700 const SIMDType a2( A.load(i1,k) );
7701 const SIMDType b1(
set( B(k,j ) ) );
7702 const SIMDType b2(
set( B(k,j+1UL) ) );
7703 xmm1 = xmm1 + a1 * b1;
7704 xmm2 = xmm2 + a2 * b1;
7705 xmm3 = xmm3 + a1 * b2;
7706 xmm4 = xmm4 + a2 * b2;
7709 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
7710 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
7711 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
7712 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// Single-column variant of the two-vector panel.
7717 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7718 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7719 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
7720 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7722 SIMDType xmm1, xmm2;
7724 for(
size_t k=kbegin; k<kend; ++k ) {
7725 const SIMDType b1(
set( B(k,j) ) );
7726 xmm1 = xmm1 + A.load(i ,k) * b1;
7727 xmm2 = xmm2 + A.load(i1,k) * b1;
7730 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
7731 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// One SIMD vector of rows per step, one column at a time.
7735 for( ; i<ipos; i+=SIMDSIZE )
7737 for(
size_t j=jj; j<jend; ++j )
7739 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7740 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7741 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
7742 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7746 for(
size_t k=kbegin; k<kend; ++k ) {
7747 const SIMDType b1(
set( B(k,j) ) );
7748 xmm1 = xmm1 + A.load(i,k) * b1;
7751 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Element-wise clean-up for the rows from i up to iend; runs only when remainder is true.
7755 for( ; remainder && i<iend; ++i )
7757 for(
size_t j=jj; j<jend; ++j )
7759 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
7760 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
7761 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
7762 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
7766 for(
size_t k=kbegin; k<kend; ++k ) {
7767 value += A(i,k) * B(k,j);
7770 (~C)(i,j) += value * scalar;
// Fallback overload of the BLAS kernel selection for the scaled assignment.
// It is chosen when UseBlasKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_) and simply
// forwards all arguments to selectLargeAssignKernel.
//   C      : target matrix
//   A, B   : left and right operand of the product
//   scalar : scaling factor
7793 template<
typename MT3
7797 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7798 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7800 selectLargeAssignKernel( C, A, B, scalar );
7805 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based overload of the scaled assignment kernel.
// It is enabled when UseBlasKernel<MT3,MT4,MT5,ST2> holds. The scalar is converted to the
// element type of the target (ET) before it is handed to the BLAS wrappers.
//   - A triangular: trmm with A on the left (CblasLeft), lower/upper chosen by IsLower<MT4>.
//   - B triangular: trmm with B on the right (CblasRight), lower/upper chosen by IsLower<MT5>.
//   - otherwise   : gemm with the factors ET(scalar) and ET(0).
// NOTE(review): the original lines between each 'if' and its trmm call are not visible in
// this excerpt; presumably they seed C with the other operand - confirm.
// NOTE(review): presumably the last two gemm arguments are alpha and beta - confirm against
// the gemm wrapper.
7819 template<
typename MT3
7823 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
7824 selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7826 typedef ElementType_<MT3> ET;
7828 if( IsTriangular<MT4>::value ) {
7830 trmm( C, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7832 else if( IsTriangular<MT5>::value ) {
7834 trmm( C, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
7837 gemm( C, A, B, ET(scalar), ET(0) );
// Assignment of the scaled matrix product to a sparse matrix.
// The expression is first evaluated serially into a dense temporary, which is then assigned
// to the sparse target. The temporary's type is picked via IfTrue_ from ResultType and
// OppositeType depending on SO, the storage-order flag of the target.
//   lhs : sparse target matrix
//   rhs : scaled product expression to be assigned
7855 template<
typename MT
7857 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7861 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
7873 const TmpType tmp(
serial( rhs ) );
7874 assign( ~lhs, tmp );
// Addition assignment of the scaled matrix product to a dense matrix.
// Fetches the two operands of the wrapped product (rhs.matrix_), tests for an empty target
// or a zero inner dimension, and finally dispatches to selectAddAssignKernel together with
// the stored scalar (rhs.scalar_).
//   lhs : dense target matrix
//   rhs : scaled product expression to be added
// NOTE(review): the body of the emptiness check and the definitions of A and B are on lines
// not visible in this excerpt; presumably the check returns early and A/B are evaluated
// forms of 'left' and 'right' - confirm.
7890 template<
typename MT
7892 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7899 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
7900 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
7902 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7916 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled addition assignment.
// The small kernel is used when both operands are diagonal or when the number of elements
// of C (rows * columns) is below TDMATDMATMULT_THRESHOLD; selectBlasAddAssignKernel is the
// alternative path.
//   C      : target matrix
//   A, B   : left and right operand of the product
//   scalar : scaling factor
// NOTE(review): the line between the two calls is missing from this excerpt; presumably it
// holds the 'else' that makes the two calls mutually exclusive - confirm.
7931 template<
typename MT3
7935 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7937 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
7938 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7939 selectSmallAddAssignKernel( C, A, B, scalar );
7941 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the case that neither operand is diagonal
// (enabled via And< Not<IsDiagonal<MT4>>, Not<IsDiagonal<MT5>> >).
// The scaled product A * B * scalar is evaluated serially into a temporary of type
// ResultType, which is then add-assigned to C.
//   C      : target matrix
//   A, B   : left and right operand of the product
//   scalar : scaling factor
7959 template<
typename MT3
7963 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
7964 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7966 const ResultType tmp(
serial( A * B * scalar ) );
7967 addAssign( C, tmp );
// Default addition assignment kernel for a row-major target (DenseMatrix<MT3,false>) when
// only the right operand B is diagonal.
// Each visited element is updated as C(i,j) += A(i,j) * B(j,j) * scalar. The traversal is
// tiled in blocks of BLOCK_SIZE rows and columns.
//   C      : target matrix (accessed through ~C)
//   A      : non-diagonal left operand
//   B      : diagonal right operand (only B(j,j) is read)
//   scalar : scaling factor
7985 template<
typename MT3
7989 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
7990 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
7992 const size_t M( A.rows() );
7993 const size_t N( B.columns() );
7995 const size_t block( BLOCK_SIZE );
7997 for(
size_t ii=0UL; ii<M; ii+=block ) {
7998 const size_t iend(
min( M, ii+block ) );
7999 for(
size_t jj=0UL; jj<N; jj+=block ) {
8000 const size_t jend(
min( N, jj+block ) );
8001 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current tile, narrowed when A is (strictly) upper or
// lower. The alternative branches of both conditionals are on lines not shown here.
8003 const size_t jbegin( ( IsUpper<MT4>::value )
8004 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
8006 const size_t jpos( ( IsLower<MT4>::value )
8007 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
8010 for(
size_t j=jbegin; j<jpos; ++j ) {
8011 (~C)(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition assignment kernel for a column-major target (DenseMatrix<MT3,true>) when
// only the right operand B is diagonal.
// Column j of C is updated as C(i,j) += A(i,j) * B(j,j) * scalar, two rows per iteration.
//   C      : target matrix (accessed through ~C)
//   A      : non-diagonal left operand
//   B      : diagonal right operand (only B(j,j) is read)
//   scalar : scaling factor
8033 template<
typename MT3
8037 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
8038 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8040 const size_t M( A.rows() );
8041 const size_t N( B.columns() );
8043 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed when A is (strictly) lower or upper. The alternative
// branches of both conditionals are on lines not shown here.
8045 const size_t ibegin( ( IsLower<MT4>::value )
8046 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
8048 const size_t iend( ( IsUpper<MT4>::value )
8049 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos = ibegin plus the row count with its lowest bit cleared, i.e. the end of the part
// that can be processed in pairs.
8053 const size_t inum( iend - ibegin );
8054 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
8056 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
8057 (~C)(i ,j) += A(i ,j) * B(j,j) * scalar;
8058 (~C)(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the element at row ipos. NOTE(review): the guard for an odd row count is not
// visible in this excerpt - confirm it exists in the full source.
8061 (~C)(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition assignment kernel for a row-major target (DenseMatrix<MT3,false>) when
// only the left operand A is diagonal.
// Row i of C is updated as C(i,j) += A(i,i) * B(i,j) * scalar, two columns per iteration.
//   C      : target matrix (accessed through ~C)
//   A      : diagonal left operand (only A(i,i) is read)
//   B      : non-diagonal right operand
//   scalar : scaling factor
8081 template<
typename MT3
8085 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
8086 selectDefaultAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8088 const size_t M( A.rows() );
8089 const size_t N( B.columns() );
8091 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when B is (strictly) upper or lower. The alternative
// branches of both conditionals are on lines not shown here.
8093 const size_t jbegin( ( IsUpper<MT5>::value )
8094 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
8096 const size_t jend( ( IsLower<MT5>::value )
8097 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos = jbegin plus the column count with its lowest bit cleared, i.e. the end of the
// part that can be processed in pairs.
8101 const size_t jnum( jend - jbegin );
8102 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
8104 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
8105 (~C)(i,j ) += A(i,i) * B(i,j ) * scalar;
8106 (~C)(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the element at column jpos. NOTE(review): the guard for an odd column count is
// not visible in this excerpt - confirm it exists in the full source.
8109 (~C)(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition assignment kernel for a column-major target (DenseMatrix<MT3,true>) when
// only the left operand A is diagonal.
// Each visited element is updated as C(i,j) += A(i,i) * B(i,j) * scalar. The traversal is
// tiled in blocks of BLOCK_SIZE columns and rows.
//   C      : target matrix (accessed through ~C)
//   A      : diagonal left operand (only A(i,i) is read)
//   B      : non-diagonal right operand
//   scalar : scaling factor
8129 template<
typename MT3
8133 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
8134 selectDefaultAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8136 const size_t M( A.rows() );
8137 const size_t N( B.columns() );
8139 const size_t block( BLOCK_SIZE );
8141 for(
size_t jj=0UL; jj<N; jj+=block ) {
8142 const size_t jend(
min( N, jj+block ) );
8143 for(
size_t ii=0UL; ii<M; ii+=block ) {
8144 const size_t iend(
min( M, ii+block ) );
8145 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current tile, narrowed when B is (strictly) lower or
// upper. The alternative branches of both conditionals are on lines not shown here.
8147 const size_t ibegin( ( IsLower<MT5>::value )
8148 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
8150 const size_t ipos( ( IsUpper<MT5>::value )
8151 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
8154 for(
size_t i=ibegin; i<ipos; ++i ) {
8155 (~C)(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition assignment kernel for the case that both operands are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row i of A.
//   C      : target matrix
//   A, B   : diagonal left and right operand
//   scalar : scaling factor
8177 template<
typename MT3
8181 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
8182 selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8184 for(
size_t i=0UL; i<A.rows(); ++i ) {
8185 C(i,i) += A(i,i) * B(i,i) * scalar;
// Fallback overload of the small-matrix addition assignment kernel.
// It is chosen when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_)
// and simply forwards all arguments to selectDefaultAddAssignKernel.
//   C      : target matrix
//   A, B   : left and right operand of the product
//   scalar : scaling factor
8204 template<
typename MT3
8208 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8209 selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8211 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix addition assignment kernel for a row-major target
// (DenseMatrix<MT3,false>); only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// C is processed in column panels of 8, 4, 2 and 1 SIMD vectors. Elements of A are
// broadcast with set(), B is read with SIMD loads, and the accumulated sums are multiplied
// by 'factor' and added to the values loaded from C before being stored back.
//   C      : target matrix (accessed through ~C)
//   A, B   : left and right operand of the product
//   scalar : scaling factor applied to every accumulated sum
// NOTE(review): this excerpt omits some original lines (braces, the declarations of the
// running indices and accumulators, some guards); the comments describe only what is visible.
8230 template<
typename MT3
8234 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8235 selectSmallAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8237 const size_t M( A.rows() );
8238 const size_t N( B.columns() );
8239 const size_t K( A.columns() );
// Element-wise clean-up is needed unless both C and B are padded.
8241 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// With remainder set, jpos is N with the low bits cleared by the mask size_t(-SIMDSIZE)
// (N rounded down to a multiple of SIMDSIZE if SIMDSIZE is a power of two); otherwise N.
8243 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Scalar broadcast into a SIMD value.
8246 const SIMDType factor(
set( scalar ) );
// Column panels of eight SIMD vectors, one row of C per inner iteration.
8250 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
8251 for(
size_t i=0UL; i<M; ++i )
// The k range is narrowed according to the (strictly) upper/lower traits of A (MT4) and the
// lower/upper traits of B (MT5); with no such trait it is [0,K).
8253 const size_t kbegin( ( IsUpper<MT4>::value )
8254 ?( ( IsLower<MT5>::value )
8255 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8256 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8257 :( IsLower<MT5>::value ? j : 0UL ) );
8258 const size_t kend( ( IsLower<MT4>::value )
8259 ?( ( IsUpper<MT5>::value )
8260 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
8261 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
8262 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
8264 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8266 for(
size_t k=kbegin; k<kend; ++k ) {
8267 const SIMDType a1(
set( A(i,k) ) );
8268 xmm1 = xmm1 + a1 * B.load(k,j );
8269 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
8270 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
8271 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
8272 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
8273 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
8274 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
8275 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
// Add the scaled sums to the current content of C.
8278 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8279 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8280 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8281 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
8282 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) + xmm5 * factor );
8283 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) + xmm6 * factor );
8284 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) + xmm7 * factor );
8285 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) + xmm8 * factor );
// Column panels of four SIMD vectors.
8289 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Two rows per step: eight accumulators (2 rows x 4 column vectors).
8293 for( ; (i+2UL) <= M; i+=2UL )
8295 const size_t kbegin( ( IsUpper<MT4>::value )
8296 ?( ( IsLower<MT5>::value )
8297 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8298 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8299 :( IsLower<MT5>::value ? j : 0UL ) );
8300 const size_t kend( ( IsLower<MT4>::value )
8301 ?( ( IsUpper<MT5>::value )
8302 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
8303 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8304 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
8306 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8308 for(
size_t k=kbegin; k<kend; ++k ) {
8309 const SIMDType a1(
set( A(i ,k) ) );
8310 const SIMDType a2(
set( A(i+1UL,k) ) );
8311 const SIMDType b1( B.load(k,j ) );
8312 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
8313 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
8314 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
8315 xmm1 = xmm1 + a1 * b1;
8316 xmm2 = xmm2 + a1 * b2;
8317 xmm3 = xmm3 + a1 * b3;
8318 xmm4 = xmm4 + a1 * b4;
8319 xmm5 = xmm5 + a2 * b1;
8320 xmm6 = xmm6 + a2 * b2;
8321 xmm7 = xmm7 + a2 * b3;
8322 xmm8 = xmm8 + a2 * b4;
8325 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8326 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) + xmm2 * factor );
8327 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) + xmm3 * factor );
8328 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) + xmm4 * factor );
8329 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8330 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) + xmm6 * factor );
8331 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) + xmm7 * factor );
8332 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) + xmm8 * factor );
// Single-row variant of the four-vector panel (four accumulators).
8337 const size_t kbegin( ( IsUpper<MT4>::value )
8338 ?( ( IsLower<MT5>::value )
8339 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8340 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8341 :( IsLower<MT5>::value ? j : 0UL ) );
8342 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
8344 SIMDType xmm1, xmm2, xmm3, xmm4;
8346 for(
size_t k=kbegin; k<kend; ++k ) {
8347 const SIMDType a1(
set( A(i,k) ) );
8348 xmm1 = xmm1 + a1 * B.load(k,j );
8349 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
8350 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
8351 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
8354 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8355 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) + xmm2 * factor );
8356 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) + xmm3 * factor );
8357 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) + xmm4 * factor );
// Column panels of two SIMD vectors.
8361 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Two rows per step: four accumulators.
8365 for( ; (i+2UL) <= M; i+=2UL )
8367 const size_t kbegin( ( IsUpper<MT4>::value )
8368 ?( ( IsLower<MT5>::value )
8369 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8370 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8371 :( IsLower<MT5>::value ? j : 0UL ) );
8372 const size_t kend( ( IsLower<MT4>::value )
8373 ?( ( IsUpper<MT5>::value )
8374 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
8375 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
8376 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
8378 SIMDType xmm1, xmm2, xmm3, xmm4;
8380 for(
size_t k=kbegin; k<kend; ++k ) {
8381 const SIMDType a1(
set( A(i ,k) ) );
8382 const SIMDType a2(
set( A(i+1UL,k) ) );
8383 const SIMDType b1( B.load(k,j ) );
8384 const SIMDType b2( B.load(k,j+SIMDSIZE) );
8385 xmm1 = xmm1 + a1 * b1;
8386 xmm2 = xmm2 + a1 * b2;
8387 xmm3 = xmm3 + a2 * b1;
8388 xmm4 = xmm4 + a2 * b2;
8391 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8392 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) + xmm2 * factor );
8393 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8394 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) + xmm4 * factor );
// Single-row variant of the two-vector panel.
8399 const size_t kbegin( ( IsUpper<MT4>::value )
8400 ?( ( IsLower<MT5>::value )
8401 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8402 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8403 :( IsLower<MT5>::value ? j : 0UL ) );
8404 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
8406 SIMDType xmm1, xmm2;
8408 for(
size_t k=kbegin; k<kend; ++k ) {
8409 const SIMDType a1(
set( A(i,k) ) );
8410 xmm1 = xmm1 + a1 * B.load(k,j );
8411 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
8414 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8415 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) + xmm2 * factor );
// One SIMD vector of columns per step.
8419 for( ; j<jpos; j+=SIMDSIZE )
// Two rows per step: two accumulators.
8423 for( ; (i+2UL) <= M; i+=2UL )
8425 const size_t kbegin( ( IsUpper<MT4>::value )
8426 ?( ( IsLower<MT5>::value )
8427 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8428 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8429 :( IsLower<MT5>::value ? j : 0UL ) );
// The alternative branch of this conditional is on a line not shown here.
8430 const size_t kend( ( IsLower<MT4>::value )
8431 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8434 SIMDType xmm1, xmm2;
8436 for(
size_t k=kbegin; k<kend; ++k ) {
8437 const SIMDType b1( B.load(k,j) );
8438 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
8439 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
8442 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8443 (~C).store( i+1UL, j, (~C).load(i+1UL,j) + xmm2 * factor );
// Single-row variant; the k loop runs up to K.
8448 const size_t kbegin( ( IsUpper<MT4>::value )
8449 ?( ( IsLower<MT5>::value )
8450 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8451 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8452 :( IsLower<MT5>::value ? j : 0UL ) );
8456 for(
size_t k=kbegin; k<K; ++k ) {
8457 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
8460 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Element-wise clean-up for the remaining columns up to N; runs only when remainder is true.
8464 for( ; remainder && j<N; ++j )
// Two rows per step with the scalar accumulators value1 and value2.
8468 for( ; (i+2UL) <= M; i+=2UL )
8470 const size_t kbegin( ( IsUpper<MT4>::value )
8471 ?( ( IsLower<MT5>::value )
8472 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8473 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8474 :( IsLower<MT5>::value ? j : 0UL ) );
8475 const size_t kend( ( IsLower<MT4>::value )
8476 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
8482 for(
size_t k=kbegin; k<kend; ++k ) {
8483 value1 += A(i ,k) * B(k,j);
8484 value2 += A(i+1UL,k) * B(k,j);
8487 (~C)(i ,j) += value1 * scalar;
8488 (~C)(i+1UL,j) += value2 * scalar;
// Single-row variant of the element-wise clean-up.
8493 const size_t kbegin( ( IsUpper<MT4>::value )
8494 ?( ( IsLower<MT5>::value )
8495 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
8496 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
8497 :( IsLower<MT5>::value ? j : 0UL ) );
8501 for(
size_t k=kbegin; k<K; ++k ) {
8502 value += A(i,k) * B(k,j);
8505 (~C)(i,j) += value * scalar;
// Vectorized small-matrix addition assignment kernel for a column-major target
// (DenseMatrix<MT3,true>); only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
// C is processed in row panels of 8, 4, 2 and 1 SIMD vectors. A is read with SIMD loads,
// elements of B are broadcast with set(), and the accumulated sums are multiplied by
// 'factor' and added to the values loaded from C before being stored back.
//   C      : target matrix (accessed through ~C)
//   A, B   : left and right operand of the product
//   scalar : scaling factor applied to every accumulated sum
// NOTE(review): this excerpt omits some original lines (braces, the declarations of the
// running indices and accumulators, some guards); the comments describe only what is visible.
8526 template<
typename MT3
8530 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8531 selectSmallAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
8533 const size_t M( A.rows() );
8534 const size_t N( B.columns() );
8535 const size_t K( A.columns() );
// Element-wise clean-up is needed unless both C and A are padded.
8537 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// With remainder set, ipos is M with the low bits cleared by the mask size_t(-SIMDSIZE)
// (M rounded down to a multiple of SIMDSIZE if SIMDSIZE is a power of two); otherwise M.
8539 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
// Scalar broadcast into a SIMD value.
8542 const SIMDType factor(
set( scalar ) );
// Row panels of eight SIMD vectors, one column of C per inner iteration.
8546 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
8547 for(
size_t j=0UL; j<N; ++j )
// The k range is narrowed according to the (strictly) lower/upper traits of B (MT5) and the
// upper/lower traits of A (MT4); with no such trait it is [0,K).
8549 const size_t kbegin( ( IsLower<MT5>::value )
8550 ?( ( IsUpper<MT4>::value )
8551 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8552 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8553 :( IsUpper<MT4>::value ? i : 0UL ) );
8554 const size_t kend( ( IsUpper<MT5>::value )
8555 ?( ( IsLower<MT4>::value )
8556 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
8557 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
8558 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
8560 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8562 for(
size_t k=kbegin; k<kend; ++k ) {
8563 const SIMDType b1(
set( B(k,j) ) );
8564 xmm1 = xmm1 + A.load(i ,k) * b1;
8565 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
8566 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
8567 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
8568 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
8569 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
8570 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
8571 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
// Add the scaled sums to the current content of C.
8574 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8575 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8576 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8577 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
8578 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) + xmm5 * factor );
8579 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) + xmm6 * factor );
8580 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) + xmm7 * factor );
8581 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) + xmm8 * factor );
// Row panels of four SIMD vectors.
8585 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
// Two columns per step: eight accumulators (4 row vectors x 2 columns).
8589 for( ; (j+2UL) <= N; j+=2UL )
8591 const size_t kbegin( ( IsLower<MT5>::value )
8592 ?( ( IsUpper<MT4>::value )
8593 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8594 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8595 :( IsUpper<MT4>::value ? i : 0UL ) );
8596 const size_t kend( ( IsUpper<MT5>::value )
8597 ?( ( IsLower<MT4>::value )
8598 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8599 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8600 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
8602 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8604 for(
size_t k=kbegin; k<kend; ++k ) {
8605 const SIMDType a1( A.load(i ,k) );
8606 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
8607 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
8608 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
8609 const SIMDType b1(
set( B(k,j ) ) );
8610 const SIMDType b2(
set( B(k,j+1UL) ) );
8611 xmm1 = xmm1 + a1 * b1;
8612 xmm2 = xmm2 + a2 * b1;
8613 xmm3 = xmm3 + a3 * b1;
8614 xmm4 = xmm4 + a4 * b1;
8615 xmm5 = xmm5 + a1 * b2;
8616 xmm6 = xmm6 + a2 * b2;
8617 xmm7 = xmm7 + a3 * b2;
8618 xmm8 = xmm8 + a4 * b2;
8621 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8622 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) + xmm2 * factor );
8623 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) + xmm3 * factor );
8624 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) + xmm4 * factor );
8625 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
8626 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) + xmm6 * factor );
8627 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
8628 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single-column variant of the four-vector panel (four accumulators).
8633 const size_t kbegin( ( IsLower<MT5>::value )
8634 ?( ( IsUpper<MT4>::value )
8635 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8636 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8637 :( IsUpper<MT4>::value ? i : 0UL ) );
8638 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
8640 SIMDType xmm1, xmm2, xmm3, xmm4;
8642 for(
size_t k=kbegin; k<kend; ++k ) {
8643 const SIMDType b1(
set( B(k,j) ) );
8644 xmm1 = xmm1 + A.load(i ,k) * b1;
8645 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
8646 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
8647 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
8650 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8651 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) + xmm2 * factor );
8652 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) + xmm3 * factor );
8653 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) + xmm4 * factor );
// Row panels of two SIMD vectors.
8657 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
// Two columns per step: four accumulators.
8661 for( ; (j+2UL) <= N; j+=2UL )
8663 const size_t kbegin( ( IsLower<MT5>::value )
8664 ?( ( IsUpper<MT4>::value )
8665 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8666 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8667 :( IsUpper<MT4>::value ? i : 0UL ) );
8668 const size_t kend( ( IsUpper<MT5>::value )
8669 ?( ( IsLower<MT4>::value )
8670 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
8671 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
8672 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
8674 SIMDType xmm1, xmm2, xmm3, xmm4;
8676 for(
size_t k=kbegin; k<kend; ++k ) {
8677 const SIMDType a1( A.load(i ,k) );
8678 const SIMDType a2( A.load(i+SIMDSIZE,k) );
8679 const SIMDType b1(
set( B(k,j ) ) );
8680 const SIMDType b2(
set( B(k,j+1UL) ) );
8681 xmm1 = xmm1 + a1 * b1;
8682 xmm2 = xmm2 + a2 * b1;
8683 xmm3 = xmm3 + a1 * b2;
8684 xmm4 = xmm4 + a2 * b2;
8687 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8688 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) + xmm2 * factor );
8689 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
8690 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) + xmm4 * factor );
// Single-column variant of the two-vector panel.
8695 const size_t kbegin( ( IsLower<MT5>::value )
8696 ?( ( IsUpper<MT4>::value )
8697 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8698 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8699 :( IsUpper<MT4>::value ? i : 0UL ) );
8700 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
8702 SIMDType xmm1, xmm2;
8704 for(
size_t k=kbegin; k<kend; ++k ) {
8705 const SIMDType b1(
set( B(k,j) ) );
8706 xmm1 = xmm1 + A.load(i ,k) * b1;
8707 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
8710 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
8711 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) + xmm2 * factor );
// One SIMD vector of rows per step.
8715 for( ; i<ipos; i+=SIMDSIZE )
// Two columns per step: two accumulators.
8719 for( ; (j+2UL) <= N; j+=2UL )
8721 const size_t kbegin( ( IsLower<MT5>::value )
8722 ?( ( IsUpper<MT4>::value )
8723 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8724 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8725 :( IsUpper<MT4>::value ? i : 0UL ) );
// The alternative branch of this conditional is on a line not shown here.
8726 const size_t kend( ( IsUpper<MT5>::value )
8727 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8730 SIMDType xmm1, xmm2;
8732 for(
size_t k=kbegin; k<kend; ++k ) {
8733 const SIMDType a1( A.load(i,k) );
8734 xmm1 = xmm1 + a1 *
set( B(k,j ) );
8735 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
8738 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8739 (~C).store( i, j+1UL, (~C).load(i,j+1UL) + xmm2 * factor );
// Single-column variant; the k loop runs up to K.
8744 const size_t kbegin( ( IsLower<MT5>::value )
8745 ?( ( IsUpper<MT4>::value )
8746 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8747 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8748 :( IsUpper<MT4>::value ? i : 0UL ) );
8752 for(
size_t k=kbegin; k<K; ++k ) {
8753 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
8756 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Element-wise clean-up for the remaining rows up to M; runs only when remainder is true.
8760 for( ; remainder && i<M; ++i )
// Two columns per step with the scalar accumulators value1 and value2.
8764 for( ; (j+2UL) <= N; j+=2UL )
8766 const size_t kbegin( ( IsLower<MT5>::value )
8767 ?( ( IsUpper<MT4>::value )
8768 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8769 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8770 :( IsUpper<MT4>::value ? i : 0UL ) );
8771 const size_t kend( ( IsUpper<MT5>::value )
8772 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
8778 for(
size_t k=kbegin; k<kend; ++k ) {
8779 value1 += A(i,k) * B(k,j );
8780 value2 += A(i,k) * B(k,j+1UL);
8783 (~C)(i,j ) += value1 * scalar;
8784 (~C)(i,j+1UL) += value2 * scalar;
// Single-column variant of the element-wise clean-up.
8789 const size_t kbegin( ( IsLower<MT5>::value )
8790 ?( ( IsUpper<MT4>::value )
8791 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
8792 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
8793 :( IsUpper<MT4>::value ? i : 0UL ) );
8797 for(
size_t k=kbegin; k<K; ++k ) {
8798 value += A(i,k) * B(k,j);
8801 (~C)(i,j) += value * scalar;
// Fallback overload of the large-matrix addition assignment kernel.
// It is chosen when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not hold (DisableIf_)
// and simply forwards all arguments to selectDefaultAddAssignKernel.
//   C      : target matrix
//   A, B   : left and right operand of the product
//   scalar : scaling factor
8821 template<
typename MT3
8825 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8826 selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8828 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized overload of selectLargeAddAssignKernel for a target passed as
// DenseMatrix<MT3,false> (presumably the row-major case -- confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// For every (jj, ii, kk) block it accumulates products A(i,k) * B(k,j) over k and
// updates C(i,j) with "C + accumulator * scalar". SIMD loads run along j on B and C,
// while single elements of A are broadcast with set().
// NOTE(review): the declarations of the running indices i/j and of the scalar
// accumulator 'value' are not visible in this listing; the SIMD accumulators are
// default-constructed and presumably start at zero -- confirm.
8847 template<
typename MT3
8851 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
8852 selectLargeAddAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the summation index k runs over K = A.columns().
8854 const size_t M( A.rows() );
8855 const size_t N( B.columns() );
8856 const size_t K( A.columns() );
// A scalar clean-up loop is required whenever C (MT3) or B (MT5) is not padded.
8858 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// Scalar factor broadcast to all SIMD lanes.
8860 const SIMDType factor(
set( scalar ) );
// Outermost blocking over the columns of C.
8862 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
8864 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// End of the SIMD-processed column range: with 'remainder' set, jend is masked with
// size_t(-SIMDSIZE); the assertion below checks that this equals
// jend - ( jend % SIMDSIZE ).
8866 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
8867 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Blocking over the rows of C.
8869 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
8871 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Blocking over the summation index; [kk,ktmp) is the current k-block.
8873 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
8875 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Column strips that are four SIMD vectors wide (offsets j, j1, j2, j3).
8879 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
8881 const size_t j1( j+SIMDSIZE );
8882 const size_t j2( j+SIMDSIZE*2UL );
8883 const size_t j3( j+SIMDSIZE*3UL );
// Two rows at a time: 2 x 4 SIMD accumulators.
8887 for( ; (i+2UL) <= iend; i+=2UL )
// k range of this tile: it starts at kk, raised to i if IsUpper<MT4> and to j if
// IsLower<MT5>; it ends at ktmp, lowered to i+2 if IsLower<MT4> and to
// j+SIMDSIZE*4 if IsUpper<MT5>.
8889 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8890 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8891 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8892 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
8894 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8896 for(
size_t k=kbegin; k<kend; ++k ) {
8897 const SIMDType a1(
set( A(i ,k) ) );
8898 const SIMDType a2(
set( A(i+1UL,k) ) );
8899 const SIMDType b1( B.load(k,j ) );
8900 const SIMDType b2( B.load(k,j1) );
8901 const SIMDType b3( B.load(k,j2) );
8902 const SIMDType b4( B.load(k,j3) );
8903 xmm1 = xmm1 + a1 * b1;
8904 xmm2 = xmm2 + a1 * b2;
8905 xmm3 = xmm3 + a1 * b3;
8906 xmm4 = xmm4 + a1 * b4;
8907 xmm5 = xmm5 + a2 * b1;
8908 xmm6 = xmm6 + a2 * b2;
8909 xmm7 = xmm7 + a2 * b3;
8910 xmm8 = xmm8 + a2 * b4;
// Add the scaled accumulators onto the current contents of C.
8913 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8914 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8915 (~C).store( i , j2, (~C).load(i ,j2) + xmm3 * factor );
8916 (~C).store( i , j3, (~C).load(i ,j3) + xmm4 * factor );
8917 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm5 * factor );
8918 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm6 * factor );
8919 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) + xmm7 * factor );
8920 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) + xmm8 * factor );
// Single-row variant of the tile above (kend uses i+1 instead of i+2); the condition
// guarding it is not visible in this listing.
8925 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8926 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8927 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
8928 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
8930 SIMDType xmm1, xmm2, xmm3, xmm4;
8932 for(
size_t k=kbegin; k<kend; ++k ) {
8933 const SIMDType a1(
set( A(i,k) ) );
8934 xmm1 = xmm1 + a1 * B.load(k,j );
8935 xmm2 = xmm2 + a1 * B.load(k,j1);
8936 xmm3 = xmm3 + a1 * B.load(k,j2);
8937 xmm4 = xmm4 + a1 * B.load(k,j3);
8940 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
8941 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
8942 (~C).store( i, j2, (~C).load(i,j2) + xmm3 * factor );
8943 (~C).store( i, j3, (~C).load(i,j3) + xmm4 * factor );
// Column strips that are two SIMD vectors wide (offsets j and j1).
8947 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
8949 const size_t j1( j+SIMDSIZE );
// Four rows at a time: 4 x 2 SIMD accumulators.
8953 for( ; (i+4UL) <= iend; i+=4UL )
8955 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8956 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8957 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
8958 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
8960 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8962 for(
size_t k=kbegin; k<kend; ++k ) {
8963 const SIMDType a1(
set( A(i ,k) ) );
8964 const SIMDType a2(
set( A(i+1UL,k) ) );
8965 const SIMDType a3(
set( A(i+2UL,k) ) );
8966 const SIMDType a4(
set( A(i+3UL,k) ) );
8967 const SIMDType b1( B.load(k,j ) );
8968 const SIMDType b2( B.load(k,j1) );
8969 xmm1 = xmm1 + a1 * b1;
8970 xmm2 = xmm2 + a1 * b2;
8971 xmm3 = xmm3 + a2 * b1;
8972 xmm4 = xmm4 + a2 * b2;
8973 xmm5 = xmm5 + a3 * b1;
8974 xmm6 = xmm6 + a3 * b2;
8975 xmm7 = xmm7 + a4 * b1;
8976 xmm8 = xmm8 + a4 * b2;
8979 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
8980 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
8981 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
8982 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
8983 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) + xmm5 * factor );
8984 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) + xmm6 * factor );
8985 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) + xmm7 * factor );
8986 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) + xmm8 * factor );
// Two rows at a time: 2 x 2 SIMD accumulators.
8989 for( ; (i+2UL) <= iend; i+=2UL )
8991 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
8992 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
8993 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
8994 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
8996 SIMDType xmm1, xmm2, xmm3, xmm4;
8998 for(
size_t k=kbegin; k<kend; ++k ) {
8999 const SIMDType a1(
set( A(i ,k) ) );
9000 const SIMDType a2(
set( A(i+1UL,k) ) );
9001 const SIMDType b1( B.load(k,j ) );
9002 const SIMDType b2( B.load(k,j1) );
9003 xmm1 = xmm1 + a1 * b1;
9004 xmm2 = xmm2 + a1 * b2;
9005 xmm3 = xmm3 + a2 * b1;
9006 xmm4 = xmm4 + a2 * b2;
9009 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9010 (~C).store( i , j1, (~C).load(i ,j1) + xmm2 * factor );
9011 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) + xmm3 * factor );
9012 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) + xmm4 * factor );
// Single-row variant for the two-vector strip (guarding condition not visible here).
9017 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9018 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9019 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9020 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
9022 SIMDType xmm1, xmm2;
9024 for(
size_t k=kbegin; k<kend; ++k ) {
9025 const SIMDType a1(
set( A(i,k) ) );
9026 xmm1 = xmm1 + a1 * B.load(k,j );
9027 xmm2 = xmm2 + a1 * B.load(k,j1);
9030 (~C).store( i, j , (~C).load(i,j ) + xmm1 * factor );
9031 (~C).store( i, j1, (~C).load(i,j1) + xmm2 * factor );
// Column strips that are a single SIMD vector wide, one row at a time.
9035 for( ; j<jpos; j+=SIMDSIZE )
9037 for(
size_t i=ii; i<iend; ++i )
9039 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9040 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9041 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9042 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
9046 for(
size_t k=kbegin; k<kend; ++k ) {
9047 const SIMDType a1(
set( A(i,k) ) );
9048 xmm1 = xmm1 + a1 * B.load(k,j);
9051 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar clean-up for the columns in [jpos,jend); only runs when 'remainder' is set.
9055 for( ; remainder && j<jend; ++j )
9057 for(
size_t i=ii; i<iend; ++i )
9059 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9060 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9061 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
9062 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
9066 for(
size_t k=kbegin; k<kend; ++k ) {
9067 value += A(i,k) * B(k,j);
9070 (~C)(i,j) += value * scalar;
// Vectorized overload of selectLargeAddAssignKernel for a target passed as
// DenseMatrix<MT3,true> (presumably the column-major case -- confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// For every (ii, jj, kk) block it accumulates products A(i,k) * B(k,j) over k and
// updates C(i,j) with "C + accumulator * scalar". SIMD loads run along i on A and C,
// while single elements of B are broadcast with set().
// NOTE(review): the declarations of the running indices i/j and of the scalar
// accumulator 'value' are not visible in this listing; the SIMD accumulators are
// default-constructed and presumably start at zero -- confirm.
9094 template<
typename MT3
9098 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
9099 selectLargeAddAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the summation index k runs over K = A.columns().
9101 const size_t M( A.rows() );
9102 const size_t N( B.columns() );
9103 const size_t K( A.columns() );
// A scalar clean-up loop is required whenever C (MT3) or A (MT4) is not padded.
9105 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// Scalar factor broadcast to all SIMD lanes.
9107 const SIMDType factor(
set( scalar ) );
// Outermost blocking over the rows of C.
9109 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
9111 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// End of the SIMD-processed row range: with 'remainder' set, iend is masked with
// size_t(-SIMDSIZE); the assertion below checks that this equals
// iend - ( iend % SIMDSIZE ).
9113 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
9114 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Blocking over the columns of C.
9116 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
9118 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Blocking over the summation index; [kk,ktmp) is the current k-block.
9120 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
9122 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// Row strips that are four SIMD vectors tall (offsets i, i1, i2, i3).
9126 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
9128 const size_t i1( i+SIMDSIZE );
9129 const size_t i2( i+SIMDSIZE*2UL );
9130 const size_t i3( i+SIMDSIZE*3UL );
// Two columns at a time: 4 x 2 SIMD accumulators.
9134 for( ; (j+2UL) <= jend; j+=2UL )
// k range of this tile: it starts at kk, raised to i if IsUpper<MT4> and to j if
// IsLower<MT5>; it ends at ktmp, lowered to i+SIMDSIZE*4 if IsLower<MT4> and to
// j+2 if IsUpper<MT5>.
9136 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9137 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9138 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
9139 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9141 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9143 for(
size_t k=kbegin; k<kend; ++k ) {
9144 const SIMDType a1( A.load(i ,k) );
9145 const SIMDType a2( A.load(i1,k) );
9146 const SIMDType a3( A.load(i2,k) );
9147 const SIMDType a4( A.load(i3,k) );
9148 const SIMDType b1(
set( B(k,j ) ) );
9149 const SIMDType b2(
set( B(k,j+1UL) ) );
9150 xmm1 = xmm1 + a1 * b1;
9151 xmm2 = xmm2 + a2 * b1;
9152 xmm3 = xmm3 + a3 * b1;
9153 xmm4 = xmm4 + a4 * b1;
9154 xmm5 = xmm5 + a1 * b2;
9155 xmm6 = xmm6 + a2 * b2;
9156 xmm7 = xmm7 + a3 * b2;
9157 xmm8 = xmm8 + a4 * b2;
// Add the scaled accumulators onto the current contents of C.
9160 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9161 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9162 (~C).store( i2, j , (~C).load(i2,j ) + xmm3 * factor );
9163 (~C).store( i3, j , (~C).load(i3,j ) + xmm4 * factor );
9164 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm5 * factor );
9165 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm6 * factor );
9166 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) + xmm7 * factor );
9167 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) + xmm8 * factor );
// Single-column variant of the tile above (kend uses j+1 instead of j+2); the
// condition guarding it is not visible in this listing.
9172 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9173 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9174 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
9175 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9177 SIMDType xmm1, xmm2, xmm3, xmm4;
9179 for(
size_t k=kbegin; k<kend; ++k ) {
9180 const SIMDType b1(
set( B(k,j) ) );
9181 xmm1 = xmm1 + A.load(i ,k) * b1;
9182 xmm2 = xmm2 + A.load(i1,k) * b1;
9183 xmm3 = xmm3 + A.load(i2,k) * b1;
9184 xmm4 = xmm4 + A.load(i3,k) * b1;
9187 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9188 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
9189 (~C).store( i2, j, (~C).load(i2,j) + xmm3 * factor );
9190 (~C).store( i3, j, (~C).load(i3,j) + xmm4 * factor );
// Row strips that are two SIMD vectors tall (offsets i and i1).
9194 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
9196 const size_t i1( i+SIMDSIZE );
// Four columns at a time: 2 x 4 SIMD accumulators.
9200 for( ; (j+4UL) <= jend; j+=4UL )
9202 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9203 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9204 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
9205 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
9207 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9209 for(
size_t k=kbegin; k<kend; ++k ) {
9210 const SIMDType a1( A.load(i ,k) );
9211 const SIMDType a2( A.load(i1,k) );
9212 const SIMDType b1(
set( B(k,j ) ) );
9213 const SIMDType b2(
set( B(k,j+1UL) ) );
9214 const SIMDType b3(
set( B(k,j+2UL) ) );
9215 const SIMDType b4(
set( B(k,j+3UL) ) );
9216 xmm1 = xmm1 + a1 * b1;
9217 xmm2 = xmm2 + a2 * b1;
9218 xmm3 = xmm3 + a1 * b2;
9219 xmm4 = xmm4 + a2 * b2;
9220 xmm5 = xmm5 + a1 * b3;
9221 xmm6 = xmm6 + a2 * b3;
9222 xmm7 = xmm7 + a1 * b4;
9223 xmm8 = xmm8 + a2 * b4;
9226 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9227 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9228 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9229 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
9230 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) + xmm5 * factor );
9231 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) + xmm6 * factor );
9232 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) + xmm7 * factor );
9233 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) + xmm8 * factor );
// Two columns at a time: 2 x 2 SIMD accumulators.
9236 for( ; (j+2UL) <= jend; j+=2UL )
9238 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9239 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9240 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
9241 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
9243 SIMDType xmm1, xmm2, xmm3, xmm4;
9245 for(
size_t k=kbegin; k<kend; ++k ) {
9246 const SIMDType a1( A.load(i ,k) );
9247 const SIMDType a2( A.load(i1,k) );
9248 const SIMDType b1(
set( B(k,j ) ) );
9249 const SIMDType b2(
set( B(k,j+1UL) ) );
9250 xmm1 = xmm1 + a1 * b1;
9251 xmm2 = xmm2 + a2 * b1;
9252 xmm3 = xmm3 + a1 * b2;
9253 xmm4 = xmm4 + a2 * b2;
9256 (~C).store( i , j , (~C).load(i ,j ) + xmm1 * factor );
9257 (~C).store( i1, j , (~C).load(i1,j ) + xmm2 * factor );
9258 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) + xmm3 * factor );
9259 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) + xmm4 * factor );
// Single-column variant for the two-vector strip (guarding condition not visible here).
9264 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9265 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9266 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
9267 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9269 SIMDType xmm1, xmm2;
9271 for(
size_t k=kbegin; k<kend; ++k ) {
9272 const SIMDType b1(
set( B(k,j) ) );
9273 xmm1 = xmm1 + A.load(i ,k) * b1;
9274 xmm2 = xmm2 + A.load(i1,k) * b1;
9277 (~C).store( i , j, (~C).load(i ,j) + xmm1 * factor );
9278 (~C).store( i1, j, (~C).load(i1,j) + xmm2 * factor );
// Row strips that are a single SIMD vector tall, one column at a time.
9282 for( ; i<ipos; i+=SIMDSIZE )
9284 for(
size_t j=jj; j<jend; ++j )
9286 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9287 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9288 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
9289 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9293 for(
size_t k=kbegin; k<kend; ++k ) {
9294 const SIMDType b1(
set( B(k,j) ) );
9295 xmm1 = xmm1 + A.load(i,k) * b1;
9298 (~C).store( i, j, (~C).load(i,j) + xmm1 * factor );
// Scalar clean-up for the rows in [ipos,iend); only runs when 'remainder' is set.
9302 for( ; remainder && i<iend; ++i )
9304 for(
size_t j=jj; j<jend; ++j )
9306 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
9307 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
9308 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
9309 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
9313 for(
size_t k=kbegin; k<kend; ++k ) {
9314 value += A(i,k) * B(k,j);
9317 (~C)(i,j) += value * scalar;
// Fallback overload of selectBlasAddAssignKernel. It is selected (via DisableIf_)
// when UseBlasKernel<MT3,MT4,MT5,ST2> does not apply and only forwards C, A, B and
// scalar unchanged to selectLargeAddAssignKernel.
9340 template<
typename MT3
9344 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
9345 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9347 selectLargeAddAssignKernel( C, A, B, scalar );
9352 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-backed overload of selectBlasAddAssignKernel, enabled when
// UseBlasKernel<MT3,MT4,MT5,ST2> applies. Three cases are visible:
//  - A (MT4) is triangular: B is copied into a temporary via serial(), trmm() is
//    called on that temporary with A on the left (CblasLeft), and the result is
//    added to C with addAssign().
//  - B (MT5) is triangular: the same with A copied and B applied from the right
//    (CblasRight).
//  - otherwise: gemm() is called directly on C.
// The scalar is converted to the element type ET of C before it is passed on.
9366 template<
typename MT3
9370 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
9371 selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9373 typedef ElementType_<MT3> ET;
// Triangular left operand; the Lower/Upper flag follows IsLower<MT4>.
9375 if( IsTriangular<MT4>::value ) {
9376 ResultType_<MT3> tmp(
serial( B ) );
9377 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9378 addAssign( C, tmp );
// Triangular right operand; the Lower/Upper flag follows IsLower<MT5>.
9380 else if( IsTriangular<MT5>::value ) {
9381 ResultType_<MT3> tmp(
serial( A ) );
9382 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
9383 addAssign( C, tmp );
// General case. NOTE(review): presumably ET(scalar) is alpha and ET(1) is beta, so
// that C is accumulated into rather than overwritten -- confirm against gemm().
9386 gemm( C, A, B, ET(scalar), ET(1) );
// Subtraction assignment of a scaled matrix product (rhs) to the dense matrix lhs.
// It fetches the left and right operands of the wrapped product rhs.matrix_, tests
// whether lhs has no rows or columns or the left operand has no columns, and then
// dispatches to selectSubAssignKernel with rhs.scalar_.
// NOTE(review): the body of the empty-size check (presumably an early return) and
// the definitions of A and B are not visible in this listing -- confirm.
9408 template<
typename MT
9410 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
9417 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
9418 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
9420 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
9434 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the subtraction-assignment kernel for C -= A * B * scalar.
// The "small" kernel is used when both A and B are diagonal or when the number of
// elements of C (rows * columns) is below TDMATDMATMULT_THRESHOLD; the last call
// dispatches to the BLAS kernel selection.
// NOTE(review): the 'else' separating the two calls is not visible in this listing.
9449 template<
typename MT3
9453 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9455 if( ( IsDiagonal<MT4>::value && IsDiagonal<MT5>::value ) ||
9456 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9457 selectSmallSubAssignKernel( C, A, B, scalar );
9459 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel for the case that neither A (MT4) nor
// B (MT5) is diagonal. The scaled product A * B * scalar is evaluated serially into
// a temporary of type ResultType, which is then subtracted from C via subAssign().
9477 template<
typename MT3
9481 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, Not< IsDiagonal<MT5> > > >
9482 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9484 const ResultType tmp(
serial( A * B * scalar ) );
9485 subAssign( C, tmp );
// Default subtraction-assignment kernel for a non-diagonal A (MT4), a diagonal
// B (MT5) and a target passed as DenseMatrix<MT3,false> (presumably row-major --
// confirm). Because B is diagonal, each update only needs B(j,j):
//   C(i,j) -= A(i,j) * B(j,j) * scalar
// The traversal is tiled in blocks of BLOCK_SIZE rows and columns.
9503 template<
typename MT3
9507 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9508 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9510 const size_t M( A.rows() );
9511 const size_t N( B.columns() );
9513 const size_t block( BLOCK_SIZE );
// Tiles [ii,iend) x [jj,jend), both clamped to the matrix size.
9515 for(
size_t ii=0UL; ii<M; ii+=block ) {
9516 const size_t iend(
min( M, ii+block ) );
9517 for(
size_t jj=0UL; jj<N; jj+=block ) {
9518 const size_t jend(
min( N, jj+block ) );
9519 for(
size_t i=ii; i<iend; ++i )
// Column range within the tile, narrowed when A is upper (start at i, or i+1 if
// strictly upper) or lower (end at i+1, or i if strictly lower). The fallback
// branches of both expressions are not visible in this listing.
9521 const size_t jbegin( ( IsUpper<MT4>::value )
9522 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), jj ) )
9524 const size_t jpos( ( IsLower<MT4>::value )
9525 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), jend ) )
9528 for(
size_t j=jbegin; j<jpos; ++j ) {
9529 (~C)(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a non-diagonal A (MT4), a diagonal
// B (MT5) and a target passed as DenseMatrix<MT3,true> (presumably column-major --
// confirm). Each column j is updated with
//   C(i,j) -= A(i,j) * B(j,j) * scalar
// in a loop over i that is unrolled by two.
9551 template<
typename MT3
9555 static inline EnableIf_< And< Not< IsDiagonal<MT4> >, IsDiagonal<MT5> > >
9556 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9558 const size_t M( A.rows() );
9559 const size_t N( B.columns() );
9561 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when A is lower (start at j, or j+1 if strictly
// lower) or upper (end at j+1, or j if strictly upper). The fallback branches of
// both expressions are not visible in this listing.
9563 const size_t ibegin( ( IsLower<MT4>::value )
9564 ?( IsStrictlyLower<MT4>::value ? j+1UL : j )
9566 const size_t iend( ( IsUpper<MT4>::value )
9567 ?( IsStrictlyUpper<MT4>::value ? j : j+1UL )
// ipos is ibegin plus the row count rounded down to an even number, i.e. the end of
// the part that can be processed two rows at a time.
9571 const size_t inum( iend - ibegin );
9572 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
9574 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9575 (~C)(i ,j) -= A(i ,j) * B(j,j) * scalar;
9576 (~C)(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Leftover row at ipos. NOTE(review): its guarding condition (presumably
// ipos < iend) is not visible in this listing -- confirm.
9579 (~C)(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a diagonal A (MT4), a non-diagonal
// B (MT5) and a target passed as DenseMatrix<MT3,false> (presumably row-major --
// confirm). Because A is diagonal, each update only needs A(i,i):
//   C(i,j) -= A(i,i) * B(i,j) * scalar
// The loop over j within each row is unrolled by two.
9599 template<
typename MT3
9603 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9604 selectDefaultSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9606 const size_t M( A.rows() );
9607 const size_t N( B.columns() );
9609 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed when B is upper (start at i, or i+1 if strictly
// upper) or lower (end at i+1, or i if strictly lower). The fallback branches of
// both expressions are not visible in this listing.
9611 const size_t jbegin( ( IsUpper<MT5>::value )
9612 ?( IsStrictlyUpper<MT5>::value ? i+1UL : i )
9614 const size_t jend( ( IsLower<MT5>::value )
9615 ?( IsStrictlyLower<MT5>::value ? i : i+1UL )
// jpos is jbegin plus the column count rounded down to an even number, i.e. the end
// of the part that can be processed two columns at a time.
9619 const size_t jnum( jend - jbegin );
9620 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
9622 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9623 (~C)(i,j ) -= A(i,i) * B(i,j ) * scalar;
9624 (~C)(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Leftover column at jpos. NOTE(review): its guarding condition (presumably
// jpos < jend) is not visible in this listing -- confirm.
9627 (~C)(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction-assignment kernel for a diagonal A (MT4), a non-diagonal
// B (MT5) and a target passed as DenseMatrix<MT3,true> (presumably column-major --
// confirm). Each update is
//   C(i,j) -= A(i,i) * B(i,j) * scalar
// and the traversal is tiled in blocks of BLOCK_SIZE columns and rows.
9647 template<
typename MT3
9651 static inline EnableIf_< And< IsDiagonal<MT4>, Not< IsDiagonal<MT5> > > >
9652 selectDefaultSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
9654 const size_t M( A.rows() );
9655 const size_t N( B.columns() );
9657 const size_t block( BLOCK_SIZE );
// Tiles [jj,jend) x [ii,iend), both clamped to the matrix size.
9659 for(
size_t jj=0UL; jj<N; jj+=block ) {
9660 const size_t jend(
min( N, jj+block ) );
9661 for(
size_t ii=0UL; ii<M; ii+=block ) {
9662 const size_t iend(
min( M, ii+block ) );
9663 for(
size_t j=jj; j<jend; ++j )
// Row range within the tile, narrowed when B is lower (start at j, or j+1 if
// strictly lower) or upper (end at j+1, or j if strictly upper). The fallback
// branches of both expressions are not visible in this listing.
9665 const size_t ibegin( ( IsLower<MT5>::value )
9666 ?(
max( ( IsStrictlyLower<MT5>::value ? j+1UL : j ), ii ) )
9668 const size_t ipos( ( IsUpper<MT5>::value )
9669 ?(
min( ( IsStrictlyUpper<MT5>::value ? j : j+1UL ), iend ) )
9672 for(
size_t i=ibegin; i<ipos; ++i ) {
9673 (~C)(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction-assignment kernel for the case that both A (MT4) and B (MT5)
// are diagonal. Only the diagonal of C is touched: for every i in [0, A.rows())
//   C(i,i) -= A(i,i) * B(i,i) * scalar
9695 template<
typename MT3
9699 static inline EnableIf_< And< IsDiagonal<MT4>, IsDiagonal<MT5> > >
9700 selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9702 for(
size_t i=0UL; i<A.rows(); ++i ) {
9703 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Fallback overload of selectSmallSubAssignKernel. It is selected (via DisableIf_)
// when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does not apply and only forwards
// C, A, B and scalar unchanged to selectDefaultSubAssignKernel.
9722 template<
typename MT3
9726 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
9727 selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9729 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized overload of selectSmallSubAssignKernel for a target passed as
// DenseMatrix<MT3,false> (presumably the row-major case -- confirm), enabled when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> applies.
// It accumulates products A(i,k) * B(k,j) over k and updates C(i,j) with
// "C - accumulator * scalar". Unlike the large kernels there is no blocking: the
// columns are processed in strips of 8, 4, 2 and 1 SIMD vectors, followed by a
// scalar clean-up loop. SIMD loads run along j on B and C, while single elements of
// A are broadcast with set().
// NOTE(review): the declarations of the running indices i/j, of some accumulators
// (xmm1, value, value1, value2) and the fallback branches of several kend
// expressions are not visible in this listing; default-constructed SIMD
// accumulators presumably start at zero -- confirm.
9748 template<
typename MT3
9752 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
9753 selectSmallSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the summation index k runs over K = A.columns().
9755 const size_t M( A.rows() );
9756 const size_t N( B.columns() );
9757 const size_t K( A.columns() );
// A scalar clean-up loop is required whenever C (MT3) or B (MT5) is not padded.
9759 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
// End of the SIMD-processed column range: with 'remainder' set, N is masked with
// size_t(-SIMDSIZE); otherwise the SIMD loops cover all N columns.
9761 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
// Scalar factor broadcast to all SIMD lanes.
9764 const SIMDType factor(
set( scalar ) );
// Column strips that are eight SIMD vectors wide, one row at a time.
9768 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL ) {
9769 for(
size_t i=0UL; i<M; ++i )
// k range for this row and strip: the start is raised to i (i+1 if strictly upper)
// when A is upper and to j when B is lower; the end is lowered to i+1 (i if
// strictly lower) when A is lower and to min( j+SIMDSIZE*8, K ) when B is upper.
9771 const size_t kbegin( ( IsUpper<MT4>::value )
9772 ?( ( IsLower<MT5>::value )
9773 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9774 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9775 :( IsLower<MT5>::value ? j : 0UL ) );
9776 const size_t kend( ( IsLower<MT4>::value )
9777 ?( ( IsUpper<MT5>::value )
9778 ?(
min( ( IsStrictlyLower<MT4>::value ? i : i+1UL ), j+SIMDSIZE*8UL, K ) )
9779 :( IsStrictlyLower<MT4>::value ? i : i+1UL ) )
9780 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*8UL, K ) : K ) );
9782 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9784 for(
size_t k=kbegin; k<kend; ++k ) {
9785 const SIMDType a1(
set( A(i,k) ) );
9786 xmm1 = xmm1 + a1 * B.load(k,j );
9787 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
9788 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
9789 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
9790 xmm5 = xmm5 + a1 * B.load(k,j+SIMDSIZE*4UL);
9791 xmm6 = xmm6 + a1 * B.load(k,j+SIMDSIZE*5UL);
9792 xmm7 = xmm7 + a1 * B.load(k,j+SIMDSIZE*6UL);
9793 xmm8 = xmm8 + a1 * B.load(k,j+SIMDSIZE*7UL);
// Subtract the scaled accumulators from the current contents of C.
9796 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9797 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9798 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9799 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
9800 (~C).store( i, j+SIMDSIZE*4UL, (~C).load(i,j+SIMDSIZE*4UL) - xmm5 * factor );
9801 (~C).store( i, j+SIMDSIZE*5UL, (~C).load(i,j+SIMDSIZE*5UL) - xmm6 * factor );
9802 (~C).store( i, j+SIMDSIZE*6UL, (~C).load(i,j+SIMDSIZE*6UL) - xmm7 * factor );
9803 (~C).store( i, j+SIMDSIZE*7UL, (~C).load(i,j+SIMDSIZE*7UL) - xmm8 * factor );
// Column strips that are four SIMD vectors wide.
9807 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
// Two rows at a time: 2 x 4 SIMD accumulators.
9811 for( ; (i+2UL) <= M; i+=2UL )
9813 const size_t kbegin( ( IsUpper<MT4>::value )
9814 ?( ( IsLower<MT5>::value )
9815 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9816 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9817 :( IsLower<MT5>::value ? j : 0UL ) );
9818 const size_t kend( ( IsLower<MT4>::value )
9819 ?( ( IsUpper<MT5>::value )
9820 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*4UL, K ) )
9821 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9822 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*4UL, K ) : K ) );
9824 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9826 for(
size_t k=kbegin; k<kend; ++k ) {
9827 const SIMDType a1(
set( A(i ,k) ) );
9828 const SIMDType a2(
set( A(i+1UL,k) ) );
9829 const SIMDType b1( B.load(k,j ) );
9830 const SIMDType b2( B.load(k,j+SIMDSIZE ) );
9831 const SIMDType b3( B.load(k,j+SIMDSIZE*2UL) );
9832 const SIMDType b4( B.load(k,j+SIMDSIZE*3UL) );
9833 xmm1 = xmm1 + a1 * b1;
9834 xmm2 = xmm2 + a1 * b2;
9835 xmm3 = xmm3 + a1 * b3;
9836 xmm4 = xmm4 + a1 * b4;
9837 xmm5 = xmm5 + a2 * b1;
9838 xmm6 = xmm6 + a2 * b2;
9839 xmm7 = xmm7 + a2 * b3;
9840 xmm8 = xmm8 + a2 * b4;
9843 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9844 (~C).store( i , j+SIMDSIZE , (~C).load(i ,j+SIMDSIZE ) - xmm2 * factor );
9845 (~C).store( i , j+SIMDSIZE*2UL, (~C).load(i ,j+SIMDSIZE*2UL) - xmm3 * factor );
9846 (~C).store( i , j+SIMDSIZE*3UL, (~C).load(i ,j+SIMDSIZE*3UL) - xmm4 * factor );
9847 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
9848 (~C).store( i+1UL, j+SIMDSIZE , (~C).load(i+1UL,j+SIMDSIZE ) - xmm6 * factor );
9849 (~C).store( i+1UL, j+SIMDSIZE*2UL, (~C).load(i+1UL,j+SIMDSIZE*2UL) - xmm7 * factor );
9850 (~C).store( i+1UL, j+SIMDSIZE*3UL, (~C).load(i+1UL,j+SIMDSIZE*3UL) - xmm8 * factor );
// Single leftover row of the four-vector strip (guarding condition not visible
// here). Its kend is limited only by the IsUpper<MT5> bound, not by any lower
// structure of A.
9855 const size_t kbegin( ( IsUpper<MT4>::value )
9856 ?( ( IsLower<MT5>::value )
9857 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9858 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9859 :( IsLower<MT5>::value ? j : 0UL ) );
9860 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, K ) ):( K ) );
9862 SIMDType xmm1, xmm2, xmm3, xmm4;
9864 for(
size_t k=kbegin; k<kend; ++k ) {
9865 const SIMDType a1(
set( A(i,k) ) );
9866 xmm1 = xmm1 + a1 * B.load(k,j );
9867 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE );
9868 xmm3 = xmm3 + a1 * B.load(k,j+SIMDSIZE*2UL);
9869 xmm4 = xmm4 + a1 * B.load(k,j+SIMDSIZE*3UL);
9872 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9873 (~C).store( i, j+SIMDSIZE , (~C).load(i,j+SIMDSIZE ) - xmm2 * factor );
9874 (~C).store( i, j+SIMDSIZE*2UL, (~C).load(i,j+SIMDSIZE*2UL) - xmm3 * factor );
9875 (~C).store( i, j+SIMDSIZE*3UL, (~C).load(i,j+SIMDSIZE*3UL) - xmm4 * factor );
// Column strips that are two SIMD vectors wide.
9879 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
// Two rows at a time: 2 x 2 SIMD accumulators.
9883 for( ; (i+2UL) <= M; i+=2UL )
9885 const size_t kbegin( ( IsUpper<MT4>::value )
9886 ?( ( IsLower<MT5>::value )
9887 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9888 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9889 :( IsLower<MT5>::value ? j : 0UL ) );
9890 const size_t kend( ( IsLower<MT4>::value )
9891 ?( ( IsUpper<MT5>::value )
9892 ?(
min( ( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ), j+SIMDSIZE*2UL, K ) )
9893 :( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL ) )
9894 :( IsUpper<MT5>::value ?
min( j+SIMDSIZE*2UL, K ) : K ) );
9896 SIMDType xmm1, xmm2, xmm3, xmm4;
9898 for(
size_t k=kbegin; k<kend; ++k ) {
9899 const SIMDType a1(
set( A(i ,k) ) );
9900 const SIMDType a2(
set( A(i+1UL,k) ) );
9901 const SIMDType b1( B.load(k,j ) );
9902 const SIMDType b2( B.load(k,j+SIMDSIZE) );
9903 xmm1 = xmm1 + a1 * b1;
9904 xmm2 = xmm2 + a1 * b2;
9905 xmm3 = xmm3 + a2 * b1;
9906 xmm4 = xmm4 + a2 * b2;
9909 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
9910 (~C).store( i , j+SIMDSIZE, (~C).load(i ,j+SIMDSIZE) - xmm2 * factor );
9911 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
9912 (~C).store( i+1UL, j+SIMDSIZE, (~C).load(i+1UL,j+SIMDSIZE) - xmm4 * factor );
// Single leftover row of the two-vector strip (guarding condition not visible here).
9917 const size_t kbegin( ( IsUpper<MT4>::value )
9918 ?( ( IsLower<MT5>::value )
9919 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9920 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9921 :( IsLower<MT5>::value ? j : 0UL ) );
9922 const size_t kend( ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, K ) ):( K ) );
9924 SIMDType xmm1, xmm2;
9926 for(
size_t k=kbegin; k<kend; ++k ) {
9927 const SIMDType a1(
set( A(i,k) ) );
9928 xmm1 = xmm1 + a1 * B.load(k,j );
9929 xmm2 = xmm2 + a1 * B.load(k,j+SIMDSIZE);
9932 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
9933 (~C).store( i, j+SIMDSIZE, (~C).load(i,j+SIMDSIZE) - xmm2 * factor );
// Column strips that are a single SIMD vector wide.
9937 for( ; j<jpos; j+=SIMDSIZE )
// Two rows at a time, sharing one load of B per k.
9941 for( ; (i+2UL) <= M; i+=2UL )
9943 const size_t kbegin( ( IsUpper<MT4>::value )
9944 ?( ( IsLower<MT5>::value )
9945 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9946 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9947 :( IsLower<MT5>::value ? j : 0UL ) );
9948 const size_t kend( ( IsLower<MT4>::value )
9949 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
9952 SIMDType xmm1, xmm2;
9954 for(
size_t k=kbegin; k<kend; ++k ) {
9955 const SIMDType b1( B.load(k,j) );
9956 xmm1 = xmm1 +
set( A(i ,k) ) * b1;
9957 xmm2 = xmm2 +
set( A(i+1UL,k) ) * b1;
9960 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
9961 (~C).store( i+1UL, j, (~C).load(i+1UL,j) - xmm2 * factor );
// Single leftover row of the one-vector strip; k runs up to K here.
9966 const size_t kbegin( ( IsUpper<MT4>::value )
9967 ?( ( IsLower<MT5>::value )
9968 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9969 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9970 :( IsLower<MT5>::value ? j : 0UL ) );
9974 for(
size_t k=kbegin; k<K; ++k ) {
9975 xmm1 = xmm1 +
set( A(i,k) ) * B.load(k,j);
9978 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// Scalar clean-up for the columns in [jpos,N); only runs when 'remainder' is set.
9982 for( ; remainder && j<N; ++j )
// Two rows at a time with scalar accumulators value1/value2.
9986 for( ; (i+2UL) <= M; i+=2UL )
9988 const size_t kbegin( ( IsUpper<MT4>::value )
9989 ?( ( IsLower<MT5>::value )
9990 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
9991 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
9992 :( IsLower<MT5>::value ? j : 0UL ) );
9993 const size_t kend( ( IsLower<MT4>::value )
9994 ?( IsStrictlyLower<MT4>::value ? i+1UL : i+2UL )
10000 for(
size_t k=kbegin; k<kend; ++k ) {
10001 value1 += A(i ,k) * B(k,j);
10002 value2 += A(i+1UL,k) * B(k,j);
10005 (~C)(i ,j) -= value1 * scalar;
10006 (~C)(i+1UL,j) -= value2 * scalar;
// Single leftover row of the scalar clean-up; k runs up to K here.
10011 const size_t kbegin( ( IsUpper<MT4>::value )
10012 ?( ( IsLower<MT5>::value )
10013 ?(
max( ( IsStrictlyUpper<MT4>::value ? i+1UL : i ), j ) )
10014 :( IsStrictlyUpper<MT4>::value ? i+1UL : i ) )
10015 :( IsLower<MT5>::value ? j : 0UL ) );
10019 for(
size_t k=kbegin; k<K; ++k ) {
10020 value += A(i,k) * B(k,j);
10023 (~C)(i,j) -= value * scalar;
// Vectorized kernel for the subtraction assignment of a scaled matrix product to a dense
// matrix whose storage-order flag is 'true': every visible update has the form
// C(i,j) = C(i,j) - ( sum_k A(i,k) * B(k,j) ) * scalar.
//
// Parameters: C is the target matrix, A and B are the left/right operands of the product,
// and scalar is the scaling factor applied to each accumulated product.
// The overload is only enabled when UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> holds.
//
// SIMD vectors run along the row index i (A.load(i,k), C.load/store(i,j)); elements of B are
// broadcast with set(). Row panels of 8, 4, 2 and 1 SIMD vectors are processed in that order,
// followed by an element-wise loop for rows that do not fill a whole vector.
//
// NOTE(review): this listing omits several original lines (braces, the declarations of the
// running indices i/j and of some accumulators, guarding 'if' statements). The comments below
// describe only what is visible.
10044 template<
typename MT3
10048 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10049 selectSmallSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Problem dimensions: C is M x N, the summation index k runs over K.
10051 const size_t M( A.rows() );
10052 const size_t N( B.columns() );
10053 const size_t K( A.columns() );
// An element-wise remainder pass is needed unless both C and A are padded.
10055 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
// ipos: last row index reachable with whole SIMD vectors (M rounded down to a multiple of
// SIMDSIZE via the bit mask; the assertion below checks that this equals M - M % SIMDSIZE).
10057 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
10058 BLAZE_INTERNAL_ASSERT( !remainder || ( M - ( M % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// The scalar broadcast into a SIMD vector, multiplied onto every accumulator before the store.
10060 const SIMDType factor(
set( scalar ) );
// --- Panels of 8 SIMD vectors (rows i .. i+SIMDSIZE*8-1), one column j at a time ---
10064 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL ) {
10065 for(
size_t j=0UL; j<N; ++j )
// Start of the k range, narrowed according to the IsLower/IsStrictlyLower traits of B
// and the IsUpper trait of A.
10067 const size_t kbegin( ( IsLower<MT5>::value )
10068 ?( ( IsUpper<MT4>::value )
10069 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10070 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10071 :( IsUpper<MT4>::value ? i : 0UL ) );
// End of the k range, narrowed according to the IsUpper/IsStrictlyUpper traits of B
// and the IsLower trait of A (bounded by the last row of the panel and by K).
10072 const size_t kend( ( IsUpper<MT5>::value )
10073 ?( ( IsLower<MT4>::value )
10074 ?(
min( i+SIMDSIZE*8UL, K, ( IsStrictlyUpper<MT5>::value ? j : j+1UL ) ) )
10075 :( IsStrictlyUpper<MT5>::value ? j : j+1UL ) )
10076 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*8UL, K ) : K ) );
// Accumulators; NOTE(review): relies on SIMDType's default construction yielding zero - confirm.
10078 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10080 for(
size_t k=kbegin; k<kend; ++k ) {
10081 const SIMDType b1(
set( B(k,j) ) );
10082 xmm1 = xmm1 + A.load(i ,k) * b1;
10083 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
10084 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
10085 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
10086 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,k) * b1;
10087 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,k) * b1;
10088 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,k) * b1;
10089 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,k) * b1;
// Subtract the scaled partial products from the current content of C.
10092 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10093 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
10094 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
10095 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
10096 (~C).store( i+SIMDSIZE*4UL, j, (~C).load(i+SIMDSIZE*4UL,j) - xmm5 * factor );
10097 (~C).store( i+SIMDSIZE*5UL, j, (~C).load(i+SIMDSIZE*5UL,j) - xmm6 * factor );
10098 (~C).store( i+SIMDSIZE*6UL, j, (~C).load(i+SIMDSIZE*6UL,j) - xmm7 * factor );
10099 (~C).store( i+SIMDSIZE*7UL, j, (~C).load(i+SIMDSIZE*7UL,j) - xmm8 * factor );
// --- Panels of 4 SIMD vectors; columns are handled two at a time, then a single leftover ---
10103 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10107 for( ; (j+2UL) <= N; j+=2UL )
10109 const size_t kbegin( ( IsLower<MT5>::value )
10110 ?( ( IsUpper<MT4>::value )
10111 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10112 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10113 :( IsUpper<MT4>::value ? i : 0UL ) );
// The upper bound covers both columns j and j+1 of the pair.
10114 const size_t kend( ( IsUpper<MT5>::value )
10115 ?( ( IsLower<MT4>::value )
10116 ?(
min( i+SIMDSIZE*4UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10117 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10118 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*4UL, K ) : K ) );
// xmm1-4 accumulate column j, xmm5-8 accumulate column j+1.
10120 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10122 for(
size_t k=kbegin; k<kend; ++k ) {
10123 const SIMDType a1( A.load(i ,k) );
10124 const SIMDType a2( A.load(i+SIMDSIZE ,k) );
10125 const SIMDType a3( A.load(i+SIMDSIZE*2UL,k) );
10126 const SIMDType a4( A.load(i+SIMDSIZE*3UL,k) );
10127 const SIMDType b1(
set( B(k,j ) ) );
10128 const SIMDType b2(
set( B(k,j+1UL) ) );
10129 xmm1 = xmm1 + a1 * b1;
10130 xmm2 = xmm2 + a2 * b1;
10131 xmm3 = xmm3 + a3 * b1;
10132 xmm4 = xmm4 + a4 * b1;
10133 xmm5 = xmm5 + a1 * b2;
10134 xmm6 = xmm6 + a2 * b2;
10135 xmm7 = xmm7 + a3 * b2;
10136 xmm8 = xmm8 + a4 * b2;
10139 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10140 (~C).store( i+SIMDSIZE , j , (~C).load(i+SIMDSIZE ,j ) - xmm2 * factor );
10141 (~C).store( i+SIMDSIZE*2UL, j , (~C).load(i+SIMDSIZE*2UL,j ) - xmm3 * factor );
10142 (~C).store( i+SIMDSIZE*3UL, j , (~C).load(i+SIMDSIZE*3UL,j ) - xmm4 * factor );
10143 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10144 (~C).store( i+SIMDSIZE , j+1UL, (~C).load(i+SIMDSIZE ,j+1UL) - xmm6 * factor );
10145 (~C).store( i+SIMDSIZE*2UL, j+1UL, (~C).load(i+SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
10146 (~C).store( i+SIMDSIZE*3UL, j+1UL, (~C).load(i+SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// Single leftover column of the 4-vector panel (the guarding condition is not visible here).
10151 const size_t kbegin( ( IsLower<MT5>::value )
10152 ?( ( IsUpper<MT4>::value )
10153 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10154 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10155 :( IsUpper<MT4>::value ? i : 0UL ) );
10156 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, K ) ):( K ) );
10158 SIMDType xmm1, xmm2, xmm3, xmm4;
10160 for(
size_t k=kbegin; k<kend; ++k ) {
10161 const SIMDType b1(
set( B(k,j) ) );
10162 xmm1 = xmm1 + A.load(i ,k) * b1;
10163 xmm2 = xmm2 + A.load(i+SIMDSIZE ,k) * b1;
10164 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,k) * b1;
10165 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,k) * b1;
10168 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10169 (~C).store( i+SIMDSIZE , j, (~C).load(i+SIMDSIZE ,j) - xmm2 * factor );
10170 (~C).store( i+SIMDSIZE*2UL, j, (~C).load(i+SIMDSIZE*2UL,j) - xmm3 * factor );
10171 (~C).store( i+SIMDSIZE*3UL, j, (~C).load(i+SIMDSIZE*3UL,j) - xmm4 * factor );
// --- Panels of 2 SIMD vectors; columns in pairs, then a single leftover column ---
10175 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10179 for( ; (j+2UL) <= N; j+=2UL )
10181 const size_t kbegin( ( IsLower<MT5>::value )
10182 ?( ( IsUpper<MT4>::value )
10183 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10184 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10185 :( IsUpper<MT4>::value ? i : 0UL ) );
10186 const size_t kend( ( IsUpper<MT5>::value )
10187 ?( ( IsLower<MT4>::value )
10188 ?(
min( i+SIMDSIZE*2UL, K, ( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) ) )
10189 :( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL ) )
10190 :( IsLower<MT4>::value ?
min( i+SIMDSIZE*2UL, K ) : K ) );
// xmm1/xmm2 accumulate column j, xmm3/xmm4 accumulate column j+1.
10192 SIMDType xmm1, xmm2, xmm3, xmm4;
10194 for(
size_t k=kbegin; k<kend; ++k ) {
10195 const SIMDType a1( A.load(i ,k) );
10196 const SIMDType a2( A.load(i+SIMDSIZE,k) );
10197 const SIMDType b1(
set( B(k,j ) ) );
10198 const SIMDType b2(
set( B(k,j+1UL) ) );
10199 xmm1 = xmm1 + a1 * b1;
10200 xmm2 = xmm2 + a2 * b1;
10201 xmm3 = xmm3 + a1 * b2;
10202 xmm4 = xmm4 + a2 * b2;
10205 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10206 (~C).store( i+SIMDSIZE, j , (~C).load(i+SIMDSIZE,j ) - xmm2 * factor );
10207 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10208 (~C).store( i+SIMDSIZE, j+1UL, (~C).load(i+SIMDSIZE,j+1UL) - xmm4 * factor );
// Single leftover column of the 2-vector panel.
10213 const size_t kbegin( ( IsLower<MT5>::value )
10214 ?( ( IsUpper<MT4>::value )
10215 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10216 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10217 :( IsUpper<MT4>::value ? i : 0UL ) );
10218 const size_t kend( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, K ) ):( K ) );
10220 SIMDType xmm1, xmm2;
10222 for(
size_t k=kbegin; k<kend; ++k ) {
10223 const SIMDType b1(
set( B(k,j) ) );
10224 xmm1 = xmm1 + A.load(i ,k) * b1;
10225 xmm2 = xmm2 + A.load(i+SIMDSIZE,k) * b1;
10228 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10229 (~C).store( i+SIMDSIZE, j, (~C).load(i+SIMDSIZE,j) - xmm2 * factor );
// --- Panels of a single SIMD vector; columns in pairs, then a single leftover column ---
10233 for( ; i<ipos; i+=SIMDSIZE )
10237 for( ; (j+2UL) <= N; j+=2UL )
10239 const size_t kbegin( ( IsLower<MT5>::value )
10240 ?( ( IsUpper<MT4>::value )
10241 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10242 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10243 :( IsUpper<MT4>::value ? i : 0UL ) );
// NOTE(review): the alternative branch of this conditional is not part of the listing.
10244 const size_t kend( ( IsUpper<MT5>::value )
10245 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
10248 SIMDType xmm1, xmm2;
10250 for(
size_t k=kbegin; k<kend; ++k ) {
10251 const SIMDType a1( A.load(i,k) );
10252 xmm1 = xmm1 + a1 *
set( B(k,j ) );
10253 xmm2 = xmm2 + a1 *
set( B(k,j+1UL) );
10256 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10257 (~C).store( i, j+1UL, (~C).load(i,j+1UL) - xmm2 * factor );
// Single leftover column of the 1-vector panel; k runs up to K here.
10262 const size_t kbegin( ( IsLower<MT5>::value )
10263 ?( ( IsUpper<MT4>::value )
10264 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10265 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10266 :( IsUpper<MT4>::value ? i : 0UL ) );
10270 for(
size_t k=kbegin; k<K; ++k ) {
10271 xmm1 = xmm1 + A.load(i,k) *
set( B(k,j) );
10274 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// --- Element-wise remainder: rows ipos..M-1, only executed when 'remainder' is set ---
10278 for( ; remainder && i<M; ++i )
10282 for( ; (j+2UL) <= N; j+=2UL )
10284 const size_t kbegin( ( IsLower<MT5>::value )
10285 ?( ( IsUpper<MT4>::value )
10286 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10287 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10288 :( IsUpper<MT4>::value ? i : 0UL ) );
10289 const size_t kend( ( IsUpper<MT5>::value )
10290 ?( IsStrictlyUpper<MT5>::value ? j+1UL : j+2UL )
// value1/value2 are scalar accumulators for columns j and j+1 (declarations not visible).
10296 for(
size_t k=kbegin; k<kend; ++k ) {
10297 value1 += A(i,k) * B(k,j );
10298 value2 += A(i,k) * B(k,j+1UL);
10301 (~C)(i,j ) -= value1 * scalar;
10302 (~C)(i,j+1UL) -= value2 * scalar;
// Single leftover column of the element-wise remainder.
10307 const size_t kbegin( ( IsLower<MT5>::value )
10308 ?( ( IsUpper<MT4>::value )
10309 ?(
max( i, ( IsStrictlyLower<MT5>::value ? j+1UL : j ) ) )
10310 :( IsStrictlyLower<MT5>::value ? j+1UL : j ) )
10311 :( IsUpper<MT4>::value ? i : 0UL ) );
10315 for(
size_t k=kbegin; k<K; ++k ) {
10316 value += A(i,k) * B(k,j);
10319 (~C)(i,j) -= value * scalar;
// Fallback overload of the large-matrix subtraction assignment kernel: selected when
// UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf_). It does no work of
// its own and simply forwards all arguments to selectDefaultSubAssignKernel().
10339 template<
typename MT3
10343 static inline DisableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10344 selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10346 selectDefaultSubAssignKernel( C, A, B, scalar );
// Blocked, vectorized kernel for the subtraction assignment of a scaled matrix product to a
// dense matrix whose storage-order flag is 'false': every visible update has the form
// C(i,j) = C(i,j) - ( sum_k A(i,k) * B(k,j) ) * scalar, where the k sum is split into blocks.
//
// Parameters: C is the target matrix, A and B are the left/right operands of the product,
// and scalar is the scaling factor. Enabled only if UseVectorizedDefaultKernel<...> holds.
//
// The iteration space is tiled by DMATDMATMULT_DEFAULT_{J,I,K}BLOCK_SIZE (loops jj, ii, kk).
// SIMD vectors run along the column index j (B.load(k,j), C.load/store(i,j)); elements of A
// are broadcast with set(). Because the k range is tiled, each (jj,ii,kk) tile subtracts its
// partial sum from C.
//
// NOTE(review): this listing omits several original lines (braces, declarations of i/j and of
// some accumulators). The comments describe only what is visible.
10365 template<
typename MT3
10369 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10370 selectLargeSubAssignKernel( DenseMatrix<MT3,false>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10372 const size_t M( A.rows() );
10373 const size_t N( B.columns() );
10374 const size_t K( A.columns() );
// An element-wise remainder pass is needed unless both C and B are padded.
10376 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT5>::value );
10378 const SIMDType factor(
set( scalar ) );
// Outer tiling over columns.
10380 for(
size_t jj=0UL; jj<N; jj+=DMATDMATMULT_DEFAULT_JBLOCK_SIZE )
10382 const size_t jend(
min( jj+DMATDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// jpos: end of the part of this column block reachable with whole SIMD vectors
// (jend rounded down to a multiple of SIMDSIZE, as checked by the assertion).
10384 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
10385 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
// Tiling over rows.
10387 for(
size_t ii=0UL; ii<M; ii+=DMATDMATMULT_DEFAULT_IBLOCK_SIZE )
10389 const size_t iend(
min( ii+DMATDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// Tiling over the summation index; ktmp is the end of the current k block.
10391 for(
size_t kk=0UL; kk<K; kk+=DMATDMATMULT_DEFAULT_KBLOCK_SIZE )
10393 const size_t ktmp(
min( kk+DMATDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// --- Column panels of 4 SIMD vectors; rows are handled two at a time, then a single row ---
10397 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
10399 const size_t j1( j+SIMDSIZE );
10400 const size_t j2( j+SIMDSIZE*2UL );
10401 const size_t j3( j+SIMDSIZE*3UL );
10405 for( ; (i+2UL) <= iend; i+=2UL )
// k range of this tile, additionally narrowed by the IsUpper/IsLower traits of A and B.
10407 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10408 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10409 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10410 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
// xmm1-4 accumulate row i, xmm5-8 accumulate row i+1.
// NOTE(review): relies on SIMDType's default construction yielding zero - confirm.
10412 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10414 for(
size_t k=kbegin; k<kend; ++k ) {
10415 const SIMDType a1(
set( A(i ,k) ) );
10416 const SIMDType a2(
set( A(i+1UL,k) ) );
10417 const SIMDType b1( B.load(k,j ) );
10418 const SIMDType b2( B.load(k,j1) );
10419 const SIMDType b3( B.load(k,j2) );
10420 const SIMDType b4( B.load(k,j3) );
10421 xmm1 = xmm1 + a1 * b1;
10422 xmm2 = xmm2 + a1 * b2;
10423 xmm3 = xmm3 + a1 * b3;
10424 xmm4 = xmm4 + a1 * b4;
10425 xmm5 = xmm5 + a2 * b1;
10426 xmm6 = xmm6 + a2 * b2;
10427 xmm7 = xmm7 + a2 * b3;
10428 xmm8 = xmm8 + a2 * b4;
10431 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10432 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10433 (~C).store( i , j2, (~C).load(i ,j2) - xmm3 * factor );
10434 (~C).store( i , j3, (~C).load(i ,j3) - xmm4 * factor );
10435 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm5 * factor );
10436 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm6 * factor );
10437 (~C).store( i+1UL, j2, (~C).load(i+1UL,j2) - xmm7 * factor );
10438 (~C).store( i+1UL, j3, (~C).load(i+1UL,j3) - xmm8 * factor );
// Single leftover row of the 4-vector panel (the guarding condition is not visible here).
10443 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10444 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10445 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10446 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*4UL, ktmp ) ):( ktmp ) ) );
10448 SIMDType xmm1, xmm2, xmm3, xmm4;
10450 for(
size_t k=kbegin; k<kend; ++k ) {
10451 const SIMDType a1(
set( A(i,k) ) );
10452 xmm1 = xmm1 + a1 * B.load(k,j );
10453 xmm2 = xmm2 + a1 * B.load(k,j1);
10454 xmm3 = xmm3 + a1 * B.load(k,j2);
10455 xmm4 = xmm4 + a1 * B.load(k,j3);
10458 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10459 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
10460 (~C).store( i, j2, (~C).load(i,j2) - xmm3 * factor );
10461 (~C).store( i, j3, (~C).load(i,j3) - xmm4 * factor );
// --- Column panels of 2 SIMD vectors; rows in groups of 4, then 2, then a single row ---
10465 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
10467 const size_t j1( j+SIMDSIZE );
10471 for( ; (i+4UL) <= iend; i+=4UL )
10473 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10474 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10475 const size_t kend (
min( ( IsLower<MT4>::value )?( i+4UL ):( ktmp ),
10476 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
// Two accumulators per row: (xmm1,xmm2) for row i up to (xmm7,xmm8) for row i+3.
10478 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10480 for(
size_t k=kbegin; k<kend; ++k ) {
10481 const SIMDType a1(
set( A(i ,k) ) );
10482 const SIMDType a2(
set( A(i+1UL,k) ) );
10483 const SIMDType a3(
set( A(i+2UL,k) ) );
10484 const SIMDType a4(
set( A(i+3UL,k) ) );
10485 const SIMDType b1( B.load(k,j ) );
10486 const SIMDType b2( B.load(k,j1) );
10487 xmm1 = xmm1 + a1 * b1;
10488 xmm2 = xmm2 + a1 * b2;
10489 xmm3 = xmm3 + a2 * b1;
10490 xmm4 = xmm4 + a2 * b2;
10491 xmm5 = xmm5 + a3 * b1;
10492 xmm6 = xmm6 + a3 * b2;
10493 xmm7 = xmm7 + a4 * b1;
10494 xmm8 = xmm8 + a4 * b2;
10497 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10498 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10499 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10500 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
10501 (~C).store( i+2UL, j , (~C).load(i+2UL,j ) - xmm5 * factor );
10502 (~C).store( i+2UL, j1, (~C).load(i+2UL,j1) - xmm6 * factor );
10503 (~C).store( i+3UL, j , (~C).load(i+3UL,j ) - xmm7 * factor );
10504 (~C).store( i+3UL, j1, (~C).load(i+3UL,j1) - xmm8 * factor );
// Remaining pairs of rows of the 2-vector panel.
10507 for( ; (i+2UL) <= iend; i+=2UL )
10509 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10510 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10511 const size_t kend (
min( ( IsLower<MT4>::value )?( i+2UL ):( ktmp ),
10512 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
10514 SIMDType xmm1, xmm2, xmm3, xmm4;
10516 for(
size_t k=kbegin; k<kend; ++k ) {
10517 const SIMDType a1(
set( A(i ,k) ) );
10518 const SIMDType a2(
set( A(i+1UL,k) ) );
10519 const SIMDType b1( B.load(k,j ) );
10520 const SIMDType b2( B.load(k,j1) );
10521 xmm1 = xmm1 + a1 * b1;
10522 xmm2 = xmm2 + a1 * b2;
10523 xmm3 = xmm3 + a2 * b1;
10524 xmm4 = xmm4 + a2 * b2;
10527 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10528 (~C).store( i , j1, (~C).load(i ,j1) - xmm2 * factor );
10529 (~C).store( i+1UL, j , (~C).load(i+1UL,j ) - xmm3 * factor );
10530 (~C).store( i+1UL, j1, (~C).load(i+1UL,j1) - xmm4 * factor );
// Single leftover row of the 2-vector panel.
10535 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10536 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10537 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10538 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE*2UL, ktmp ) ):( ktmp ) ) );
10540 SIMDType xmm1, xmm2;
10542 for(
size_t k=kbegin; k<kend; ++k ) {
10543 const SIMDType a1(
set( A(i,k) ) );
10544 xmm1 = xmm1 + a1 * B.load(k,j );
10545 xmm2 = xmm2 + a1 * B.load(k,j1);
10548 (~C).store( i, j , (~C).load(i,j ) - xmm1 * factor );
10549 (~C).store( i, j1, (~C).load(i,j1) - xmm2 * factor );
// --- Column panels of a single SIMD vector, processed row by row ---
10553 for( ; j<jpos; j+=SIMDSIZE )
10555 for(
size_t i=ii; i<iend; ++i )
10557 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10558 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10559 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10560 ( IsUpper<MT5>::value )?(
min( j+SIMDSIZE, ktmp ) ):( ktmp ) ) );
10564 for(
size_t k=kbegin; k<kend; ++k ) {
10565 const SIMDType a1(
set( A(i,k) ) );
10566 xmm1 = xmm1 + a1 * B.load(k,j);
10569 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// --- Element-wise remainder: columns jpos..jend-1, only executed when 'remainder' is set ---
10573 for( ; remainder && j<jend; ++j )
10575 for(
size_t i=ii; i<iend; ++i )
10577 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10578 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10579 const size_t kend (
min( ( IsLower<MT4>::value )?( i+1UL ):( ktmp ),
10580 ( IsUpper<MT5>::value )?(
min( j+1UL, ktmp ) ):( ktmp ) ) );
// 'value' is a scalar accumulator (its declaration is not visible in this listing).
10584 for(
size_t k=kbegin; k<kend; ++k ) {
10585 value += A(i,k) * B(k,j);
10588 (~C)(i,j) -= value * scalar;
// Blocked, vectorized kernel for the subtraction assignment of a scaled matrix product to a
// dense matrix whose storage-order flag is 'true': every visible update has the form
// C(i,j) = C(i,j) - ( sum_k A(i,k) * B(k,j) ) * scalar, where the k sum is split into blocks.
//
// Parameters: C is the target matrix, A and B are the left/right operands of the product,
// and scalar is the scaling factor. Enabled only if UseVectorizedDefaultKernel<...> holds.
//
// The iteration space is tiled by TDMATTDMATMULT_DEFAULT_{I,J,K}BLOCK_SIZE (loops ii, jj, kk).
// SIMD vectors run along the row index i (A.load(i,k), C.load/store(i,j)); elements of B are
// broadcast with set(). Each (ii,jj,kk) tile subtracts its partial sum from C.
//
// NOTE(review): this listing omits several original lines (braces, declarations of i/j and of
// some accumulators). The comments describe only what is visible.
10612 template<
typename MT3
10616 static inline EnableIf_< UseVectorizedDefaultKernel<MT3,MT4,MT5,ST2> >
10617 selectLargeSubAssignKernel( DenseMatrix<MT3,true>& C,
const MT4& A,
const MT5& B, ST2 scalar )
10619 const size_t M( A.rows() );
10620 const size_t N( B.columns() );
10621 const size_t K( A.columns() );
// An element-wise remainder pass is needed unless both C and A are padded.
10623 const bool remainder( !IsPadded<MT3>::value || !IsPadded<MT4>::value );
10625 const SIMDType factor(
set( scalar ) );
// Outer tiling over rows.
10627 for(
size_t ii=0UL; ii<M; ii+=TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE )
10629 const size_t iend(
min( ii+TDMATTDMATMULT_DEFAULT_IBLOCK_SIZE, M ) );
// ipos: end of the part of this row block reachable with whole SIMD vectors
// (iend rounded down to a multiple of SIMDSIZE, as checked by the assertion).
10631 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
10632 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Tiling over columns.
10634 for(
size_t jj=0UL; jj<N; jj+=TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE )
10636 const size_t jend(
min( jj+TDMATTDMATMULT_DEFAULT_JBLOCK_SIZE, N ) );
// Tiling over the summation index; ktmp is the end of the current k block.
10638 for(
size_t kk=0UL; kk<K; kk+=TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE )
10640 const size_t ktmp(
min( kk+TDMATTDMATMULT_DEFAULT_KBLOCK_SIZE, K ) );
// --- Row panels of 4 SIMD vectors; columns are handled two at a time, then a single column ---
10644 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
10646 const size_t i1( i+SIMDSIZE );
10647 const size_t i2( i+SIMDSIZE*2UL );
10648 const size_t i3( i+SIMDSIZE*3UL );
10652 for( ; (j+2UL) <= jend; j+=2UL )
// k range of this tile, additionally narrowed by the IsUpper/IsLower traits of A and B.
10654 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10655 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10656 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
10657 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
// xmm1-4 accumulate column j, xmm5-8 accumulate column j+1.
// NOTE(review): relies on SIMDType's default construction yielding zero - confirm.
10659 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10661 for(
size_t k=kbegin; k<kend; ++k ) {
10662 const SIMDType a1( A.load(i ,k) );
10663 const SIMDType a2( A.load(i1,k) );
10664 const SIMDType a3( A.load(i2,k) );
10665 const SIMDType a4( A.load(i3,k) );
10666 const SIMDType b1(
set( B(k,j ) ) );
10667 const SIMDType b2(
set( B(k,j+1UL) ) );
10668 xmm1 = xmm1 + a1 * b1;
10669 xmm2 = xmm2 + a2 * b1;
10670 xmm3 = xmm3 + a3 * b1;
10671 xmm4 = xmm4 + a4 * b1;
10672 xmm5 = xmm5 + a1 * b2;
10673 xmm6 = xmm6 + a2 * b2;
10674 xmm7 = xmm7 + a3 * b2;
10675 xmm8 = xmm8 + a4 * b2;
10678 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10679 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10680 (~C).store( i2, j , (~C).load(i2,j ) - xmm3 * factor );
10681 (~C).store( i3, j , (~C).load(i3,j ) - xmm4 * factor );
10682 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm5 * factor );
10683 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm6 * factor );
10684 (~C).store( i2, j+1UL, (~C).load(i2,j+1UL) - xmm7 * factor );
10685 (~C).store( i3, j+1UL, (~C).load(i3,j+1UL) - xmm8 * factor );
// Single leftover column of the 4-vector panel (the guarding condition is not visible here).
10690 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10691 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10692 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*4UL, ktmp ) ):( ktmp ),
10693 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10695 SIMDType xmm1, xmm2, xmm3, xmm4;
10697 for(
size_t k=kbegin; k<kend; ++k ) {
10698 const SIMDType b1(
set( B(k,j) ) );
10699 xmm1 = xmm1 + A.load(i ,k) * b1;
10700 xmm2 = xmm2 + A.load(i1,k) * b1;
10701 xmm3 = xmm3 + A.load(i2,k) * b1;
10702 xmm4 = xmm4 + A.load(i3,k) * b1;
10705 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10706 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
10707 (~C).store( i2, j, (~C).load(i2,j) - xmm3 * factor );
10708 (~C).store( i3, j, (~C).load(i3,j) - xmm4 * factor );
// --- Row panels of 2 SIMD vectors; columns in groups of 4, then 2, then a single column ---
10712 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
10714 const size_t i1( i+SIMDSIZE );
10718 for( ; (j+4UL) <= jend; j+=4UL )
10720 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10721 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10722 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
10723 ( IsUpper<MT5>::value )?( j+4UL ):( ktmp ) ) );
// Two accumulators per column: (xmm1,xmm2) for column j up to (xmm7,xmm8) for column j+3.
10725 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10727 for(
size_t k=kbegin; k<kend; ++k ) {
10728 const SIMDType a1( A.load(i ,k) );
10729 const SIMDType a2( A.load(i1,k) );
10730 const SIMDType b1(
set( B(k,j ) ) );
10731 const SIMDType b2(
set( B(k,j+1UL) ) );
10732 const SIMDType b3(
set( B(k,j+2UL) ) );
10733 const SIMDType b4(
set( B(k,j+3UL) ) );
10734 xmm1 = xmm1 + a1 * b1;
10735 xmm2 = xmm2 + a2 * b1;
10736 xmm3 = xmm3 + a1 * b2;
10737 xmm4 = xmm4 + a2 * b2;
10738 xmm5 = xmm5 + a1 * b3;
10739 xmm6 = xmm6 + a2 * b3;
10740 xmm7 = xmm7 + a1 * b4;
10741 xmm8 = xmm8 + a2 * b4;
10744 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10745 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10746 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10747 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
10748 (~C).store( i , j+2UL, (~C).load(i ,j+2UL) - xmm5 * factor );
10749 (~C).store( i1, j+2UL, (~C).load(i1,j+2UL) - xmm6 * factor );
10750 (~C).store( i , j+3UL, (~C).load(i ,j+3UL) - xmm7 * factor );
10751 (~C).store( i1, j+3UL, (~C).load(i1,j+3UL) - xmm8 * factor );
// Remaining pairs of columns of the 2-vector panel.
10754 for( ; (j+2UL) <= jend; j+=2UL )
10756 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10757 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10758 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
10759 ( IsUpper<MT5>::value )?( j+2UL ):( ktmp ) ) );
10761 SIMDType xmm1, xmm2, xmm3, xmm4;
10763 for(
size_t k=kbegin; k<kend; ++k ) {
10764 const SIMDType a1( A.load(i ,k) );
10765 const SIMDType a2( A.load(i1,k) );
10766 const SIMDType b1(
set( B(k,j ) ) );
10767 const SIMDType b2(
set( B(k,j+1UL) ) );
10768 xmm1 = xmm1 + a1 * b1;
10769 xmm2 = xmm2 + a2 * b1;
10770 xmm3 = xmm3 + a1 * b2;
10771 xmm4 = xmm4 + a2 * b2;
10774 (~C).store( i , j , (~C).load(i ,j ) - xmm1 * factor );
10775 (~C).store( i1, j , (~C).load(i1,j ) - xmm2 * factor );
10776 (~C).store( i , j+1UL, (~C).load(i ,j+1UL) - xmm3 * factor );
10777 (~C).store( i1, j+1UL, (~C).load(i1,j+1UL) - xmm4 * factor );
// Single leftover column of the 2-vector panel.
10782 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10783 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10784 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE*2UL, ktmp ) ):( ktmp ),
10785 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10787 SIMDType xmm1, xmm2;
10789 for(
size_t k=kbegin; k<kend; ++k ) {
10790 const SIMDType b1(
set( B(k,j) ) );
10791 xmm1 = xmm1 + A.load(i ,k) * b1;
10792 xmm2 = xmm2 + A.load(i1,k) * b1;
10795 (~C).store( i , j, (~C).load(i ,j) - xmm1 * factor );
10796 (~C).store( i1, j, (~C).load(i1,j) - xmm2 * factor );
// --- Row panels of a single SIMD vector, processed column by column ---
10800 for( ; i<ipos; i+=SIMDSIZE )
10802 for(
size_t j=jj; j<jend; ++j )
10804 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10805 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10806 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+SIMDSIZE, ktmp ) ):( ktmp ),
10807 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
10811 for(
size_t k=kbegin; k<kend; ++k ) {
10812 const SIMDType b1(
set( B(k,j) ) );
10813 xmm1 = xmm1 + A.load(i,k) * b1;
10816 (~C).store( i, j, (~C).load(i,j) - xmm1 * factor );
// --- Element-wise remainder: rows ipos..iend-1, only executed when 'remainder' is set ---
10820 for( ; remainder && i<iend; ++i )
10822 for(
size_t j=jj; j<jend; ++j )
10824 const size_t kbegin(
max( ( IsUpper<MT4>::value )?(
max( i, kk ) ):( kk ),
10825 ( IsLower<MT5>::value )?(
max( j, kk ) ):( kk ) ) );
10826 const size_t kend (
min( ( IsLower<MT4>::value )?(
min( i+1UL, ktmp ) ):( ktmp ),
10827 ( IsUpper<MT5>::value )?( j+1UL ):( ktmp ) ) );
// 'value' is a scalar accumulator (its declaration is not visible in this listing).
10831 for(
size_t k=kbegin; k<kend; ++k ) {
10832 value += A(i,k) * B(k,j);
10835 (~C)(i,j) -= value * scalar;
// Fallback overload of the BLAS-based subtraction assignment kernel: selected when
// UseBlasKernel<MT3,MT4,MT5,ST2> does NOT hold (DisableIf_). It forwards all arguments
// unchanged to selectLargeSubAssignKernel().
10858 template<
typename MT3
10862 static inline DisableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
10863 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10865 selectLargeSubAssignKernel( C, A, B, scalar );
10870 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction assignment kernel, enabled when UseBlasKernel<MT3,MT4,MT5,ST2> holds.
//
// Parameters: C is the target matrix, A and B are the left/right operands of the product,
// and scalar is the scaling factor (converted to the element type ET of C before use).
//
// Three cases are visible:
//  - A is triangular: B is copied into a temporary, trmm() is applied to it with A on the
//    left side (CblasLeft) and the scalar, and the temporary is subtracted from C.
//  - B is triangular: the same with A copied and B applied from the right (CblasRight).
//  - otherwise: a single gemm() call with the factors ET(-scalar) and ET(1).
//    NOTE(review): presumably these map to BLAS alpha/beta, i.e. C = -scalar*A*B + C - confirm
//    against the gemm() wrapper's signature.
// NOTE(review): the 'else' that introduces the gemm() branch and the braces are not part of
// this listing.
10884 template<
typename MT3
10888 static inline EnableIf_< UseBlasKernel<MT3,MT4,MT5,ST2> >
10889 selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10891 typedef ElementType_<MT3> ET;
10893 if( IsTriangular<MT4>::value ) {
// Temporary initialized from B; serial() is applied to the operand before the copy.
10894 ResultType_<MT3> tmp(
serial( B ) );
// The lower/upper flag is chosen from the IsLower trait of A.
10895 trmm( tmp, A, CblasLeft, ( IsLower<MT4>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10896 subAssign( C, tmp );
10898 else if( IsTriangular<MT5>::value ) {
// Temporary initialized from A; B is then applied from the right-hand side.
10899 ResultType_<MT3> tmp(
serial( A ) );
10900 trmm( tmp, B, CblasRight, ( IsLower<MT5>::value )?( CblasLower ):( CblasUpper ), ET(scalar) );
10901 subAssign( C, tmp );
// General case: the negated scalar turns the gemm update into a subtraction.
10904 gemm( C, A, B, ET(-scalar), ET(1) );
// SMP assignment of a scaled matrix product (DMatScalarMultExpr) to a dense matrix.
// Only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// Visible steps: the left and right operands of the wrapped product (rhs.matrix_) are
// obtained, two early branches test for an empty target and for an empty inner dimension
// (their bodies are not part of this listing), and finally the expression
// A * B * rhs.scalar_ is SMP-assigned to the target.
// NOTE(review): the declarations of A and B are not visible; presumably they are the
// evaluated forms of 'left' and 'right' - confirm against the full source.
10937 template<
typename MT
10939 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
10940 smpAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
10947 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
10948 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Target has no rows or no columns.
10950 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Inner dimension of the product is zero.
10953 else if( left.columns() == 0UL ) {
10968 smpAssign( ~lhs, A * B * rhs.scalar_ );
// SMP assignment of a scaled matrix product (DMatScalarMultExpr) to a sparse matrix.
// Only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// The expression is first evaluated into a temporary whose type depends on the storage
// order SO of the target: ResultType when SO is true, OppositeType otherwise.
// NOTE(review): the statement that transfers 'tmp' into 'lhs' is not part of this listing.
10987 template<
typename MT
10989 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
10990 smpAssign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
10994 typedef IfTrue_< SO, ResultType, OppositeType > TmpType;
11006 const TmpType tmp( rhs );
// SMP addition assignment of a scaled matrix product (DMatScalarMultExpr) to a dense matrix.
// Only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// Visible steps: the left and right operands of the wrapped product are obtained, then a
// single condition tests whether the target is empty or the inner dimension is zero.
// NOTE(review): the body of that branch and the actual addition assignment are not part of
// this listing.
11026 template<
typename MT
11028 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
11029 smpAddAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11036 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
11037 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Target has no rows/columns, or the product has a zero inner dimension.
11039 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled matrix product (DMatScalarMultExpr) to a dense
// matrix. Only enabled when IsEvaluationRequired<MT,MT1,MT2> holds.
//
// Visible steps: the left and right operands of the wrapped product are obtained, then a
// single condition tests whether the target is empty or the inner dimension is zero.
// NOTE(review): the body of that branch and the actual subtraction assignment are not part
// of this listing.
11076 template<
typename MT
11078 friend inline EnableIf_< IsEvaluationRequired<MT,MT1,MT2> >
11079 smpSubAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11086 LeftOperand_<MMM> left ( rhs.matrix_.leftOperand() );
11087 RightOperand_<MMM> right( rhs.matrix_.rightOperand() );
// Target has no rows/columns, or the product has a zero inner dimension.
11089 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Function template returning a TDMatDMatMultExpr<T1,T2> expression object by const value.
// NOTE(review): the function name and parameter list are not part of this listing;
// presumably this is the operator*() that creates the multiplication expression - confirm.
11171 template<
typename T1
11173 inline const TDMatDMatMultExpr<T1,T2>
// Two template headers over the operand types MT1 and MT2.
// NOTE(review): the declarations they introduce are not part of this listing (presumably
// type trait specializations for the multiplication expression - confirm in the full source).
11197 template<
typename MT1,
typename MT2 >
11214 template<
typename MT1,
typename MT2 >
// Trait specialization deriving from BoolConstant: true only if both operand types MT1 and
// MT2 satisfy IsAligned.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsAligned specialization for the multiplication expression - confirm.
11231 template<
typename MT1,
typename MT2 >
11233 :
public BoolConstant< And< IsAligned<MT1>, IsAligned<MT2> >::value >
// Trait specialization deriving from BoolConstant: true only if both operand types MT1 and
// MT2 satisfy IsLower.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsLower specialization for the multiplication expression - confirm.
11249 template<
typename MT1,
typename MT2 >
11251 :
public BoolConstant< And< IsLower<MT1>, IsLower<MT2> >::value >
// Trait specialization deriving from BoolConstant: true only if both operand types MT1 and
// MT2 satisfy IsUniLower.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsUniLower specialization for the multiplication expression - confirm.
11267 template<
typename MT1,
typename MT2 >
11269 :
public BoolConstant< And< IsUniLower<MT1>, IsUniLower<MT2> >::value >
// Trait specialization deriving from BoolConstant: true if one operand satisfies
// IsStrictlyLower and the other satisfies IsLower (in either order).
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsStrictlyLower specialization for the multiplication expression - confirm.
11285 template<
typename MT1,
typename MT2 >
11287 :
public BoolConstant< Or< And< IsStrictlyLower<MT1>, IsLower<MT2> >
11288 , And< IsStrictlyLower<MT2>, IsLower<MT1> > >::value >
// Trait specialization deriving from BoolConstant: true only if both operand types MT1 and
// MT2 satisfy IsUpper.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsUpper specialization for the multiplication expression - confirm.
11304 template<
typename MT1,
typename MT2 >
11306 :
public BoolConstant< And< IsUpper<MT1>, IsUpper<MT2> >::value >
// Trait specialization deriving from BoolConstant: true only if both operand types MT1 and
// MT2 satisfy IsUniUpper.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsUniUpper specialization for the multiplication expression - confirm.
11322 template<
typename MT1,
typename MT2 >
11324 :
public BoolConstant< And< IsUniUpper<MT1>, IsUniUpper<MT2> >::value >
// Trait specialization deriving from BoolConstant: true if one operand satisfies
// IsStrictlyUpper and the other satisfies IsUpper (in either order).
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// IsStrictlyUpper specialization for the multiplication expression - confirm.
11340 template<
typename MT1,
typename MT2 >
11342 :
public BoolConstant< Or< And< IsStrictlyUpper<MT1>, IsUpper<MT2> >
11343 , And< IsStrictlyUpper<MT2>, IsUpper<MT1> > >::value >
// Expression trait fragment for multiplying the matrix product (MT1 * MT2) with a vector VT.
// If MT1 is a column-major dense matrix, MT2 a row-major dense matrix and VT a dense column
// vector, Type is the result type of MT1 * ( MT2 * VT ), i.e. the inner matrix/vector product
// is formed first.
// NOTE(review): the struct name line and the alternative branch of If_ are not part of this
// listing.
11359 template<
typename MT1,
typename MT2,
typename VT >
11364 using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11365 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
11366 , IsDenseVector<VT>, IsColumnVector<VT> >
11367 , TDMatDVecMultExprTrait_< MT1, DMatDVecMultExprTrait_<MT2,VT> >
// Expression trait fragment for multiplying the matrix product (MT1 * MT2) with a vector VT.
// If MT1 is a column-major dense matrix, MT2 a row-major dense matrix and VT a sparse column
// vector, Type is the result type of MT1 * ( MT2 * VT ), i.e. the inner matrix/vector product
// is formed first.
// NOTE(review): the struct name line and the alternative branch of If_ are not part of this
// listing.
11377 template<
typename MT1,
typename MT2,
typename VT >
11382 using Type = If_< And< IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11383 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2>
11384 , IsSparseVector<VT>, IsColumnVector<VT> >
11385 , TDMatDVecMultExprTrait_< MT1, DMatSVecMultExprTrait_<MT2,VT> >
// Expression trait fragment for multiplying a vector VT with the matrix product (MT1 * MT2).
// If VT is a dense row vector, MT1 a column-major dense matrix and MT2 a row-major dense
// matrix, Type is the result type of ( VT * MT1 ) * MT2, i.e. the vector/matrix product is
// formed first.
// NOTE(review): the struct name line and the alternative branch of If_ are not part of this
// listing.
11395 template<
typename VT,
typename MT1,
typename MT2 >
11400 using Type = If_< And< IsDenseVector<VT>, IsRowVector<VT>
11401 , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11402 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
11403 , TDVecDMatMultExprTrait_< TDVecTDMatMultExprTrait_<VT,MT1>, MT2 >
// Expression trait fragment for multiplying a vector VT with the matrix product (MT1 * MT2).
// If VT is a sparse row vector, MT1 a column-major dense matrix and MT2 a row-major dense
// matrix, Type is the result type of ( VT * MT1 ) * MT2, i.e. the vector/matrix product is
// formed first.
// NOTE(review): the struct name line and the alternative branch of If_ are not part of this
// listing.
11413 template<
typename VT,
typename MT1,
typename MT2 >
11418 using Type = If_< And< IsSparseVector<VT>, IsRowVector<VT>
11419 , IsDenseMatrix<MT1>, IsColumnMajorMatrix<MT1>
11420 , IsDenseMatrix<MT2>, IsRowMajorMatrix<MT2> >
11421 , TDVecDMatMultExprTrait_< TSVecTDMatMultExprTrait_<VT,MT1>, MT2 >
// Expression trait fragment with an additional boolean parameter AF: Type is the result type
// of multiplying the submatrix expression types of 'const MT1' and 'const MT2', both formed
// with the same AF flag.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// SubmatrixExprTrait specialization for the multiplication expression - confirm.
11431 template<
typename MT1,
typename MT2,
bool AF >
11436 using Type = MultExprTrait_< SubmatrixExprTrait_<const MT1,AF>
11437 , SubmatrixExprTrait_<const MT2,AF> >;
// Expression trait fragment: Type is the result type of multiplying the row expression type
// of 'const MT1' with MT2.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// RowExprTrait specialization for the multiplication expression - confirm.
11446 template<
typename MT1,
typename MT2 >
11451 using Type = MultExprTrait_< RowExprTrait_<const MT1>, MT2 >;
// Expression trait fragment: Type is the result type of multiplying MT1 with the column
// expression type of 'const MT2'.
// NOTE(review): the struct name line is not part of this listing; presumably this is the
// ColumnExprTrait specialization for the multiplication expression - confirm.
11460 template<
typename MT1,
typename MT2 >
11465 using Type = MultExprTrait_< MT1, ColumnExprTrait_<const MT2> >;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
If_< IsExpression< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:243
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Constraint on the data type.
Header file for kernel specific block sizes.
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the Rows type trait.
Header file for the IsUniUpper type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7800
Compile time check for triangular matrix types. This type trait tests whether or not the given templat...
Definition: IsTriangular.h:87
Header file for basic type definitions.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:438
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:437
Header file for the ColumnExprTrait class template.
Header file for the IsSame and IsStrictlySame type traits.
IfTrue_< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:249
ElementType_< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:157
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:234
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
Header file for the IsColumnMajorMatrix type trait.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
Header file for the IsRowVector type trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:418
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:338
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template. The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the IsUniLower type trait.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions. The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:408
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template. The IfTrue_ alias declaration provides a co...
Definition: If.h:109
Header file for the IsComplexDouble type trait.
ElementType_< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:128
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions. The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
TDMatDMatMultExpr< MT1, MT2 > This
Type of this TDMatDMatMultExpr instance.
Definition: TDMatDMatMultExpr.h:230
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Header file for the TSVecTDMatMultExprTrait class template.
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the Or class template.
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:233
Header file for the TDMatSVecMultExprTrait class template.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
Header file for the Columns type trait.
Header file for the Not class template.
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions. The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:364
Header file for the DMatDVecMultExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
CompositeType_< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:159
Header file for the IsStrictlyTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
BLAZE_ALWAYS_INLINE MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:254
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:374
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:428
Header file for the RowExprTrait class template.
Header file for all forward declarations for expression class templates.
Header file for the IsDenseMatrix type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:236
OppositeType_< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:232
CompositeType_< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:158
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
#define BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:93
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:109
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
Header file for the IsSparseVector type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for the MatScalarMultExpr base class.
Header file for run time assertion macros.
Utility type for generic codes.
If_< IsExpression< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:240
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template. The If_ alias declaration provides a convenient...
Definition: If.h:160
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:275
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:296
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:354
Constraints on the storage order of matrix types.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:237
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:235
typename T::OppositeType OppositeType_
Alias declaration for nested OppositeType type definitions. The OppositeType_ alias declaration provid...
Definition: Aliases.h:243
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Header file for the IsDenseVector type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:396
ResultType_< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
Header file for the AreSIMDCombinable type trait.
Header file for the IsRowMajorMatrix type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:384
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
IfTrue_< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:246
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
Header file for the TDVecDMatMultExprTrait class template.
MultTrait_< RT1, RT2 > ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:231
Header file for the TDMatDVecMultExprTrait class template.
Header file for BLAS general matrix/matrix multiplication functions (gemm)
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:290
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions. The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the DMatSVecMultExprTrait class template.
Header file for the IsColumnVector type trait.
Constraint on the data type.
ResultType_< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the TDVecTDMatMultExprTrait class template.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.