// Include guard of this header, directly followed by the opening of the
// TDMatDMatMultExpr class template. Only part of the template parameter list
// is visible in this excerpt; the base-class list below refers to the
// parameters MT1, MT2, SF, HF, LF and UF.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 140 template<
typename MT1
146 class TDMatDMatMultExpr
147 :
// Public base: MatMatMultExpr wrapped around DenseMatrix, with this class
// passed as the template argument of its own base.
// NOTE(review): the trailing `true` is presumably the storage-order flag - confirm.
public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
148 ,
// Private base: Computation.
private Computation
// True when the left operand type MT1 is a computation or requires evaluation.
162 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// True when the right operand type MT2 is a computation or requires evaluation.
167 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the SF/HF/LF/UF template flags.
// NOTE(review): the names suggest symmetric / Hermitian / lower / upper result
// structure - confirm against the class documentation.
// SYM: SF is set and none of HF, LF, UF is set.
171 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set.
172 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is combined with SF or HF.
173 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is combined with SF or HF.
174 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Template header of a helper whose declaration body is not visible in this
// excerpt.
183 template<
typename T1,
typename T2,
typename T3 >
// UseBlasKernel_v<T1,T2,T3>: compile-time predicate over three matrix types.
// The visible conjuncts require that
//  - all three types are contiguous, T1 offers mutable data access and
//    T2/T3 offer const data access,
//  - neither T2 nor T3 is diagonal,
//  - all three types are SIMD enabled,
//  - all three element types are BLAS compatible.
// Further parts of the expression (before the first and after the last
// visible conjunct) are elided from this excerpt.
193 template<
typename T1,
typename T2,
typename T3 >
194 static constexpr
bool UseBlasKernel_v =
197 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
198 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
199 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
200 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
201 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
202 IsBLASCompatible_v< ElementType_t<T1> > &&
203 IsBLASCompatible_v< ElementType_t<T2> > &&
204 IsBLASCompatible_v< ElementType_t<T3> > &&
// UseVectorizedDefaultKernel_v<T1,T2,T3>: compile-time predicate over three
// matrix types. The visible conjuncts require that optimized kernels are
// enabled (useOptimizedKernels), that neither T2 nor T3 is diagonal, and that
// all three types are SIMD enabled. The expression continues with an
// IsSIMDCombinable_v check whose remainder is not visible in this excerpt.
215 template<
typename T1,
typename T2,
typename T3 >
216 static constexpr
bool UseVectorizedDefaultKernel_v =
217 ( useOptimizedKernels &&
218 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
219 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
220 IsSIMDCombinable_v< ElementType_t<T1>
// Tail of a compile-time boolean expression whose left-hand side is not
// visible in this excerpt (NOTE(review): presumably the class's own
// `simdEnabled` flag - confirm). It holds when MT1 and MT2 are not both
// diagonal, both are SIMD enabled, and SIMD addition and multiplication are
// available for the element types ET1 and ET2.
291 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
292 MT1::simdEnabled && MT2::simdEnabled &&
293 HasSIMDAdd_v<ET1,ET2> &&
294 HasSIMDMult_v<ET1,ET2> );
// Fragment of a routine that distinguishes three structural cases; the
// enclosing signature and the branch bodies are not visible in this excerpt.
// NOTE(review): presumably the element access for indices (i,j) - confirm.
// Case 1: the left operand type is diagonal.
331 if( IsDiagonal_v<MT1> ) {
// Case 2: the right operand type is diagonal.
334 else if( IsDiagonal_v<MT2> ) {
// Case 3: at least one operand type is triangular; an index range
// [begin,end) is derived from the triangular structure.
337 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
// begin: an upper MT1 contributes i (i+1 if strictly upper), a lower MT2
// contributes j (j+1 if strictly lower); if both apply the larger value is
// taken. The fallback for the remaining case is not visible here.
338 const size_t begin( ( IsUpper_v<MT1> )
339 ?( ( IsLower_v<MT2> )
340 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
341 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
342 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
343 :( ( IsLower_v<MT2> )
344 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
// end: a lower MT1 contributes i+1 (i if strictly lower), an upper MT2
// contributes j+1 (j if strictly upper); if both apply the smaller value is
// taken. Without either restriction the range ends at lhs_.columns().
346 const size_t end( ( IsLower_v<MT1> )
347 ?( ( IsUpper_v<MT2> )
348 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
349 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
350 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
351 :( ( IsUpper_v<MT2> )
352 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
353 :(
lhs_.columns() ) ) );
// Index checks of a routine whose signature is not visible in this excerpt:
// i is compared against the row count of the left operand and j against the
// column count of the right operand. The reactions to a failed check are not
// visible. NOTE(review): presumably a checked element access - confirm.
377 if( i >=
lhs_.rows() ) {
380 if( j >=
rhs_.columns() ) {
// rows(): non-throwing const accessor returning a size_t; its body is not
// visible in this excerpt.
392 inline size_t rows() const noexcept {
// Return statement of a following accessor (signature not visible): yields
// the column count of the right operand.
403 return rhs_.columns();
// canAlias(): reports whether the expression can alias with the given address.
//   alias - address to check against.
//   returns true if either operand (lhs_ or rhs_) reports being aliased with
//   `alias`; the check is delegated to the operands' isAliased() functions.
433 template<
typename T >
434 inline bool canAlias(
const T* alias )
const noexcept {
435 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// isAliased(): reports whether the expression is aliased with the given address.
//   alias - address to check against.
//   returns true if either operand (lhs_ or rhs_) reports being aliased with
//   `alias`.
445 template<
typename T >
446 inline bool isAliased(
const T* alias )
const noexcept {
447 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Return statement of an accessor whose signature is not visible in this
// excerpt: true only if both operands report being aligned.
457 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of a boolean return expression; its beginning and the enclosing
// signature are not visible in this excerpt.
// NOTE(review): presumably the SMP-assignment predicate - confirm.
// Visible structure: a disjunction containing "BLAS matrix/matrix
// multiplication is not used" and "rows()*columns() is below
// TDMATDMATMULT_THRESHOLD", AND-combined with "rows()*columns() is at least
// SMP_TDMATDMATMULT_THRESHOLD" and "neither operand type is diagonal".
468 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
470 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
471 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
472 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
// Fragment of a function template taking a target `lhs` and an expression
// `rhs`; most of the signature and the branch bodies are not visible in this
// excerpt. NOTE(review): presumably the assignment of the product to a dense
// matrix - confirm.
495 template<
typename MT
// Special case: the target has no rows or no columns.
504 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the left operand of the expression has no columns.
507 else if( rhs.lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B (their definitions
// are not visible here) to the kernel selection.
522 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// selectAssignKernel(): chooses the kernel used to compute C from A and B.
//   C - target matrix, A - left operand, B - right operand.
// The small kernel is selected if any of the following holds:
//   - both operand types are diagonal,
//   - not in debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - not in debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - C.rows()*C.columns() is below TDMATDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel is used (the `else` line itself is not visible in
// this excerpt).
538 template<
typename MT3
541 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
543 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
544 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
545 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
546 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
547 selectSmallAssignKernel( C, A, B );
549 selectBlasAssignKernel( C, A, B );
// selectDefaultAssignKernel() - overload for a row-major target and two
// non-diagonal operands (see the EnableIf_t condition).
//   C - target matrix, A - left operand, B - right operand.
// The kernel walks the rows of C. Per row the first contributing k assigns
// C(i,j) = A(i,k)*B(k,j), all further k accumulate with +=. Several lines
// (range fallbacks, reset-loop bodies, guards) are elided in this excerpt.
568 template<
typename MT3
571 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
572 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N: M rows of A, N columns of B, K columns of A.
574 const size_t M( A.rows() );
575 const size_t N( B.columns() );
576 const size_t K( A.columns() );
580 for(
size_t i=0UL; i<M; ++i )
// Range of k for row i, derived from the triangular structure of A:
// an upper A starts at i (i+1 if strictly upper), a lower A ends at i+1
// (i if strictly lower). The fallback values are not visible here.
582 const size_t kbegin( ( IsUpper_v<MT4> )
583 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
585 const size_t kend( ( IsLower_v<MT4> )
586 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Empty k range for a strictly triangular A: the row is handled by a loop over
// all columns whose body is not visible (NOTE(review): presumably a reset).
590 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
591 for(
size_t j=0UL; j<N; ++j ) {
// Column range [jbegin,jend) for the first k (= kbegin). An upper B starts at
// kbegin (kbegin+1 if strictly upper); with UPP the start is additionally
// raised to at least i.
598 const size_t jbegin( ( IsUpper_v<MT5> )
599 ?( ( IsStrictlyUpper_v<MT5> )
600 ?(
UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
601 :(
UPP ?
max(i,kbegin) : kbegin ) )
602 :(
UPP ? i : 0UL ) );
// A lower B ends at kbegin+1 (kbegin if strictly lower); with LOW the end is
// additionally limited to at most i+1. Without a lower B the end is N (or i+1
// with LOW).
603 const size_t jend( ( IsLower_v<MT5> )
604 ?( ( IsStrictlyLower_v<MT5> )
605 ?(
LOW ?
min(i+1UL,kbegin) : kbegin )
606 :(
LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
607 :(
LOW ? i+1UL : N ) );
// Columns in front of jbegin (loop body not visible).
609 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
610 for(
size_t j=0UL; j<jbegin; ++j ) {
614 else if( IsStrictlyUpper_v<MT5> ) {
// First contribution: plain assignment initializes C(i,j).
617 for(
size_t j=jbegin; j<jend; ++j ) {
618 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards (loop body not visible).
620 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
621 for(
size_t j=jend; j<N; ++j ) {
625 else if( IsStrictlyLower_v<MT5> ) {
// Remaining contributions k = kbegin+1 .. kend-1.
630 for(
size_t k=kbegin+1UL; k<kend; ++k )
// Column range for this k (the jbegin expression is only partly visible).
632 const size_t jbegin( ( IsUpper_v<MT5> )
633 ?( ( IsStrictlyUpper_v<MT5> )
// For a lower B the accumulation range ends at k (k-1 if strictly lower),
// limited to i+1 with LOW.
637 const size_t jend( ( IsLower_v<MT5> )
638 ?( ( IsStrictlyLower_v<MT5> )
639 ?(
LOW ?
min(i+1UL,k-1UL) : k-1UL )
640 :(
LOW ?
min(i+1UL,k) : k ) )
641 :(
LOW ? i+1UL : N ) );
// With any structure flag set an inverted range is skipped.
643 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
646 for(
size_t j=jbegin; j<jend; ++j ) {
647 C(i,j) += A(i,k) * B(k,j);
// For a lower B the element in column jend is assigned, not accumulated.
// NOTE(review): presumably because this k is its first contribution - confirm.
649 if( IsLower_v<MT5> ) {
650 C(i,jend) = A(i,k) * B(k,jend);
// Mirror step (its guard is not visible): every element below the diagonal is
// copied from its transposed counterpart, conjugated when HERM is set.
656 for(
size_t i=1UL; i<M; ++i ) {
657 for(
size_t j=0UL; j<i; ++j ) {
658 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// selectDefaultAssignKernel() - overload for a column-major target and two
// non-diagonal operands (see the EnableIf_t condition).
//   C - target matrix, A - left operand, B - right operand.
// The kernel walks the columns of C. Per column the first contributing k
// assigns C(i,j) = A(i,k)*B(k,j), all further k accumulate with +=. Several
// lines (range fallbacks, reset-loop bodies, guards) are elided in this excerpt.
680 template<
typename MT3
683 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
684 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N: M rows of A, N columns of B, K columns of A.
686 const size_t M( A.rows() );
687 const size_t N( B.columns() );
688 const size_t K( A.columns() );
692 for(
size_t j=0UL; j<N; ++j )
// Range of k for column j, derived from the triangular structure of B:
// a lower B starts at j (j+1 if strictly lower), an upper B ends at j+1
// (j if strictly upper). The fallback values are not visible here.
694 const size_t kbegin( ( IsLower_v<MT5> )
695 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
697 const size_t kend( ( IsUpper_v<MT5> )
698 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Empty k range for a strictly triangular B: the column is handled by a loop
// over all rows whose body is not visible (NOTE(review): presumably a reset).
702 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
703 for(
size_t i=0UL; i<M; ++i ) {
// Row range [ibegin,iend) for the first k (= kbegin). A lower A starts at
// kbegin (kbegin+1 if strictly lower); with LOW the start is additionally
// raised to at least j.
710 const size_t ibegin( ( IsLower_v<MT4> )
711 ?( ( IsStrictlyLower_v<MT4> )
712 ?(
LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
713 :(
LOW ?
max(j,kbegin) : kbegin ) )
714 :(
LOW ? j : 0UL ) );
// An upper A ends at kbegin+1 (kbegin if strictly upper); with UPP the end is
// additionally limited to at most j+1. Without an upper A the end is M (or j+1
// with UPP).
715 const size_t iend( ( IsUpper_v<MT4> )
716 ?( ( IsStrictlyUpper_v<MT4> )
717 ?(
UPP ?
min(j+1UL,kbegin) : kbegin )
718 :(
UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
719 :(
UPP ? j+1UL : M ) );
// Rows in front of ibegin (loop body not visible).
721 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
722 for(
size_t i=0UL; i<ibegin; ++i ) {
726 else if( IsStrictlyLower_v<MT4> ) {
// First contribution: plain assignment initializes C(i,j).
729 for(
size_t i=ibegin; i<iend; ++i ) {
730 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend onwards (loop body not visible).
732 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
733 for(
size_t i=iend; i<M; ++i ) {
737 else if( IsStrictlyUpper_v<MT4> ) {
// Remaining contributions k = kbegin+1 .. kend-1.
742 for(
size_t k=kbegin+1UL; k<kend; ++k )
// Row range for this k (the ibegin expression is only partly visible).
744 const size_t ibegin( ( IsLower_v<MT4> )
745 ?( ( IsStrictlyLower_v<MT4> )
// For an upper A the accumulation range ends at k (k-1 if strictly upper),
// limited to j+1 with UPP.
749 const size_t iend( ( IsUpper_v<MT4> )
750 ?( ( IsStrictlyUpper_v<MT4> )
751 ?(
UPP ?
min(j+1UL,k-1UL) : k-1UL )
752 :(
UPP ?
min(j+1UL,k) : k ) )
753 :(
UPP ? j+1UL : M ) );
// With any structure flag set an inverted range is skipped.
755 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
758 for(
size_t i=ibegin; i<iend; ++i ) {
759 C(i,j) += A(i,k) * B(k,j);
// For an upper A the element in row iend is assigned, not accumulated.
// NOTE(review): presumably because this k is its first contribution - confirm.
761 if( IsUpper_v<MT4> ) {
762 C(iend,j) = A(iend,k) * B(k,j);
// Mirror step (its guard is not visible): every element above the diagonal is
// copied from its transposed counterpart, conjugated when HERM is set.
768 for(
size_t j=1UL; j<N; ++j ) {
769 for(
size_t i=0UL; i<j; ++i ) {
770 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// selectDefaultAssignKernel() - overload for a row-major target, a
// non-diagonal left operand and a diagonal right operand (see EnableIf_t).
//   C - target matrix, A - left operand, B - diagonal right operand.
// Since only the diagonal of B is used, each element is the single product
// C(i,j) = A(i,j) * B(j,j). The iteration is tiled in blocks of BLOCK_SIZE
// rows and columns.
792 template<
typename MT3
795 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
796 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
798 constexpr
size_t block( BLOCK_SIZE );
800 const size_t M( A.rows() );
801 const size_t N( B.columns() );
// Tile loops: [ii,iend) x [jj,jend), clipped to the matrix dimensions.
803 for(
size_t ii=0UL; ii<M; ii+=block ) {
804 const size_t iend(
min( M, ii+block ) );
805 for(
size_t jj=0UL; jj<N; jj+=block ) {
806 const size_t jend(
min( N, jj+block ) );
807 for(
size_t i=ii; i<iend; ++i )
// Column range within the tile restricted by the structure of A: an upper A
// starts at i (i+1 if strictly upper) but not before jj; a lower A ends at i+1
// (i if strictly lower) but not after jend. Fallbacks are not visible here.
809 const size_t jbegin( ( IsUpper_v<MT4> )
810 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
812 const size_t jpos( ( IsLower_v<MT4> )
813 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
// Tile columns in front of jbegin (loop body not visible).
816 if( IsUpper_v<MT4> ) {
817 for(
size_t j=jj; j<jbegin; ++j ) {
821 for(
size_t j=jbegin; j<jpos; ++j ) {
822 C(i,j) = A(i,j) * B(j,j);
// Tile columns from jpos onwards (loop body not visible).
824 if( IsLower_v<MT4> ) {
825 for(
size_t j=jpos; j<jend; ++j ) {
// selectDefaultAssignKernel() - overload for a column-major target, a
// non-diagonal left operand and a diagonal right operand (see EnableIf_t).
//   C - target matrix, A - left operand, B - diagonal right operand.
// Each element is the single product C(i,j) = A(i,j) * B(j,j); the columns are
// processed one after another without tiling.
850 template<
typename MT3
853 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
854 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
856 const size_t M( A.rows() );
857 const size_t N( B.columns() );
859 for(
size_t j=0UL; j<N; ++j )
// Row range for column j restricted by the structure of A: a lower A starts at
// j (j+1 if strictly lower), an upper A ends at j+1 (j if strictly upper).
// Fallbacks are not visible here.
861 const size_t ibegin( ( IsLower_v<MT4> )
862 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
864 const size_t iend( ( IsUpper_v<MT4> )
865 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows in front of ibegin (loop body not visible).
869 if( IsLower_v<MT4> ) {
870 for(
size_t i=0UL; i<ibegin; ++i ) {
874 for(
size_t i=ibegin; i<iend; ++i ) {
875 C(i,j) = A(i,j) * B(j,j);
// Rows from iend onwards (loop body not visible).
877 if( IsUpper_v<MT4> ) {
878 for(
size_t i=iend; i<M; ++i ) {
// selectDefaultAssignKernel() - overload for a row-major target, a diagonal
// left operand and a non-diagonal right operand (see EnableIf_t).
//   C - target matrix, A - diagonal left operand, B - right operand.
// Each element is the single product C(i,j) = A(i,i) * B(i,j); the rows are
// processed one after another without tiling.
901 template<
typename MT3
904 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
905 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
907 const size_t M( A.rows() );
908 const size_t N( B.columns() );
910 for(
size_t i=0UL; i<M; ++i )
// Column range for row i restricted by the structure of B: an upper B starts
// at i (i+1 if strictly upper), a lower B ends at i+1 (i if strictly lower).
// Fallbacks are not visible here.
912 const size_t jbegin( ( IsUpper_v<MT5> )
913 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
915 const size_t jend( ( IsLower_v<MT5> )
916 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Columns in front of jbegin (loop body not visible).
920 if( IsUpper_v<MT5> ) {
921 for(
size_t j=0UL; j<jbegin; ++j ) {
925 for(
size_t j=jbegin; j<jend; ++j ) {
926 C(i,j) = A(i,i) * B(i,j);
// Columns from jend onwards (loop body not visible).
928 if( IsLower_v<MT5> ) {
929 for(
size_t j=jend; j<N; ++j ) {
// selectDefaultAssignKernel() - overload for a column-major target, a diagonal
// left operand and a non-diagonal right operand (see EnableIf_t).
//   C - target matrix, A - diagonal left operand, B - right operand.
// Each element is the single product C(i,j) = A(i,i) * B(i,j). The iteration
// is tiled in blocks of BLOCK_SIZE columns and rows.
952 template<
typename MT3
955 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
956 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
958 constexpr
size_t block( BLOCK_SIZE );
960 const size_t M( A.rows() );
961 const size_t N( B.columns() );
// Tile loops: [jj,jend) x [ii,iend), clipped to the matrix dimensions.
963 for(
size_t jj=0UL; jj<N; jj+=block ) {
964 const size_t jend(
min( N, jj+block ) );
965 for(
size_t ii=0UL; ii<M; ii+=block ) {
966 const size_t iend(
min( M, ii+block ) );
967 for(
size_t j=jj; j<jend; ++j )
// Row range within the tile restricted by the structure of B: a lower B starts
// at j (j+1 if strictly lower) but not before ii; an upper B ends at j+1
// (j if strictly upper) but not after iend. Fallbacks are not visible here.
969 const size_t ibegin( ( IsLower_v<MT5> )
970 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
972 const size_t ipos( ( IsUpper_v<MT5> )
973 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
// Tile rows in front of ibegin (loop body not visible).
976 if( IsLower_v<MT5> ) {
977 for(
size_t i=ii; i<ibegin; ++i ) {
981 for(
size_t i=ibegin; i<ipos; ++i ) {
982 C(i,j) = A(i,i) * B(i,j);
// Tile rows from ipos onwards (loop body not visible).
984 if( IsUpper_v<MT5> ) {
985 for(
size_t i=ipos; i<iend; ++i ) {
// selectDefaultAssignKernel() - overload for two diagonal operands (see
// EnableIf_t); it does not depend on the storage order of the target.
//   C - target matrix, A - diagonal left operand, B - diagonal right operand.
// Only the diagonal is computed: C(i,i) = A(i,i) * B(i,i) for every row of A.
// Lines between the signature and the loop are elided in this excerpt.
1010 template<
typename MT3
1013 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1014 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1018 for(
size_t i=0UL; i<A.rows(); ++i ) {
1019 C(i,i) = A(i,i) * B(i,i);
// selectSmallAssignKernel() - fallback overload, active whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> does not hold (DisableIf_t).
//   C - target matrix, A - left operand, B - right operand.
// It simply forwards to the default assignment kernel.
1039 template<
typename MT3
1042 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1043 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1045 selectDefaultAssignKernel( C, A, B );
// selectSmallAssignKernel() - vectorized overload for a row-major target,
// active when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds (EnableIf_t).
//   C - target matrix, A - left operand, B - right operand.
// The kernel computes C in column blocks that are a multiple of SIMDSIZE wide.
// For every block it accumulates products of a broadcast element of A
// (set( A(i,k) )) with SIMD loads from row k of B and stores the accumulators
// into C. The visible sections use blocks of 8, 5, 4, 3, 2 and 1 SIMD vectors,
// followed by a scalar loop for leftover columns and a mirror step.
// NOTE: the enclosing loops over the column index j, most closing braces and
// several statements are elided in this excerpt.
1065 template<
typename MT3
1068 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1069 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Leftover columns have to be handled separately unless both C and B are padded.
1071 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
// M x K times K x N: M rows of A, N columns of B, K columns of A.
1073 const size_t M( A.rows() );
1074 const size_t N( B.columns() );
1075 const size_t K( A.columns() );
// With remainder handling jpos is N masked with size_t(-SIMDSIZE), otherwise N.
// NOTE(review): the mask rounds N down to a multiple of SIMDSIZE provided
// SIMDSIZE is a power of two - confirm.
1079 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Branch for integral element types (body not visible in this excerpt).
1089 if( IsIntegral_v<ElementType> )
// --- Blocks of 8 SIMD vectors, one row at a time ------------------------
1092 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend restrict the summation over k by the triangular structure of A
// and B: an upper A starts at i (i+1 if strictly upper), a lower B starts at
// j; a lower A ends at i+1 (i if strictly lower), an upper B ends at the end
// of the current column block (here j+SIMDSIZE*8), never beyond K. The same
// pattern, with adapted row counts and block widths, recurs in every section.
1094 const size_t kbegin( ( IsUpper_v<MT4> )
1095 ?( ( IsLower_v<MT5> )
1096 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1097 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1098 :( IsLower_v<MT5> ? j : 0UL ) );
1099 const size_t kend( ( IsLower_v<MT4> )
1100 ?( ( IsUpper_v<MT5> )
1101 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
1102 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1103 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMD vector of the block.
1105 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1107 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast A(i,k) and multiply it with eight consecutive vectors of row k of B.
1108 const SIMDType a1(
set( A(i,k) ) );
1109 xmm1 += a1 * B.load(k,j );
1110 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1111 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1112 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1113 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1114 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
1115 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
1116 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write the accumulated block into row i of C.
1119 C.store( i, j , xmm1 );
1121 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1122 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1123 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
1124 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
1125 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
1126 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// --- Blocks of 5 SIMD vectors, two rows at a time -----------------------
1135 for( ; (i+2UL) <= M; i+=2UL )
1137 const size_t kbegin( ( IsUpper_v<MT4> )
1138 ?( ( IsLower_v<MT5> )
1139 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1140 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1141 :( IsLower_v<MT5> ? j : 0UL ) );
1142 const size_t kend( ( IsLower_v<MT4> )
1143 ?( ( IsUpper_v<MT5> )
1144 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
1145 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1146 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// xmm1..xmm5 belong to row i, xmm6..xmm10 to row i+1 (see the stores below).
1148 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1150 for(
size_t k=kbegin; k<kend; ++k ) {
1151 const SIMDType a1(
set( A(i ,k) ) );
1152 const SIMDType a2(
set( A(i+1UL,k) ) );
1170 C.store( i , j , xmm1 );
1172 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1173 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1174 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
1175 C.store( i+1UL, j , xmm6 );
1176 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
1177 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
1178 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
1179 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// --- Blocks of 5 SIMD vectors, single remaining row ---------------------
1184 const size_t kbegin( ( IsUpper_v<MT4> )
1185 ?( ( IsLower_v<MT5> )
1186 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1187 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1188 :( IsLower_v<MT5> ? j : 0UL ) );
1189 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
1191 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1193 for(
size_t k=kbegin; k<kend; ++k ) {
1194 const SIMDType a1(
set( A(i,k) ) );
1195 xmm1 += a1 * B.load(k,j );
1196 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1197 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1198 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1199 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1202 C.store( i, j , xmm1 );
1204 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1205 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1206 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// --- Blocks of 4 SIMD vectors --------------------------------------------
// With LOW the row loop starts at row j instead of row 0. The upper row bound
// iend is defined on a line that is not visible in this excerpt.
1213 size_t i(
LOW ? j : 0UL );
// Two rows at a time.
1215 for( ; (i+2UL) <= iend; i+=2UL )
1217 const size_t kbegin( ( IsUpper_v<MT4> )
1218 ?( ( IsLower_v<MT5> )
1219 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1220 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1221 :( IsLower_v<MT5> ? j : 0UL ) );
1222 const size_t kend( ( IsLower_v<MT4> )
1223 ?( ( IsUpper_v<MT5> )
1224 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
1225 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1226 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1 (see the stores below).
1228 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1230 for(
size_t k=kbegin; k<kend; ++k ) {
1231 const SIMDType a1(
set( A(i ,k) ) );
1232 const SIMDType a2(
set( A(i+1UL,k) ) );
1247 C.store( i , j , xmm1 );
1249 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1250 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1251 C.store( i+1UL, j , xmm5 );
1252 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
1253 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
1254 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single remaining row of the 4-vector block.
1259 const size_t kbegin( ( IsUpper_v<MT4> )
1260 ?( ( IsLower_v<MT5> )
1261 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1262 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1263 :( IsLower_v<MT5> ? j : 0UL ) );
1264 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
1268 for(
size_t k=kbegin; k<kend; ++k ) {
1269 const SIMDType a1(
set( A(i,k) ) );
1270 xmm1 += a1 * B.load(k,j );
1271 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1272 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1273 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1276 C.store( i, j , xmm1 );
1278 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1279 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// --- Blocks of 3 SIMD vectors --------------------------------------------
1286 size_t i(
LOW ? j : 0UL );
// Two rows at a time.
1288 for( ; (i+2UL) <= iend; i+=2UL )
1290 const size_t kbegin( ( IsUpper_v<MT4> )
1291 ?( ( IsLower_v<MT5> )
1292 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1293 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1294 :( IsLower_v<MT5> ? j : 0UL ) );
1295 const size_t kend( ( IsLower_v<MT4> )
1296 ?( ( IsUpper_v<MT5> )
1297 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
1298 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1299 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
// xmm1..xmm3 belong to row i, xmm4..xmm6 to row i+1 (see the stores below).
1301 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1303 for(
size_t k=kbegin; k<kend; ++k ) {
1304 const SIMDType a1(
set( A(i ,k) ) );
1305 const SIMDType a2(
set( A(i+1UL,k) ) );
1317 C.store( i , j , xmm1 );
1319 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1320 C.store( i+1UL, j , xmm4 );
1321 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
1322 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single remaining row of the 3-vector block.
1327 const size_t kbegin( ( IsUpper_v<MT4> )
1328 ?( ( IsLower_v<MT5> )
1329 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1330 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1331 :( IsLower_v<MT5> ? j : 0UL ) );
1332 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
1336 for(
size_t k=kbegin; k<kend; ++k ) {
1337 const SIMDType a1(
set( A(i,k) ) );
1338 xmm1 += a1 * B.load(k,j );
1339 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1340 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1343 C.store( i, j , xmm1 );
1345 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// --- Blocks of 2 SIMD vectors --------------------------------------------
1352 size_t i(
LOW ? j : 0UL );
// Four rows at a time.
1354 for( ; (i+4UL) <= iend; i+=4UL )
1356 const size_t kbegin( ( IsUpper_v<MT4> )
1357 ?( ( IsLower_v<MT5> )
1358 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1359 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1360 :( IsLower_v<MT5> ? j : 0UL ) );
1361 const size_t kend( ( IsLower_v<MT4> )
1362 ?( ( IsUpper_v<MT5> )
1363 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
1364 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1365 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
// Two accumulators per row: (xmm1,xmm2) row i ... (xmm7,xmm8) row i+3.
1367 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1369 for(
size_t k=kbegin; k<kend; ++k ) {
1370 const SIMDType a1(
set( A(i ,k) ) );
1371 const SIMDType a2(
set( A(i+1UL,k) ) );
1372 const SIMDType a3(
set( A(i+2UL,k) ) );
1373 const SIMDType a4(
set( A(i+3UL,k) ) );
1386 C.store( i , j , xmm1 );
1388 C.store( i+1UL, j , xmm3 );
1389 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1390 C.store( i+2UL, j , xmm5 );
1391 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1392 C.store( i+3UL, j , xmm7 );
1393 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Three rows at a time.
1396 for( ; (i+3UL) <= iend; i+=3UL )
1398 const size_t kbegin( ( IsUpper_v<MT4> )
1399 ?( ( IsLower_v<MT5> )
1400 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1401 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1402 :( IsLower_v<MT5> ? j : 0UL ) );
1403 const size_t kend( ( IsLower_v<MT4> )
1404 ?( ( IsUpper_v<MT5> )
1405 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
1406 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1407 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1409 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1411 for(
size_t k=kbegin; k<kend; ++k ) {
1412 const SIMDType a1(
set( A(i ,k) ) );
1413 const SIMDType a2(
set( A(i+1UL,k) ) );
1414 const SIMDType a3(
set( A(i+2UL,k) ) );
1425 C.store( i , j , xmm1 );
1427 C.store( i+1UL, j , xmm3 );
1428 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1429 C.store( i+2UL, j , xmm5 );
1430 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two rows at a time; here the k loop is additionally unrolled by two, using a
// second set of accumulators (xmm5..xmm8) that is added in when storing.
1433 for( ; (i+2UL) <= iend; i+=2UL )
1435 const size_t kbegin( ( IsUpper_v<MT4> )
1436 ?( ( IsLower_v<MT5> )
1437 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1438 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1439 :( IsLower_v<MT5> ? j : 0UL ) );
1440 const size_t kend( ( IsLower_v<MT4> )
1441 ?( ( IsUpper_v<MT5> )
1442 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
1443 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1444 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1446 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Pairs of k values (k, k+1).
1449 for( ; (k+2UL) <= kend; k+=2UL ) {
1450 const SIMDType a1(
set( A(i ,k ) ) );
1451 const SIMDType a2(
set( A(i+1UL,k ) ) );
1452 const SIMDType a3(
set( A(i ,k+1UL) ) );
1453 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
1454 const SIMDType b1( B.load(k ,j ) );
1456 const SIMDType b3( B.load(k+1UL,j ) );
// Leftover single k value.
1468 for( ; k<kend; ++k ) {
1469 const SIMDType a1(
set( A(i ,k) ) );
1470 const SIMDType a2(
set( A(i+1UL,k) ) );
// Combine both accumulator sets while storing.
1479 C.store( i , j , xmm1+xmm5 );
1480 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
1481 C.store( i+1UL, j , xmm3+xmm7 );
1482 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Single remaining row of the 2-vector block, k unrolled by two.
1487 const size_t kbegin( ( IsUpper_v<MT4> )
1488 ?( ( IsLower_v<MT5> )
1489 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1490 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1491 :( IsLower_v<MT5> ? j : 0UL ) );
1492 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
1497 for( ; (k+2UL) <= kend; k+=2UL ) {
1498 const SIMDType a1(
set( A(i,k ) ) );
1499 const SIMDType a2(
set( A(i,k+1UL) ) );
1500 xmm1 += a1 * B.load(k ,j );
1501 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
1502 xmm3 += a2 * B.load(k+1UL,j );
1503 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
1506 for( ; k<kend; ++k ) {
1507 const SIMDType a1(
set( A(i,k) ) );
1508 xmm1 += a1 * B.load(k,j );
1512 C.store( i, j , xmm1+xmm3 );
1513 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// --- Blocks of a single SIMD vector ---------------------------------------
1520 size_t i(
LOW ? j : 0UL );
// Four rows at a time, k unrolled by two (xmm5..xmm8 hold the k+1 products).
1522 for( ; (i+4UL) <= iend; i+=4UL )
1524 const size_t kbegin( ( IsUpper_v<MT4> )
1525 ?( ( IsLower_v<MT5> )
1526 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1527 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1528 :( IsLower_v<MT5> ? j : 0UL ) );
1529 const size_t kend( ( IsLower_v<MT4> )
1530 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1533 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1536 for( ; (k+2UL) <= kend; k+=2UL ) {
1538 const SIMDType b2( B.load(k+1UL,j) );
1539 xmm1 +=
set( A(i ,k ) ) * b1;
1540 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1541 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1542 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1543 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1544 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1545 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1546 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1549 for( ; k<kend; ++k ) {
1551 xmm1 +=
set( A(i ,k) ) * b1;
1552 xmm2 +=
set( A(i+1UL,k) ) * b1;
1553 xmm3 +=
set( A(i+2UL,k) ) * b1;
1554 xmm4 +=
set( A(i+3UL,k) ) * b1;
1557 C.store( i , j, xmm1+xmm5 );
1558 C.store( i+1UL, j, xmm2+xmm6 );
1559 C.store( i+2UL, j, xmm3+xmm7 );
1560 C.store( i+3UL, j, xmm4+xmm8 );
// Three rows at a time, k unrolled by two.
1563 for( ; (i+3UL) <= iend; i+=3UL )
1565 const size_t kbegin( ( IsUpper_v<MT4> )
1566 ?( ( IsLower_v<MT5> )
1567 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1568 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1569 :( IsLower_v<MT5> ? j : 0UL ) );
1570 const size_t kend( ( IsLower_v<MT4> )
1571 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1574 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1577 for( ; (k+2UL) <= kend; k+=2UL ) {
1579 const SIMDType b2( B.load(k+1UL,j) );
1580 xmm1 +=
set( A(i ,k ) ) * b1;
1581 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1582 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1583 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1584 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1585 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1588 for( ; k<kend; ++k ) {
1590 xmm1 +=
set( A(i ,k) ) * b1;
1591 xmm2 +=
set( A(i+1UL,k) ) * b1;
1592 xmm3 +=
set( A(i+2UL,k) ) * b1;
1595 C.store( i , j, xmm1+xmm4 );
1596 C.store( i+1UL, j, xmm2+xmm5 );
1597 C.store( i+2UL, j, xmm3+xmm6 );
// Two rows at a time, k unrolled by two.
1600 for( ; (i+2UL) <= iend; i+=2UL )
1602 const size_t kbegin( ( IsUpper_v<MT4> )
1603 ?( ( IsLower_v<MT5> )
1604 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1605 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1606 :( IsLower_v<MT5> ? j : 0UL ) );
1607 const size_t kend( ( IsLower_v<MT4> )
1608 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1614 for( ; (k+2UL) <= kend; k+=2UL ) {
1616 const SIMDType b2( B.load(k+1UL,j) );
1617 xmm1 +=
set( A(i ,k ) ) * b1;
1618 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1619 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1620 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1623 for( ; k<kend; ++k ) {
1625 xmm1 +=
set( A(i ,k) ) * b1;
1626 xmm2 +=
set( A(i+1UL,k) ) * b1;
1629 C.store( i , j, xmm1+xmm3 );
1630 C.store( i+1UL, j, xmm2+xmm4 );
// Single remaining row of the 1-vector block; the k loop runs up to K.
1635 const size_t kbegin( ( IsUpper_v<MT4> )
1636 ?( ( IsLower_v<MT5> )
1637 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1638 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1639 :( IsLower_v<MT5> ? j : 0UL ) );
1644 for( ; (k+2UL) <= K; k+=2UL ) {
1645 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1646 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1650 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1653 C.store( i, j, xmm1+xmm2 );
// --- Scalar loop for the leftover columns ---------------------------------
// Only executed when remainder handling is required; works element-wise
// without SIMD loads or stores.
1657 for( ; remainder && j<N; ++j )
// The row loop starts at row j only if both LOW and UPP are set.
1659 size_t i(
LOW &&
UPP ? j : 0UL );
// Two rows at a time.
1661 for( ; (i+2UL) <= M; i+=2UL )
1663 const size_t kbegin( ( IsUpper_v<MT4> )
1664 ?( ( IsLower_v<MT5> )
1665 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1666 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1667 :( IsLower_v<MT5> ? j : 0UL ) );
1668 const size_t kend( ( IsLower_v<MT4> )
1669 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1675 for(
size_t k=kbegin; k<kend; ++k ) {
1676 value1 += A(i ,k) * B(k,j);
1677 value2 += A(i+1UL,k) * B(k,j);
1681 C(i+1UL,j) = value2;
// Single remaining row.
1686 const size_t kbegin( ( IsUpper_v<MT4> )
1687 ?( ( IsLower_v<MT5> )
1688 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1689 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1690 :( IsLower_v<MT5> ? j : 0UL ) );
1694 for(
size_t k=kbegin; k<K; ++k ) {
1695 value += A(i,k) * B(k,j);
// --- Post-processing loops -------------------------------------------------
// Their guards and the definitions of jend/iend are not visible in this
// excerpt. The first loop copies C(j,i), conjugated when HERM is set, into
// C(i,j) for all rows starting at SIMDSIZE*4. The bodies of the two following
// loops are not visible.
1704 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
1706 for(
size_t j=0UL; j<jend; ++j ) {
1707 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1712 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
1714 for(
size_t i=0UL; i<iend; ++i ) {
1720 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
1722 for(
size_t j=0UL; j<jend; ++j ) {
// Small-matrix assignment kernel: accumulates the products A(i,k) * B(k,j) and stores
// the result into C. This overload is selected via EnableIf_t when the target type MT3
// is column-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
// NOTE(review): this excerpt omits many lines of the kernel (loop headers, braces and
// part of the accumulator updates); the comments below describe only what is visible.
1746 template<
typename MT3
1749 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1750 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// remainder is false only if both the target (MT3) and the left operand (MT4) are padded.
1752 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
// M x K times K x N: rows of A, columns of B, and the shared inner dimension.
1754 const size_t M( A.rows() );
1755 const size_t N( B.columns() );
1756 const size_t K( A.columns() );
// ipos: M masked with -SIMDSIZE when a scalar remainder is possible, else M
// (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
1760 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Integral element types: row blocks of 8 SIMD vectors, one column j at a time.
1770 if( IsIntegral_v<ElementType> )
1773 for(
size_t j=0UL; j<N; ++j )
// The k range [kbegin,kend) is narrowed according to the lower/upper (strictly or not)
// structure of A (MT4) and B (MT5), as reported by the type traits.
1775 const size_t kbegin( ( IsLower_v<MT5> )
1776 ?( ( IsUpper_v<MT4> )
1777 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1778 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1779 :( IsUpper_v<MT4> ? i : 0UL ) );
1780 const size_t kend( ( IsUpper_v<MT5> )
1781 ?( ( IsLower_v<MT4> )
1782 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1783 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
1784 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMD-wide row chunk of the current 8-chunk block.
1786 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1788 for(
size_t k=kbegin; k<kend; ++k ) {
// B(k,j) is broadcast once and multiplied with eight consecutive row chunks of column k of A.
1789 const SIMDType b1(
set( B(k,j) ) );
1790 xmm1 += A.load(i ,k) * b1;
1791 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1792 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1793 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1794 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1795 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
1796 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
1797 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
1800 C.store( i , j, xmm1 );
1802 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1803 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1804 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
1805 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
1806 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
1807 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Row blocks of 5 SIMD vectors, two columns (j, j+1) per iteration.
1816 for( ; (j+2UL) <= N; j+=2UL )
1818 const size_t kbegin( ( IsLower_v<MT5> )
1819 ?( ( IsUpper_v<MT4> )
1820 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1821 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1822 :( IsUpper_v<MT4> ? i : 0UL ) );
1823 const size_t kend( ( IsUpper_v<MT5> )
1824 ?( ( IsLower_v<MT4> )
1825 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1826 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1827 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// xmm1..xmm5 are stored to column j, xmm6..xmm10 to column j+1 (see the stores below).
1829 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1831 for(
size_t k=kbegin; k<kend; ++k ) {
1837 const SIMDType b1(
set( B(k,j ) ) );
1838 const SIMDType b2(
set( B(k,j+1UL) ) );
1851 C.store( i , j , xmm1 );
1853 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1854 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1855 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
1856 C.store( i , j+1UL, xmm6 );
1857 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
1858 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
1859 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
1860 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Same 5-vector row block for a single column j
// (presumably the leftover column after the two-column loop; the guard is not visible here).
1865 const size_t kbegin( ( IsLower_v<MT5> )
1866 ?( ( IsUpper_v<MT4> )
1867 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1868 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1869 :( IsUpper_v<MT4> ? i : 0UL ) );
1870 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
1872 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1874 for(
size_t k=kbegin; k<kend; ++k ) {
1875 const SIMDType b1(
set( B(k,j) ) );
1876 xmm1 += A.load(i ,k) * b1;
1877 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1878 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1879 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1880 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1883 C.store( i , j, xmm1 );
1885 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1886 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1887 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Row blocks of 4 SIMD vectors. The column loop starts at i when UPP is set, otherwise at 0.
1894 size_t j(
UPP ? i : 0UL );
1896 for( ; (j+2UL) <= jend; j+=2UL )
1898 const size_t kbegin( ( IsLower_v<MT5> )
1899 ?( ( IsUpper_v<MT4> )
1900 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1901 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1902 :( IsUpper_v<MT4> ? i : 0UL ) );
1903 const size_t kend( ( IsUpper_v<MT5> )
1904 ?( ( IsLower_v<MT4> )
1905 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1906 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1907 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 are stored to column j, xmm5..xmm8 to column j+1.
1909 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1911 for(
size_t k=kbegin; k<kend; ++k ) {
1916 const SIMDType b1(
set( B(k,j ) ) );
1917 const SIMDType b2(
set( B(k,j+1UL) ) );
1928 C.store( i , j , xmm1 );
1930 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1931 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1932 C.store( i , j+1UL, xmm5 );
1933 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
1934 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
1935 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector row block.
1940 const size_t kbegin( ( IsLower_v<MT5> )
1941 ?( ( IsUpper_v<MT4> )
1942 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1943 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1944 :( IsUpper_v<MT4> ? i : 0UL ) );
1945 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
1949 for(
size_t k=kbegin; k<kend; ++k ) {
1950 const SIMDType b1(
set( B(k,j) ) );
1951 xmm1 += A.load(i ,k) * b1;
1952 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1953 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1954 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1957 C.store( i , j, xmm1 );
1959 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1960 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Row blocks of 3 SIMD vectors, two columns per iteration.
1967 size_t j(
UPP ? i : 0UL );
1969 for( ; (j+2UL) <= jend; j+=2UL )
1971 const size_t kbegin( ( IsLower_v<MT5> )
1972 ?( ( IsUpper_v<MT4> )
1973 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1974 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1975 :( IsUpper_v<MT4> ? i : 0UL ) );
1976 const size_t kend( ( IsUpper_v<MT5> )
1977 ?( ( IsLower_v<MT4> )
1978 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1979 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1980 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
1982 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1984 for(
size_t k=kbegin; k<kend; ++k ) {
1988 const SIMDType b1(
set( B(k,j ) ) );
1989 const SIMDType b2(
set( B(k,j+1UL) ) );
1998 C.store( i , j , xmm1 );
2000 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2001 C.store( i , j+1UL, xmm4 );
2002 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
2003 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column variant of the 3-vector row block.
2008 const size_t kbegin( ( IsLower_v<MT5> )
2009 ?( ( IsUpper_v<MT4> )
2010 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2011 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2012 :( IsUpper_v<MT4> ? i : 0UL ) );
2013 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
2017 for(
size_t k=kbegin; k<kend; ++k ) {
2018 const SIMDType b1(
set( B(k,j) ) );
2019 xmm1 += A.load(i ,k) * b1;
2020 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2021 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2024 C.store( i , j, xmm1 );
2026 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Row blocks of 2 SIMD vectors; columns are consumed four, then three, then two at a time.
2033 size_t j(
UPP ? i : 0UL );
2035 for( ; (j+4UL) <= jend; j+=4UL )
2037 const size_t kbegin( ( IsLower_v<MT5> )
2038 ?( ( IsUpper_v<MT4> )
2039 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2040 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2041 :( IsUpper_v<MT4> ? i : 0UL ) );
2042 const size_t kend( ( IsUpper_v<MT5> )
2043 ?( ( IsLower_v<MT4> )
2044 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2045 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2046 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2048 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2050 for(
size_t k=kbegin; k<kend; ++k ) {
2053 const SIMDType b1(
set( B(k,j ) ) );
2054 const SIMDType b2(
set( B(k,j+1UL) ) );
2055 const SIMDType b3(
set( B(k,j+2UL) ) );
2056 const SIMDType b4(
set( B(k,j+3UL) ) );
2067 C.store( i , j , xmm1 );
2069 C.store( i , j+1UL, xmm3 );
2070 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2071 C.store( i , j+2UL, xmm5 );
2072 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2073 C.store( i , j+3UL, xmm7 );
2074 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
2077 for( ; (j+3UL) <= jend; j+=3UL )
2079 const size_t kbegin( ( IsLower_v<MT5> )
2080 ?( ( IsUpper_v<MT4> )
2081 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2082 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2083 :( IsUpper_v<MT4> ? i : 0UL ) );
2084 const size_t kend( ( IsUpper_v<MT5> )
2085 ?( ( IsLower_v<MT4> )
2086 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2087 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2088 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2090 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2092 for(
size_t k=kbegin; k<kend; ++k ) {
2095 const SIMDType b1(
set( B(k,j ) ) );
2096 const SIMDType b2(
set( B(k,j+1UL) ) );
2097 const SIMDType b3(
set( B(k,j+2UL) ) );
2106 C.store( i , j , xmm1 );
2108 C.store( i , j+1UL, xmm3 );
2109 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2110 C.store( i , j+2UL, xmm5 );
2111 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration, with the k loop unrolled by two.
2114 for( ; (j+2UL) <= jend; j+=2UL )
2116 const size_t kbegin( ( IsLower_v<MT5> )
2117 ?( ( IsUpper_v<MT4> )
2118 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2119 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2120 :( IsUpper_v<MT4> ? i : 0UL ) );
2121 const size_t kend( ( IsUpper_v<MT5> )
2122 ?( ( IsLower_v<MT4> )
2123 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2124 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2125 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Two accumulator sets (xmm1..4 and xmm5..8); they are added pairwise at store time.
2127 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2130 for( ; (k+2UL) <= kend; k+=2UL ) {
2131 const SIMDType a1( A.load(i ,k ) );
2133 const SIMDType a3( A.load(i ,k+1UL) );
2135 const SIMDType b1(
set( B(k ,j ) ) );
2136 const SIMDType b2(
set( B(k ,j+1UL) ) );
2137 const SIMDType b3(
set( B(k+1UL,j ) ) );
2138 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Handles the k values left over after the unrolled loop.
2149 for( ; k<kend; ++k ) {
2152 const SIMDType b1(
set( B(k,j ) ) );
2153 const SIMDType b2(
set( B(k,j+1UL) ) );
2160 C.store( i , j , xmm1+xmm5 );
2161 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
2162 C.store( i , j+1UL, xmm3+xmm7 );
2163 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single-column variant of the 2-vector row block, k unrolled by two.
2168 const size_t kbegin( ( IsLower_v<MT5> )
2169 ?( ( IsUpper_v<MT4> )
2170 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2171 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2172 :( IsUpper_v<MT4> ? i : 0UL ) );
2173 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
2178 for( ; (k+2UL) <= kend; k+=2UL ) {
2179 const SIMDType b1(
set( B(k ,j) ) );
2180 const SIMDType b2(
set( B(k+1UL,j) ) );
2181 xmm1 += A.load(i ,k ) * b1;
2182 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
2183 xmm3 += A.load(i ,k+1UL) * b2;
2184 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
2187 for( ; k<kend; ++k ) {
2188 const SIMDType b1(
set( B(k,j) ) );
2189 xmm1 += A.load(i ,k) * b1;
2193 C.store( i , j, xmm1+xmm3 );
2194 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Row blocks of a single SIMD vector; columns are consumed four, three, two, then one at a time.
2201 size_t j(
UPP ? i : 0UL );
2203 for( ; (j+4UL) <= jend; j+=4UL )
2205 const size_t kbegin( ( IsLower_v<MT5> )
2206 ?( ( IsUpper_v<MT4> )
2207 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2208 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2209 :( IsUpper_v<MT4> ? i : 0UL ) );
// NOTE(review): the non-upper alternative of this kend expression is not visible here.
2210 const size_t kend( ( IsUpper_v<MT5> )
2211 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2214 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2217 for( ; (k+2UL) <= kend; k+=2UL ) {
2219 const SIMDType a2( A.load(i,k+1UL) );
2220 xmm1 += a1 *
set( B(k ,j ) );
2221 xmm2 += a1 *
set( B(k ,j+1UL) );
2222 xmm3 += a1 *
set( B(k ,j+2UL) );
2223 xmm4 += a1 *
set( B(k ,j+3UL) );
2224 xmm5 += a2 *
set( B(k+1UL,j ) );
2225 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2226 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2227 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2230 for( ; k<kend; ++k ) {
2232 xmm1 += a1 *
set( B(k,j ) );
2233 xmm2 += a1 *
set( B(k,j+1UL) );
2234 xmm3 += a1 *
set( B(k,j+2UL) );
2235 xmm4 += a1 *
set( B(k,j+3UL) );
// The partial sums from the even and odd k steps are combined per column.
2238 C.store( i, j , xmm1+xmm5 );
2239 C.store( i, j+1UL, xmm2+xmm6 );
2240 C.store( i, j+2UL, xmm3+xmm7 );
2241 C.store( i, j+3UL, xmm4+xmm8 );
// Three columns per iteration.
2244 for( ; (j+3UL) <= jend; j+=3UL )
2246 const size_t kbegin( ( IsLower_v<MT5> )
2247 ?( ( IsUpper_v<MT4> )
2248 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2249 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2250 :( IsUpper_v<MT4> ? i : 0UL ) );
2251 const size_t kend( ( IsUpper_v<MT5> )
2252 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2255 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2258 for( ; (k+2UL) <= kend; k+=2UL ) {
2260 const SIMDType a2( A.load(i,k+1UL) );
2261 xmm1 += a1 *
set( B(k ,j ) );
2262 xmm2 += a1 *
set( B(k ,j+1UL) );
2263 xmm3 += a1 *
set( B(k ,j+2UL) );
2264 xmm4 += a2 *
set( B(k+1UL,j ) );
2265 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2266 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2269 for( ; k<kend; ++k ) {
2271 xmm1 += a1 *
set( B(k,j ) );
2272 xmm2 += a1 *
set( B(k,j+1UL) );
2273 xmm3 += a1 *
set( B(k,j+2UL) );
2276 C.store( i, j , xmm1+xmm4 );
2277 C.store( i, j+1UL, xmm2+xmm5 );
2278 C.store( i, j+2UL, xmm3+xmm6 );
// Two columns per iteration.
2281 for( ; (j+2UL) <= jend; j+=2UL )
2283 const size_t kbegin( ( IsLower_v<MT5> )
2284 ?( ( IsUpper_v<MT4> )
2285 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2286 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2287 :( IsUpper_v<MT4> ? i : 0UL ) );
2288 const size_t kend( ( IsUpper_v<MT5> )
2289 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2295 for( ; (k+2UL) <= kend; k+=2UL ) {
2297 const SIMDType a2( A.load(i,k+1UL) );
2298 xmm1 += a1 *
set( B(k ,j ) );
2299 xmm2 += a1 *
set( B(k ,j+1UL) );
2300 xmm3 += a2 *
set( B(k+1UL,j ) );
2301 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2304 for( ; k<kend; ++k ) {
2306 xmm1 += a1 *
set( B(k,j ) );
2307 xmm2 += a1 *
set( B(k,j+1UL) );
2310 C.store( i, j , xmm1+xmm3 );
2311 C.store( i, j+1UL, xmm2+xmm4 );
// Single column: here the unrolled k loop runs up to K rather than a structure-dependent kend.
2316 const size_t kbegin( ( IsLower_v<MT5> )
2317 ?( ( IsUpper_v<MT4> )
2318 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2319 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2320 :( IsUpper_v<MT4> ? i : 0UL ) );
2325 for( ; (k+2UL) <= K; k+=2UL ) {
2326 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2327 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2331 xmm1 += A.load(i,k) *
set( B(k,j) );
2334 C.store( i, j, xmm1+xmm2 );
// Scalar clean-up for the rows not processed above; it only runs when remainder is true.
2338 for( ; remainder && i<M; ++i )
// The column loop starts at i only when both LOW and UPP are set, otherwise at 0.
2340 size_t j(
LOW &&
UPP ? i : 0UL );
2342 for( ; (j+2UL) <= N; j+=2UL )
2344 const size_t kbegin( ( IsLower_v<MT5> )
2345 ?( ( IsUpper_v<MT4> )
2346 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2347 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2348 :( IsUpper_v<MT4> ? i : 0UL ) );
2349 const size_t kend( ( IsUpper_v<MT5> )
2350 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
// Two scalar dot products (columns j and j+1) share the same A(i,k) reads.
2356 for(
size_t k=kbegin; k<kend; ++k ) {
2357 value1 += A(i,k) * B(k,j );
2358 value2 += A(i,k) * B(k,j+1UL);
2362 C(i,j+1UL) = value2;
// Scalar dot product for a single column.
2367 const size_t kbegin( ( IsLower_v<MT5> )
2368 ?( ( IsUpper_v<MT4> )
2369 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2370 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2371 :( IsUpper_v<MT4> ? i : 0UL ) );
2375 for(
size_t k=kbegin; k<K; ++k ) {
2376 value += A(i,k) * B(k,j);
// Post-processing of columns/rows from index SIMDSIZE*4 onwards.
// NOTE(review): the guarding conditions and the iend/jend definitions are not visible here.
2385 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
2387 for(
size_t i=0UL; i<iend; ++i ) {
// C(i,j) is filled from the transposed position C(j,i), conjugated when HERM is set.
2388 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
2393 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
2395 for(
size_t i=0UL; i<iend; ++i ) {
2401 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
2403 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel, fallback overload: selected via DisableIf_t when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
2426 template<
typename MT3
2429 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2430 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// The vectorized path does not apply, so the call is forwarded to the default kernel.
2432 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel, vectorized overload: selected via EnableIf_t when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the body of this overload is not visible in this excerpt.
2452 template<
typename MT3
2455 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2456 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback overload: selected via DisableIf_t when
// UseBlasKernel_v<MT3,MT4,MT5> is false.
2486 template<
typename MT3
2489 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2490 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// No BLAS kernel is usable for these types, so the large-matrix kernel is used instead.
2492 selectLargeAssignKernel( C, A, B );
// BLAS-based assignment kernel. It is compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and is selected via EnableIf_t
// when UseBlasKernel_v<MT3,MT4,MT5> is true.
2498 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2512 template<
typename MT3
2515 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2516 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// ET is the element type of the target matrix; it is used for the scaling factors below.
2518 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm is applied to C with A from the left, using the lower or
// upper triangle according to IsLower_v<MT4>.
// NOTE(review): C is presumably initialized from B before this call; that line is not visible here.
2520 if( IsTriangular_v<MT4> ) {
2522 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular right operand: trmm is applied to C with B from the right.
// NOTE(review): C is presumably initialized from A before this call; that line is not visible here.
2524 else if( IsTriangular_v<MT5> ) {
2526 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// Otherwise gemm is called with the factors ET(1) and ET(0)
// (presumably alpha and beta, i.e. C = A*B; confirm against the gemm wrapper).
2529 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression rhs to a sparse matrix lhs.
// NOTE(review): several lines of this function are not visible in this excerpt.
2549 template<
typename MT
2551 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Temporary type: ResultType when SO is true, otherwise OppositeType.
2555 using TmpType = If_t< SO, ResultType, OppositeType >;
2567 const ForwardFunctor fwd;
// The expression is wrapped in serial() and materialized into the temporary tmp, which is
// then assigned to the target after being passed through fwd.
2569 const TmpType tmp(
serial( rhs ) );
2570 assign( ~lhs, fwd( tmp ) );
// Addition assignment of the multiplication expression rhs to a dense matrix lhs.
// NOTE(review): the set-up of the operands A and B is not visible in this excerpt.
2588 template<
typename MT
2590 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Checks for an empty target or a zero inner dimension (columns of the left operand).
// NOTE(review): the branch body is not visible; presumably an early return.
2597 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatches to the kernel selection for the addition assignment.
2611 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Chooses between the small-matrix and the BLAS addition-assignment kernels for C += A * B.
2627 template<
typename MT3
2630 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is used when any of the following holds:
//  - both operands are diagonal;
//  - outside debug mode, a row-major target with at most SIMDSIZE*10 columns in B;
//  - outside debug mode, a column-major target with at most SIMDSIZE*10 rows in A;
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
2632 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
2633 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
2634 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
2635 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2636 selectSmallAddAssignKernel( C, A, B );
// NOTE(review): the `else` keyword between these two calls is not visible in this excerpt.
2638 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition-assignment kernel C += A * B for a row-major target where
// neither operand is diagonal (see the EnableIf_t condition).
// NOTE(review): some lines (braces, ternary alternatives) are not visible in this excerpt.
2657 template<
typename MT3
2660 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2661 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N.
2663 const size_t M( A.rows() );
2664 const size_t N( B.columns() );
2665 const size_t K( A.columns() );
2669 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed when A is (strictly) upper or lower.
// The non-triangular alternatives of both expressions are not visible here.
2671 const size_t kbegin( ( IsUpper_v<MT4> )
2672 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2674 const size_t kend( ( IsLower_v<MT4> )
2675 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2679 for(
size_t k=kbegin; k<kend; ++k )
// Column range for row k of B, derived from B's triangular structure and further
// clipped against i when the result is flagged UPP (jbegin) or LOW (jend).
2681 const size_t jbegin( ( IsUpper_v<MT5> )
2682 ?( ( IsStrictlyUpper_v<MT5> )
2683 ?(
UPP ?
max(i,k+1UL) : k+1UL )
2684 :(
UPP ?
max(i,k) : k ) )
2685 :(
UPP ? i : 0UL ) );
2686 const size_t jend( ( IsLower_v<MT5> )
2687 ?( ( IsStrictlyLower_v<MT5> )
2688 ?(
LOW ?
min(i+1UL,k) : k )
2689 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
2690 :(
LOW ? i+1UL : N ) );
// With LOW/UPP clipping the range can be empty; skipping avoids an unsigned underflow in jnum.
2692 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos: end of the part of the range that can be processed in pairs (jnum rounded down to even).
2695 const size_t jnum( jend - jbegin );
2696 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
2698 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2699 C(i,j ) += A(i,k) * B(k,j );
2700 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Leftover column at jpos (presumably guarded by a check that jpos < jend; the guard is not visible here).
2703 C(i,jpos) += A(i,k) * B(k,jpos);
// Default (scalar) addition-assignment kernel C += A * B for a column-major target where
// neither operand is diagonal (see the EnableIf_t condition).
// NOTE(review): some lines (braces, ternary alternatives) are not visible in this excerpt.
2725 template<
typename MT3
2728 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2729 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N.
2731 const size_t M( A.rows() );
2732 const size_t N( B.columns() );
2733 const size_t K( A.columns() );
2737 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed when B is (strictly) lower or upper.
// The non-triangular alternatives of both expressions are not visible here.
2739 const size_t kbegin( ( IsLower_v<MT5> )
2740 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2742 const size_t kend( ( IsUpper_v<MT5> )
2743 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2747 for(
size_t k=kbegin; k<kend; ++k )
// Row range for column k of A, derived from A's triangular structure and further
// clipped against j when the result is flagged LOW (ibegin) or UPP (iend).
2749 const size_t ibegin( ( IsLower_v<MT4> )
2750 ?( ( IsStrictlyLower_v<MT4> )
2751 ?(
LOW ?
max(j,k+1UL) : k+1UL )
2752 :(
LOW ?
max(j,k) : k ) )
2753 :(
LOW ? j : 0UL ) );
2754 const size_t iend( ( IsUpper_v<MT4> )
2755 ?( ( IsStrictlyUpper_v<MT4> )
2756 ?(
UPP ?
min(j+1UL,k) : k )
2757 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
2758 :(
UPP ? j+1UL : M ) );
// With LOW/UPP clipping the range can be empty; skipping avoids an unsigned underflow in inum.
2760 if( (
LOW ||
UPP ) && ibegin >= iend )
continue;
// ipos: end of the part of the range that can be processed in pairs (inum rounded down to even).
2763 const size_t inum( iend - ibegin );
2764 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2766 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2767 C(i ,j) += A(i ,k) * B(k,j);
2768 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Leftover row at ipos (presumably guarded by a check that ipos < iend; the guard is not visible here).
2771 C(ipos,j) += A(ipos,k) * B(k,j);
// Default addition-assignment kernel for a row-major target with a general left operand
// and a diagonal right operand: each C(i,j) receives A(i,j) * B(j,j).
// NOTE(review): some lines (braces, ternary alternatives) are not visible in this excerpt.
2793 template<
typename MT3
2796 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2797 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// The iteration space is tiled into blocks of BLOCK_SIZE x BLOCK_SIZE.
2799 constexpr
size_t block( BLOCK_SIZE );
2801 const size_t M( A.rows() );
2802 const size_t N( B.columns() );
2804 for(
size_t ii=0UL; ii<M; ii+=block ) {
2805 const size_t iend(
min( M, ii+block ) );
2806 for(
size_t jj=0UL; jj<N; jj+=block ) {
2807 const size_t jend(
min( N, jj+block ) );
2808 for(
size_t i=ii; i<iend; ++i )
// Column range inside the current tile, clipped by A's upper/lower structure.
// The non-triangular alternatives are not visible here.
2810 const size_t jbegin( ( IsUpper_v<MT4> )
2811 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
2813 const size_t jpos( ( IsLower_v<MT4> )
2814 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
2817 for(
size_t j=jbegin; j<jpos; ++j ) {
2818 C(i,j) += A(i,j) * B(j,j);
// Default addition-assignment kernel for a column-major target with a general left operand
// and a diagonal right operand: each C(i,j) receives A(i,j) * B(j,j).
// NOTE(review): some lines (braces, ternary alternatives) are not visible in this excerpt.
2841 template<
typename MT3
2844 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2845 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2847 const size_t M( A.rows() );
2848 const size_t N( B.columns() );
2850 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when A is (strictly) lower or upper.
// The non-triangular alternatives are not visible here.
2852 const size_t ibegin( ( IsLower_v<MT4> )
2853 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
2855 const size_t iend( ( IsUpper_v<MT4> )
2856 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos: end of the part of the range that can be processed in pairs (inum rounded down to even).
2860 const size_t inum( iend - ibegin );
2861 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2863 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2864 C(i ,j) += A(i ,j) * B(j,j);
2865 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Leftover row at ipos (presumably guarded by a check that ipos < iend; the guard is not visible here).
2868 C(ipos,j) += A(ipos,j) * B(j,j);
// Default addition-assignment kernel for a row-major target with a diagonal left operand
// and a general right operand: each C(i,j) receives A(i,i) * B(i,j).
// NOTE(review): some lines (braces, ternary alternatives) are not visible in this excerpt.
2889 template<
typename MT3
2892 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2893 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2895 const size_t M( A.rows() );
2896 const size_t N( B.columns() );
2898 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed when B is (strictly) upper or lower.
// The non-triangular alternatives are not visible here.
2900 const size_t jbegin( ( IsUpper_v<MT5> )
2901 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
2903 const size_t jend( ( IsLower_v<MT5> )
2904 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos: end of the part of the range that can be processed in pairs (jnum rounded down to even).
2908 const size_t jnum( jend - jbegin );
2909 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration.
2911 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2912 C(i,j ) += A(i,i) * B(i,j );
2913 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Leftover column at jpos (presumably guarded by a check that jpos < jend; the guard is not visible here).
2916 C(i,jpos) += A(i,i) * B(i,jpos);
// Default addition-assignment kernel for a column-major target with a diagonal left operand
// and a general right operand: each C(i,j) receives A(i,i) * B(i,j).
// NOTE(review): some lines (braces, ternary alternatives) are not visible in this excerpt.
2937 template<
typename MT3
2940 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2941 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// The iteration space is tiled into blocks of BLOCK_SIZE x BLOCK_SIZE.
2943 constexpr
size_t block( BLOCK_SIZE );
2945 const size_t M( A.rows() );
2946 const size_t N( B.columns() );
2948 for(
size_t jj=0UL; jj<N; jj+=block ) {
2949 const size_t jend(
min( N, jj+block ) );
2950 for(
size_t ii=0UL; ii<M; ii+=block ) {
2951 const size_t iend(
min( M, ii+block ) );
2952 for(
size_t j=jj; j<jend; ++j )
// Row range inside the current tile, clipped by B's lower/upper structure.
// The non-triangular alternatives are not visible here.
2954 const size_t ibegin( ( IsLower_v<MT5> )
2955 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
2957 const size_t ipos( ( IsUpper_v<MT5> )
2958 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
2961 for(
size_t i=ibegin; i<ipos; ++i ) {
2962 C(i,j) += A(i,i) * B(i,j);
// Default addition-assignment kernel for two diagonal operands (see the EnableIf_t condition).
2985 template<
typename MT3
2988 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2989 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Only the diagonal of C is updated: C(i,i) += A(i,i) * B(i,i).
2991 for(
size_t i=0UL; i<A.rows(); ++i ) {
2992 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition-assignment kernel, fallback overload: selected via DisableIf_t when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
3012 template<
typename MT3
3015 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3016 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// The vectorized path does not apply, so the call is forwarded to the default kernel.
3018 selectDefaultAddAssignKernel( C, A, B );
3038 template<
typename MT3
3041 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3042 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3044 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3046 const size_t M( A.rows() );
3047 const size_t N( B.columns() );
3048 const size_t K( A.columns() );
3052 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
3057 if( IsIntegral_v<ElementType> )
3060 for(
size_t i=0UL; i<M; ++i )
3062 const size_t kbegin( ( IsUpper_v<MT4> )
3063 ?( ( IsLower_v<MT5> )
3064 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3065 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3066 :( IsLower_v<MT5> ? j : 0UL ) );
3067 const size_t kend( ( IsLower_v<MT4> )
3068 ?( ( IsUpper_v<MT5> )
3069 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
3070 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3071 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
3082 for(
size_t k=kbegin; k<kend; ++k ) {
3083 const SIMDType a1(
set( A(i,k) ) );
3084 xmm1 += a1 * B.load(k,j );
3085 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3086 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3087 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3088 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
3089 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
3090 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
3091 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
3094 C.store( i, j , xmm1 );
3096 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3097 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3098 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3099 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
3100 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
3101 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
3110 for( ; (i+2UL) <= M; i+=2UL )
3112 const size_t kbegin( ( IsUpper_v<MT4> )
3113 ?( ( IsLower_v<MT5> )
3114 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3115 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3116 :( IsLower_v<MT5> ? j : 0UL ) );
3117 const size_t kend( ( IsLower_v<MT4> )
3118 ?( ( IsUpper_v<MT5> )
3119 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
3120 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3121 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
3128 SIMDType xmm6 ( C.load(i+1UL,j ) );
3134 for(
size_t k=kbegin; k<kend; ++k ) {
3135 const SIMDType a1(
set( A(i ,k) ) );
3136 const SIMDType a2(
set( A(i+1UL,k) ) );
3154 C.store( i , j , xmm1 );
3156 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3157 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3158 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
3159 C.store( i+1UL, j , xmm6 );
3160 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
3161 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
3162 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
3163 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
3168 const size_t kbegin( ( IsUpper_v<MT4> )
3169 ?( ( IsLower_v<MT5> )
3170 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3171 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3172 :( IsLower_v<MT5> ? j : 0UL ) );
3173 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
3181 for(
size_t k=kbegin; k<kend; ++k ) {
3182 const SIMDType a1(
set( A(i,k) ) );
3183 xmm1 += a1 * B.load(k,j );
3184 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3185 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3186 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3187 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
3190 C.store( i, j , xmm1 );
3192 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3193 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3194 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3202 for( ; (i+2UL) <= M; i+=2UL )
3204 const size_t kbegin( ( IsUpper_v<MT4> )
3205 ?( ( IsLower_v<MT5> )
3206 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3207 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3208 :( IsLower_v<MT5> ? j : 0UL ) );
3209 const size_t kend( ( IsLower_v<MT4> )
3210 ?( ( IsUpper_v<MT5> )
3211 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
3212 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3213 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
3224 for(
size_t k=kbegin; k<kend; ++k ) {
3225 const SIMDType a1(
set( A(i ,k) ) );
3226 const SIMDType a2(
set( A(i+1UL,k) ) );
3241 C.store( i , j , xmm1 );
3243 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3244 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3245 C.store( i+1UL, j , xmm5 );
3246 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
3247 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
3248 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
3253 const size_t kbegin( ( IsUpper_v<MT4> )
3254 ?( ( IsLower_v<MT5> )
3255 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3256 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3257 :( IsLower_v<MT5> ? j : 0UL ) );
3258 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
3265 for(
size_t k=kbegin; k<kend; ++k ) {
3266 const SIMDType a1(
set( A(i,k) ) );
3267 xmm1 += a1 * B.load(k,j );
3268 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3269 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3270 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3273 C.store( i, j , xmm1 );
3275 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3276 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3284 for( ; (i+2UL) <= M; i+=2UL )
3286 const size_t kbegin( ( IsUpper_v<MT4> )
3287 ?( ( IsLower_v<MT5> )
3288 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3289 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3290 :( IsLower_v<MT5> ? j : 0UL ) );
3291 const size_t kend( ( IsLower_v<MT4> )
3292 ?( ( IsUpper_v<MT5> )
3293 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
3294 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3295 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
3304 for(
size_t k=kbegin; k<kend; ++k ) {
3305 const SIMDType a1(
set( A(i ,k) ) );
3306 const SIMDType a2(
set( A(i+1UL,k) ) );
3318 C.store( i , j , xmm1 );
3320 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3321 C.store( i+1UL, j , xmm4 );
3322 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
3323 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
3328 const size_t kbegin( ( IsUpper_v<MT4> )
3329 ?( ( IsLower_v<MT5> )
3330 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3331 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3332 :( IsLower_v<MT5> ? j : 0UL ) );
3333 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
3339 for(
size_t k=kbegin; k<kend; ++k ) {
3340 const SIMDType a1(
set( A(i,k) ) );
3341 xmm1 += a1 * B.load(k,j );
3342 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3343 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3346 C.store( i, j , xmm1 );
3348 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3355 size_t i(
LOW ? j : 0UL );
3357 for( ; (i+4UL) <= iend; i+=4UL )
3359 const size_t kbegin( ( IsUpper_v<MT4> )
3360 ?( ( IsLower_v<MT5> )
3361 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3362 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3363 :( IsLower_v<MT5> ? j : 0UL ) );
3364 const size_t kend( ( IsLower_v<MT4> )
3365 ?( ( IsUpper_v<MT5> )
3366 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
3367 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3368 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3379 for(
size_t k=kbegin; k<kend; ++k ) {
3380 const SIMDType a1(
set( A(i ,k) ) );
3381 const SIMDType a2(
set( A(i+1UL,k) ) );
3382 const SIMDType a3(
set( A(i+2UL,k) ) );
3383 const SIMDType a4(
set( A(i+3UL,k) ) );
3396 C.store( i , j , xmm1 );
3398 C.store( i+1UL, j , xmm3 );
3399 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3400 C.store( i+2UL, j , xmm5 );
3401 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
3402 C.store( i+3UL, j , xmm7 );
3403 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
3406 for( ; (i+3UL) <= iend; i+=3UL )
3408 const size_t kbegin( ( IsUpper_v<MT4> )
3409 ?( ( IsLower_v<MT5> )
3410 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3411 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3412 :( IsLower_v<MT5> ? j : 0UL ) );
3413 const size_t kend( ( IsLower_v<MT4> )
3414 ?( ( IsUpper_v<MT5> )
3415 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
3416 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3417 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3426 for(
size_t k=kbegin; k<kend; ++k ) {
3427 const SIMDType a1(
set( A(i ,k) ) );
3428 const SIMDType a2(
set( A(i+1UL,k) ) );
3429 const SIMDType a3(
set( A(i+2UL,k) ) );
3440 C.store( i , j , xmm1 );
3442 C.store( i+1UL, j , xmm3 );
3443 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3444 C.store( i+2UL, j , xmm5 );
3445 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
3448 for( ; (i+2UL) <= iend; i+=2UL )
3450 const size_t kbegin( ( IsUpper_v<MT4> )
3451 ?( ( IsLower_v<MT5> )
3452 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3453 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3454 :( IsLower_v<MT5> ? j : 0UL ) );
3455 const size_t kend( ( IsLower_v<MT4> )
3456 ?( ( IsUpper_v<MT5> )
3457 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
3458 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3459 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3468 for( ; (k+2UL) <= kend; k+=2UL ) {
3469 const SIMDType a1(
set( A(i ,k ) ) );
3470 const SIMDType a2(
set( A(i+1UL,k ) ) );
3471 const SIMDType a3(
set( A(i ,k+1UL) ) );
3472 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
3473 const SIMDType b1( B.load(k ,j ) );
3475 const SIMDType b3( B.load(k+1UL,j ) );
3487 for( ; k<kend; ++k ) {
3488 const SIMDType a1(
set( A(i ,k) ) );
3489 const SIMDType a2(
set( A(i+1UL,k) ) );
3498 C.store( i , j , xmm1+xmm5 );
3499 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
3500 C.store( i+1UL, j , xmm3+xmm7 );
3501 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
3506 const size_t kbegin( ( IsUpper_v<MT4> )
3507 ?( ( IsLower_v<MT5> )
3508 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3509 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3510 :( IsLower_v<MT5> ? j : 0UL ) );
3511 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
3518 for( ; (k+2UL) <= kend; k+=2UL ) {
3519 const SIMDType a1(
set( A(i,k ) ) );
3520 const SIMDType a2(
set( A(i,k+1UL) ) );
3521 xmm1 += a1 * B.load(k ,j );
3522 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
3523 xmm3 += a2 * B.load(k+1UL,j );
3524 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
3527 for( ; k<kend; ++k ) {
3528 const SIMDType a1(
set( A(i,k) ) );
3529 xmm1 += a1 * B.load(k,j );
3533 C.store( i, j , xmm1+xmm3 );
3534 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
3541 size_t i(
LOW ? j : 0UL );
3543 for( ; (i+4UL) <= iend; i+=4UL )
3545 const size_t kbegin( ( IsUpper_v<MT4> )
3546 ?( ( IsLower_v<MT5> )
3547 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3548 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3549 :( IsLower_v<MT5> ? j : 0UL ) );
3550 const size_t kend( ( IsLower_v<MT4> )
3551 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3561 for( ; (k+2UL) <= kend; k+=2UL ) {
3563 const SIMDType b2( B.load(k+1UL,j) );
3564 xmm1 +=
set( A(i ,k ) ) * b1;
3565 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3566 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3567 xmm4 +=
set( A(i+3UL,k ) ) * b1;
3568 xmm5 +=
set( A(i ,k+1UL) ) * b2;
3569 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
3570 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
3571 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
3574 for( ; k<kend; ++k ) {
3576 xmm1 +=
set( A(i ,k) ) * b1;
3577 xmm2 +=
set( A(i+1UL,k) ) * b1;
3578 xmm3 +=
set( A(i+2UL,k) ) * b1;
3579 xmm4 +=
set( A(i+3UL,k) ) * b1;
3582 C.store( i , j, xmm1+xmm5 );
3583 C.store( i+1UL, j, xmm2+xmm6 );
3584 C.store( i+2UL, j, xmm3+xmm7 );
3585 C.store( i+3UL, j, xmm4+xmm8 );
3588 for( ; (i+3UL) <= iend; i+=3UL )
3590 const size_t kbegin( ( IsUpper_v<MT4> )
3591 ?( ( IsLower_v<MT5> )
3592 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3593 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3594 :( IsLower_v<MT5> ? j : 0UL ) );
3595 const size_t kend( ( IsLower_v<MT4> )
3596 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3605 for( ; (k+2UL) <= kend; k+=2UL ) {
3607 const SIMDType b2( B.load(k+1UL,j) );
3608 xmm1 +=
set( A(i ,k ) ) * b1;
3609 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3610 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3611 xmm4 +=
set( A(i ,k+1UL) ) * b2;
3612 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
3613 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
3616 for( ; k<kend; ++k ) {
3618 xmm1 +=
set( A(i ,k) ) * b1;
3619 xmm2 +=
set( A(i+1UL,k) ) * b1;
3620 xmm3 +=
set( A(i+2UL,k) ) * b1;
3623 C.store( i , j, xmm1+xmm4 );
3624 C.store( i+1UL, j, xmm2+xmm5 );
3625 C.store( i+2UL, j, xmm3+xmm6 );
3628 for( ; (i+2UL) <= iend; i+=2UL )
3630 const size_t kbegin( ( IsUpper_v<MT4> )
3631 ?( ( IsLower_v<MT5> )
3632 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3633 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3634 :( IsLower_v<MT5> ? j : 0UL ) );
3635 const size_t kend( ( IsLower_v<MT4> )
3636 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3644 for( ; (k+2UL) <= kend; k+=2UL ) {
3646 const SIMDType b2( B.load(k+1UL,j) );
3647 xmm1 +=
set( A(i ,k ) ) * b1;
3648 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3649 xmm3 +=
set( A(i ,k+1UL) ) * b2;
3650 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
3653 for( ; k<kend; ++k ) {
3655 xmm1 +=
set( A(i ,k) ) * b1;
3656 xmm2 +=
set( A(i+1UL,k) ) * b1;
3659 C.store( i , j, xmm1+xmm3 );
3660 C.store( i+1UL, j, xmm2+xmm4 );
3665 const size_t kbegin( ( IsUpper_v<MT4> )
3666 ?( ( IsLower_v<MT5> )
3667 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3668 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3669 :( IsLower_v<MT5> ? j : 0UL ) );
3675 for( ; (k+2UL) <= K; k+=2UL ) {
3676 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
3677 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
3681 xmm1 +=
set( A(i,k) ) * B.load(k,j);
3684 C.store( i, j, xmm1+xmm2 );
3688 for( ; remainder && j<N; ++j )
3690 const size_t iend(
UPP ? j+1UL : M );
3691 size_t i(
LOW ? j : 0UL );
3693 for( ; (i+2UL) <= iend; i+=2UL )
3695 const size_t kbegin( ( IsUpper_v<MT4> )
3696 ?( ( IsLower_v<MT5> )
3697 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3698 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3699 :( IsLower_v<MT5> ? j : 0UL ) );
3700 const size_t kend( ( IsLower_v<MT4> )
3701 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3707 for(
size_t k=kbegin; k<kend; ++k ) {
3708 value1 += A(i ,k) * B(k,j);
3709 value2 += A(i+1UL,k) * B(k,j);
3713 C(i+1UL,j) = value2;
3718 const size_t kbegin( ( IsUpper_v<MT4> )
3719 ?( ( IsLower_v<MT5> )
3720 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3721 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3722 :( IsLower_v<MT5> ? j : 0UL ) );
3726 for(
size_t k=kbegin; k<K; ++k ) {
3727 value += A(i,k) * B(k,j);
// Vectorized small-matrix kernel computing C += A * B. This overload is selected when the
// target C is column-major and UseVectorizedDefaultKernel_v holds (see the EnableIf_t below).
// NOTE(review): this listing has gaps (the embedded source line numbers jump), so the
// remaining template parameters, braces, accumulator declarations and the loops over i
// are not all visible here.
3752 template<
typename MT3
3755 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3756 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A remainder pass is required unless both C and A are padded.
3758 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3760 const size_t M( A.rows() );
3761 const size_t N( B.columns() );
3762 const size_t K( A.columns() );
// With a remainder, ipos is M masked by -SIMDSIZE (i.e. rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise the full row count M.
3766 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Panel of eight SIMD vectors per column; only taken for integral element types.
3771 if( IsIntegral_v<ElementType> )
3774 for(
size_t j=0UL; j<N; ++j )
// k range of the inner product, narrowed according to the lower/upper (strict) traits
// of A (MT4) and B (MT5).
3776 const size_t kbegin( ( IsLower_v<MT5> )
3777 ?( ( IsUpper_v<MT4> )
3778 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3779 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3780 :( IsUpper_v<MT4> ? i : 0UL ) );
3781 const size_t kend( ( IsUpper_v<MT5> )
3782 ?( ( IsLower_v<MT4> )
3783 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3784 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3785 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Broadcast B(k,j) and accumulate it against eight consecutive SIMD vectors of column k of A.
3796 for(
size_t k=kbegin; k<kend; ++k ) {
3797 const SIMDType b1(
set( B(k,j) ) );
3798 xmm1 += A.load(i ,k) * b1;
3799 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
3800 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
3801 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
3802 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
3803 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
3804 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
3805 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the accumulators back to column j of C.
3808 C.store( i , j, xmm1 );
3810 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3811 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3812 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
3813 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
3814 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
3815 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Panel of five SIMD vectors: columns of C are processed two at a time, followed by a
// single-column pass (its guarding statement is not visible in this listing).
3824 for( ; (j+2UL) <= N; j+=2UL )
// k range for the column pair (j, j+1), narrowed by the triangular traits of A and B.
3826 const size_t kbegin( ( IsLower_v<MT5> )
3827 ?( ( IsUpper_v<MT4> )
3828 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3829 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3830 :( IsUpper_v<MT4> ? i : 0UL ) );
3831 const size_t kend( ( IsUpper_v<MT5> )
3832 ?( ( IsLower_v<MT4> )
3833 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3834 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3835 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// xmm6 starts from the current content of C at (i, j+1), since this is an add-assign.
3842 SIMDType xmm6 ( C.load(i ,j+1UL) );
3848 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast the two B elements of row k for columns j and j+1.
3854 const SIMDType b1(
set( B(k,j ) ) );
3855 const SIMDType b2(
set( B(k,j+1UL) ) );
// Store the ten accumulators: xmm1..xmm5 to column j, xmm6..xmm10 to column j+1.
3868 C.store( i , j , xmm1 );
3870 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3871 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3872 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
3873 C.store( i , j+1UL, xmm6 );
3874 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
3875 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
3876 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
3877 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column pass of the five-vector panel.
3882 const size_t kbegin( ( IsLower_v<MT5> )
3883 ?( ( IsUpper_v<MT4> )
3884 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3885 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3886 :( IsUpper_v<MT4> ? i : 0UL ) );
3887 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
3895 for(
size_t k=kbegin; k<kend; ++k ) {
3896 const SIMDType b1(
set( B(k,j) ) );
3897 xmm1 += A.load(i ,k) * b1;
3898 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
3899 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
3900 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
3901 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
3904 C.store( i , j, xmm1 );
3906 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3907 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3908 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Panel of four SIMD vectors: columns of C are processed two at a time, followed by a
// single-column pass (its guarding statement is not visible in this listing).
3916 for( ; (j+2UL) <= N; j+=2UL )
// k range for the column pair (j, j+1), narrowed by the triangular traits of A and B.
3918 const size_t kbegin( ( IsLower_v<MT5> )
3919 ?( ( IsUpper_v<MT4> )
3920 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3921 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3922 :( IsUpper_v<MT4> ? i : 0UL ) );
3923 const size_t kend( ( IsUpper_v<MT5> )
3924 ?( ( IsLower_v<MT4> )
3925 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3926 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3927 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
3938 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast the two B elements of row k for columns j and j+1.
3943 const SIMDType b1(
set( B(k,j ) ) );
3944 const SIMDType b2(
set( B(k,j+1UL) ) );
// Store the eight accumulators: xmm1..xmm4 to column j, xmm5..xmm8 to column j+1.
3955 C.store( i , j , xmm1 );
3957 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3958 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3959 C.store( i , j+1UL, xmm5 );
3960 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
3961 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
3962 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column pass of the four-vector panel.
3967 const size_t kbegin( ( IsLower_v<MT5> )
3968 ?( ( IsUpper_v<MT4> )
3969 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3970 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3971 :( IsUpper_v<MT4> ? i : 0UL ) );
3972 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
3979 for(
size_t k=kbegin; k<kend; ++k ) {
3980 const SIMDType b1(
set( B(k,j) ) );
3981 xmm1 += A.load(i ,k) * b1;
3982 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
3983 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
3984 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
3987 C.store( i , j, xmm1 );
3989 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3990 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Panel of three SIMD vectors: columns of C are processed two at a time, followed by a
// single-column pass (its guarding statement is not visible in this listing).
3998 for( ; (j+2UL) <= N; j+=2UL )
// k range for the column pair (j, j+1), narrowed by the triangular traits of A and B.
4000 const size_t kbegin( ( IsLower_v<MT5> )
4001 ?( ( IsUpper_v<MT4> )
4002 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4003 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4004 :( IsUpper_v<MT4> ? i : 0UL ) );
4005 const size_t kend( ( IsUpper_v<MT5> )
4006 ?( ( IsLower_v<MT4> )
4007 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4008 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4009 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
4018 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast the two B elements of row k for columns j and j+1.
4022 const SIMDType b1(
set( B(k,j ) ) );
4023 const SIMDType b2(
set( B(k,j+1UL) ) );
// Store the six accumulators: xmm1..xmm3 to column j, xmm4..xmm6 to column j+1.
4032 C.store( i , j , xmm1 );
4034 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4035 C.store( i , j+1UL, xmm4 );
4036 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
4037 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column pass of the three-vector panel.
4042 const size_t kbegin( ( IsLower_v<MT5> )
4043 ?( ( IsUpper_v<MT4> )
4044 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4045 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4046 :( IsUpper_v<MT4> ? i : 0UL ) );
4047 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
4053 for(
size_t k=kbegin; k<kend; ++k ) {
4054 const SIMDType b1(
set( B(k,j) ) );
4055 xmm1 += A.load(i ,k) * b1;
4056 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4057 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4060 C.store( i , j, xmm1 );
4062 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Panel of two SIMD vectors. The column index starts at i when UPP is set, otherwise at 0.
// NOTE(review): the definition of jend is not visible in this listing.
4069 size_t j(
UPP ? i : 0UL );
// Tile of four columns (j .. j+3).
4071 for( ; (j+4UL) <= jend; j+=4UL )
4073 const size_t kbegin( ( IsLower_v<MT5> )
4074 ?( ( IsUpper_v<MT4> )
4075 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4076 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4077 :( IsUpper_v<MT4> ? i : 0UL ) );
4078 const size_t kend( ( IsUpper_v<MT5> )
4079 ?( ( IsLower_v<MT4> )
4080 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
4081 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
4082 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4093 for(
size_t k=kbegin; k<kend; ++k ) {
// Broadcast the four B elements of row k for columns j .. j+3.
4096 const SIMDType b1(
set( B(k,j ) ) );
4097 const SIMDType b2(
set( B(k,j+1UL) ) );
4098 const SIMDType b3(
set( B(k,j+2UL) ) );
4099 const SIMDType b4(
set( B(k,j+3UL) ) );
// Two accumulators per column are written back.
4110 C.store( i , j , xmm1 );
4112 C.store( i , j+1UL, xmm3 );
4113 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4114 C.store( i , j+2UL, xmm5 );
4115 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
4116 C.store( i , j+3UL, xmm7 );
4117 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Tile of three columns (j .. j+2).
4120 for( ; (j+3UL) <= jend; j+=3UL )
4122 const size_t kbegin( ( IsLower_v<MT5> )
4123 ?( ( IsUpper_v<MT4> )
4124 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4125 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4126 :( IsUpper_v<MT4> ? i : 0UL ) );
4127 const size_t kend( ( IsUpper_v<MT5> )
4128 ?( ( IsLower_v<MT4> )
4129 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
4130 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
4131 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4140 for(
size_t k=kbegin; k<kend; ++k ) {
4143 const SIMDType b1(
set( B(k,j ) ) );
4144 const SIMDType b2(
set( B(k,j+1UL) ) );
4145 const SIMDType b3(
set( B(k,j+2UL) ) );
4154 C.store( i , j , xmm1 );
4156 C.store( i , j+1UL, xmm3 );
4157 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4158 C.store( i , j+2UL, xmm5 );
4159 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Tile of two columns (j, j+1) within the two-SIMD-vector panel. The k loop is unrolled by
// two with a second set of accumulators (xmm5..xmm8), which are summed into the first set
// when storing. NOTE(review): the accumulator declarations and parts of the loop bodies are
// not visible in this listing.
4162 for( ; (j+2UL) <= jend; j+=2UL )
// k range for the column pair, narrowed by the triangular traits of A (MT4) and B (MT5).
4164 const size_t kbegin( ( IsLower_v<MT5> )
4165 ?( ( IsUpper_v<MT4> )
4166 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4167 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4168 :( IsUpper_v<MT4> ? i : 0UL ) );
4169 const size_t kend( ( IsUpper_v<MT5> )
4170 ?( ( IsLower_v<MT4> )
4171 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4172 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4173 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Unrolled by two: the bound is inclusive so that a final pair (k, k+1) with k+2 == kend
// is still handled here, matching the other unrolled k loops in this file. Both k and
// k+1 are below kend in that case, so all accesses stay within the k range.
4182 for( ; (k+2UL) <= kend; k+=2UL ) {
4183 const SIMDType a1( A.load(i ,k ) );
4185 const SIMDType a3( A.load(i ,k+1UL) );
4187 const SIMDType b1(
set( B(k ,j ) ) );
4188 const SIMDType b2(
set( B(k ,j+1UL) ) );
4189 const SIMDType b3(
set( B(k+1UL,j ) ) );
4190 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Tail: at most one remaining k step after the unrolled loop.
4201 for( ; k<kend; ++k ) {
4204 const SIMDType b1(
set( B(k,j ) ) );
4205 const SIMDType b2(
set( B(k,j+1UL) ) );
// Combine the two accumulator sets and write the 2x2 block of SIMD vectors back to C.
4212 C.store( i , j , xmm1+xmm5 );
4213 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
4214 C.store( i , j+1UL, xmm3+xmm7 );
4215 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// Single-column pass of the two-SIMD-vector panel (its guarding statement is not visible in
// this listing). The k loop is unrolled by two with separate accumulators for the odd step.
4220 const size_t kbegin( ( IsLower_v<MT5> )
4221 ?( ( IsUpper_v<MT4> )
4222 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4223 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4224 :( IsUpper_v<MT4> ? i : 0UL ) );
4225 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
// Unrolled by two: xmm1/xmm2 take step k, xmm3/xmm4 take step k+1.
4232 for( ; (k+2UL) <= kend; k+=2UL ) {
4233 const SIMDType b1(
set( B(k ,j) ) );
4234 const SIMDType b2(
set( B(k+1UL,j) ) );
4235 xmm1 += A.load(i ,k ) * b1;
4236 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
4237 xmm3 += A.load(i ,k+1UL) * b2;
4238 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
// Tail for a remaining single k step.
4241 for( ; k<kend; ++k ) {
4242 const SIMDType b1(
set( B(k,j) ) );
4243 xmm1 += A.load(i ,k) * b1;
// Sum the paired accumulators when writing back.
4247 C.store( i , j, xmm1+xmm3 );
4248 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Panel of a single SIMD vector. The column index starts at i when UPP is set, otherwise
// at 0. Columns are consumed in tiles of four, three and two, then one at a time; each
// tile unrolls the k loop by two and sums the paired accumulators on store.
// NOTE(review): jend, the accumulator declarations and the a1 loads are not visible here.
4255 size_t j(
UPP ? i : 0UL );
// Tile of four columns.
4257 for( ; (j+4UL) <= jend; j+=4UL )
4259 const size_t kbegin( ( IsLower_v<MT5> )
4260 ?( ( IsUpper_v<MT4> )
4261 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4262 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4263 :( IsUpper_v<MT4> ? i : 0UL ) );
4264 const size_t kend( ( IsUpper_v<MT5> )
4265 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4275 for( ; (k+2UL) <= kend; k+=2UL ) {
4277 const SIMDType a2( A.load(i,k+1UL) );
4278 xmm1 += a1 *
set( B(k ,j ) );
4279 xmm2 += a1 *
set( B(k ,j+1UL) );
4280 xmm3 += a1 *
set( B(k ,j+2UL) );
4281 xmm4 += a1 *
set( B(k ,j+3UL) );
4282 xmm5 += a2 *
set( B(k+1UL,j ) );
4283 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
4284 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
4285 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Tail for a remaining single k step.
4288 for( ; k<kend; ++k ) {
4290 xmm1 += a1 *
set( B(k,j ) );
4291 xmm2 += a1 *
set( B(k,j+1UL) );
4292 xmm3 += a1 *
set( B(k,j+2UL) );
4293 xmm4 += a1 *
set( B(k,j+3UL) );
4296 C.store( i, j , xmm1+xmm5 );
4297 C.store( i, j+1UL, xmm2+xmm6 );
4298 C.store( i, j+2UL, xmm3+xmm7 );
4299 C.store( i, j+3UL, xmm4+xmm8 );
// Tile of three columns.
4302 for( ; (j+3UL) <= jend; j+=3UL )
4304 const size_t kbegin( ( IsLower_v<MT5> )
4305 ?( ( IsUpper_v<MT4> )
4306 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4307 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4308 :( IsUpper_v<MT4> ? i : 0UL ) );
4309 const size_t kend( ( IsUpper_v<MT5> )
4310 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4319 for( ; (k+2UL) <= kend; k+=2UL ) {
4321 const SIMDType a2( A.load(i,k+1UL) );
4322 xmm1 += a1 *
set( B(k ,j ) );
4323 xmm2 += a1 *
set( B(k ,j+1UL) );
4324 xmm3 += a1 *
set( B(k ,j+2UL) );
4325 xmm4 += a2 *
set( B(k+1UL,j ) );
4326 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
4327 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
4330 for( ; k<kend; ++k ) {
4332 xmm1 += a1 *
set( B(k,j ) );
4333 xmm2 += a1 *
set( B(k,j+1UL) );
4334 xmm3 += a1 *
set( B(k,j+2UL) );
4337 C.store( i, j , xmm1+xmm4 );
4338 C.store( i, j+1UL, xmm2+xmm5 );
4339 C.store( i, j+2UL, xmm3+xmm6 );
// Tile of two columns.
4342 for( ; (j+2UL) <= jend; j+=2UL )
4344 const size_t kbegin( ( IsLower_v<MT5> )
4345 ?( ( IsUpper_v<MT4> )
4346 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4347 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4348 :( IsUpper_v<MT4> ? i : 0UL ) );
4349 const size_t kend( ( IsUpper_v<MT5> )
4350 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4358 for( ; (k+2UL) <= kend; k+=2UL ) {
4360 const SIMDType a2( A.load(i,k+1UL) );
4361 xmm1 += a1 *
set( B(k ,j ) );
4362 xmm2 += a1 *
set( B(k ,j+1UL) );
4363 xmm3 += a2 *
set( B(k+1UL,j ) );
4364 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
4367 for( ; k<kend; ++k ) {
4369 xmm1 += a1 *
set( B(k,j ) );
4370 xmm2 += a1 *
set( B(k,j+1UL) );
4373 C.store( i, j , xmm1+xmm3 );
4374 C.store( i, j+1UL, xmm2+xmm4 );
// Single remaining column: the k loop here runs up to K rather than a narrowed kend.
4379 const size_t kbegin( ( IsLower_v<MT5> )
4380 ?( ( IsUpper_v<MT4> )
4381 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4382 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4383 :( IsUpper_v<MT4> ? i : 0UL ) );
4389 for( ; (k+2UL) <= K; k+=2UL ) {
4390 xmm1 += A.load(i,k ) *
set( B(k ,j) );
4391 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
4395 xmm1 += A.load(i,k) *
set( B(k,j) );
4398 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: executed only when 'remainder' is true, for the rows i < M that the
// preceding SIMD panels left over. Uses element access instead of SIMD loads/stores.
4402 for( ; remainder && i<M; ++i )
// Column range for row i: ends at i+1 when LOW is set, starts at i when UPP is set.
4404 const size_t jend(
LOW ? i+1UL : N );
4405 size_t j(
UPP ? i : 0UL );
// Two columns per iteration, accumulated in value1 / value2 (declarations not visible here).
4407 for( ; (j+2UL) <= jend; j+=2UL )
4409 const size_t kbegin( ( IsLower_v<MT5> )
4410 ?( ( IsUpper_v<MT4> )
4411 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4412 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4413 :( IsUpper_v<MT4> ? i : 0UL ) );
4414 const size_t kend( ( IsUpper_v<MT5> )
4415 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4421 for(
size_t k=kbegin; k<kend; ++k ) {
4422 value1 += A(i,k) * B(k,j );
4423 value2 += A(i,k) * B(k,j+1UL);
4427 C(i,j+1UL) = value2;
// Single remaining column: the k loop runs from kbegin up to K.
4432 const size_t kbegin( ( IsLower_v<MT5> )
4433 ?( ( IsUpper_v<MT4> )
4434 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4435 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4436 :( IsUpper_v<MT4> ? i : 0UL ) );
4440 for(
size_t k=kbegin; k<K; ++k ) {
4441 value += A(i,k) * B(k,j);
// Large-matrix add-assign kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false and simply forwards to the default
// add-assign kernel.
4465 template<
typename MT3
4468 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4469 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4471 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix add-assign kernel, vectorized overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not visible in this listing.
4491 template<
typename MT3
4494 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4495 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS add-assign kernel, fallback overload: selected when UseBlasKernel_v<MT3,MT4,MT5> is
// false and forwards to the large-matrix add-assign kernel.
4521 template<
typename MT3
4524 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4525 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4527 selectLargeAddAssignKernel( C, A, B );
// BLAS-based add-assign kernel (C += A * B); only compiled when BLAS mode and the BLAS
// matrix/matrix multiplication are enabled, and selected when UseBlasKernel_v holds.
4533 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4547 template<
typename MT3
4550 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4551 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4553 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, apply trmm with A from the left (lower or upper
// according to IsLower_v<MT4>), then add the temporary to C.
// NOTE(review): presumably trmm overwrites tmp with the scaled product - confirm against
// the trmm wrapper.
4555 if( IsTriangular_v<MT4> ) {
4556 ResultType_t<MT3> tmp(
serial( B ) );
4557 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4558 addAssign( C, tmp );
// Triangular B: same scheme with a copy of A and trmm applied from the right.
4560 else if( IsTriangular_v<MT5> ) {
4561 ResultType_t<MT3> tmp(
serial( A ) );
4562 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4563 addAssign( C, tmp );
// General case: gemm with both scalar factors ET(1).
// NOTE(review): presumably these are alpha and beta, giving C = A*B + C - confirm.
4566 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of the multiplication expression to a dense matrix (lhs -= rhs).
// NOTE(review): several lines are not visible in this listing, including the body of the
// early-exit branch and the definitions of A and B.
4590 template<
typename MT
4592 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Early-exit condition: empty target or zero inner dimension.
4599 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch to the kernel selection with the target and the two operands.
4613 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses between the small and the BLAS subtraction-assignment kernels for C -= A * B.
4629 template<
typename MT3
4632 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is used when both operands are diagonal, when (outside debug mode) the
// relevant dimension is at most SIMDSIZE*10 (columns of B for a row-major target, rows of A
// for a column-major target), or when C has fewer elements than TDMATDMATMULT_THRESHOLD.
4634 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
4635 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
4636 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
4637 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4638 selectSmallSubAssignKernel( C, A, B );
// NOTE(review): the line between these two calls is not visible in this listing;
// presumably an 'else' - confirm.
4640 selectBlasSubAssignKernel( C, A, B );
// Default (non-vectorized) kernel for C -= A * B with a row-major target and neither operand
// diagonal. Iterates i, then k, then j, narrowing the k and j ranges by the triangular traits
// of A and B and by the LOW/UPP flags.
// NOTE(review): some lines (e.g. the non-triangular alternatives of kbegin/kend) are not
// visible in this listing.
4659 template<
typename MT3
4662 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4663 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4665 const size_t M( A.rows() );
4666 const size_t N( B.columns() );
4667 const size_t K( A.columns() );
4671 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed when A is (strictly) upper or lower.
4673 const size_t kbegin( ( IsUpper_v<MT4> )
4674 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4676 const size_t kend( ( IsLower_v<MT4> )
4677 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
4681 for(
size_t k=kbegin; k<kend; ++k )
// j range for row k of B, additionally clipped against i when UPP / LOW is set.
4683 const size_t jbegin( ( IsUpper_v<MT5> )
4684 ?( ( IsStrictlyUpper_v<MT5> )
4685 ?(
UPP ?
max(i,k+1UL) : k+1UL )
4686 :(
UPP ?
max(i,k) : k ) )
4687 :(
UPP ? i : 0UL ) );
4688 const size_t jend( ( IsLower_v<MT5> )
4689 ?( ( IsStrictlyLower_v<MT5> )
4690 ?(
LOW ?
min(i+1UL,k) : k )
4691 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
4692 :(
LOW ? i+1UL : N ) );
// Skip empty ranges, which can occur when LOW or UPP clips the range.
4694 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos marks the end of the part of [jbegin,jend) that can be processed in pairs.
4697 const size_t jnum( jend - jbegin );
4698 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4700 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4701 C(i,j ) -= A(i,k) * B(k,j );
4702 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Odd leftover element (the guarding condition is not visible in this listing).
4705 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default (non-vectorized) kernel for C -= A * B with a column-major target and neither
// operand diagonal. Iterates j, then k, then i, narrowing the k and i ranges by the
// triangular traits of B and A and by the LOW/UPP flags.
// NOTE(review): some lines (e.g. the non-triangular alternatives of kbegin/kend) are not
// visible in this listing.
4727 template<
typename MT3
4730 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4731 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4733 const size_t M( A.rows() );
4734 const size_t N( B.columns() );
4735 const size_t K( A.columns() );
4739 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed when B is (strictly) lower or upper.
4741 const size_t kbegin( ( IsLower_v<MT5> )
4742 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4744 const size_t kend( ( IsUpper_v<MT5> )
4745 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4749 for(
size_t k=kbegin; k<kend; ++k )
// i range for column k of A, additionally clipped against j when LOW / UPP is set.
4751 const size_t ibegin( ( IsLower_v<MT4> )
4752 ?( ( IsStrictlyLower_v<MT4> )
4753 ?(
LOW ?
max(j,k+1UL) : k+1UL )
4754 :(
LOW ?
max(j,k) : k ) )
4755 :(
LOW ? j : 0UL ) );
4756 const size_t iend( ( IsUpper_v<MT4> )
4757 ?( ( IsStrictlyUpper_v<MT4> )
4758 ?(
UPP ?
min(j+1UL,k) : k )
4759 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
4760 :(
UPP ? j+1UL : M ) );
// Skip empty ranges, which can occur when LOW or UPP clips the range.
4762 if( (
LOW ||
UPP ) && ( ibegin >= iend ) )
continue;
// ipos marks the end of the part of [ibegin,iend) that can be processed in pairs.
4765 const size_t inum( iend - ibegin );
4766 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4768 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4769 C(i ,j) -= A(i ,k) * B(k,j);
4770 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Odd leftover element (the guarding condition is not visible in this listing).
4773 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default kernel for C -= A * B with a row-major target, general A and diagonal B. Since B
// is diagonal, each update only needs B(j,j): C(i,j) -= A(i,j) * B(j,j). The traversal is
// blocked in tiles of BLOCK_SIZE x BLOCK_SIZE.
// NOTE(review): the non-triangular alternatives of jbegin/jpos are not visible here.
4795 template<
typename MT3
4798 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4799 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4801 constexpr
size_t block( BLOCK_SIZE );
4803 const size_t M( A.rows() );
4804 const size_t N( B.columns() );
// Outer loops over row and column tiles; iend/jend clamp the tile to the matrix bounds.
4806 for(
size_t ii=0UL; ii<M; ii+=block ) {
4807 const size_t iend(
min( M, ii+block ) );
4808 for(
size_t jj=0UL; jj<N; jj+=block ) {
4809 const size_t jend(
min( N, jj+block ) );
4810 for(
size_t i=ii; i<iend; ++i )
// Column range inside the tile, narrowed when A is (strictly) upper or lower.
4812 const size_t jbegin( ( IsUpper_v<MT4> )
4813 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
4815 const size_t jpos( ( IsLower_v<MT4> )
4816 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
4819 for(
size_t j=jbegin; j<jpos; ++j ) {
4820 C(i,j) -= A(i,j) * B(j,j);
// Default kernel for C -= A * B with a column-major target, general A and diagonal B. Each
// column j of C is updated with column j of A scaled by B(j,j), two rows per iteration.
// NOTE(review): the non-triangular alternatives of ibegin/iend are not visible here.
4843 template<
typename MT3
4846 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4847 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4849 const size_t M( A.rows() );
4850 const size_t N( B.columns() );
4852 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when A is (strictly) lower or upper.
4854 const size_t ibegin( ( IsLower_v<MT4> )
4855 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
4857 const size_t iend( ( IsUpper_v<MT4> )
4858 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of [ibegin,iend) that can be processed in pairs.
4862 const size_t inum( iend - ibegin );
4863 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4865 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4866 C(i ,j) -= A(i ,j) * B(j,j);
4867 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Odd leftover element (the guarding condition is not visible in this listing).
4870 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default kernel for C -= A * B with a row-major target, diagonal A and general B. Each row
// i of C is updated with row i of B scaled by A(i,i), two columns per iteration.
// NOTE(review): the non-triangular alternatives of jbegin/jend are not visible here.
4891 template<
typename MT3
4894 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4895 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4897 const size_t M( A.rows() );
4898 const size_t N( B.columns() );
4900 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed when B is (strictly) upper or lower.
4902 const size_t jbegin( ( IsUpper_v<MT5> )
4903 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
4905 const size_t jend( ( IsLower_v<MT5> )
4906 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part of [jbegin,jend) that can be processed in pairs.
4910 const size_t jnum( jend - jbegin );
4911 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4913 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4914 C(i,j ) -= A(i,i) * B(i,j );
4915 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Odd leftover element (the guarding condition is not visible in this listing).
4918 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default kernel for C -= A * B with a column-major target, diagonal A and general B. Since
// A is diagonal, each update only needs A(i,i): C(i,j) -= A(i,i) * B(i,j). The traversal is
// blocked in tiles of BLOCK_SIZE x BLOCK_SIZE.
// NOTE(review): the non-triangular alternatives of ibegin/ipos are not visible here.
4939 template<
typename MT3
4942 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4943 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4945 constexpr
size_t block( BLOCK_SIZE );
4947 const size_t M( A.rows() );
4948 const size_t N( B.columns() );
// Outer loops over column and row tiles; jend/iend clamp the tile to the matrix bounds.
4950 for(
size_t jj=0UL; jj<N; jj+=block ) {
4951 const size_t jend(
min( N, jj+block ) );
4952 for(
size_t ii=0UL; ii<M; ii+=block ) {
4953 const size_t iend(
min( M, ii+block ) );
4954 for(
size_t j=jj; j<jend; ++j )
// Row range inside the tile, narrowed when B is (strictly) lower or upper.
4956 const size_t ibegin( ( IsLower_v<MT5> )
4957 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
4959 const size_t ipos( ( IsUpper_v<MT5> )
4960 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
4963 for(
size_t i=ibegin; i<ipos; ++i ) {
4964 C(i,j) -= A(i,i) * B(i,j);
// Default kernel for C -= A * B when both operands are diagonal: only the diagonal of C is
// touched, with C(i,i) -= A(i,i) * B(i,i) for every row of A.
4987 template<
typename MT3
4990 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4991 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4993 for(
size_t i=0UL; i<A.rows(); ++i ) {
4994 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false and forwards to the default
// subtraction-assignment kernel.
5014 template<
typename MT3
5017 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5018 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5020 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel for a row-major
// target (enabled when IsRowMajorMatrix_v<MT3> and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> both hold).
//
// Visible scheme: a scalar A(i,k) is broadcast with set() and multiplied with
// SIMD vectors loaded from row k of B; the products are subtracted from
// accumulators that are finally written back with C.store(). The columns of C
// are processed in strips of 8, 5, 4, 3, 2 and 1 SIMD vectors, followed by a
// scalar loop for leftover columns.
//
// In every section kbegin/kend narrow the k range according to the
// upper/lower (strict or non-strict) structure of A (MT4) and B (MT5).
//
// NOTE(review): this extract omits many lines (accumulator declarations, the
// column-strip loop headers, several loop bodies and all closing braces);
// the comments below describe only what is visible.
5040 template<
typename MT3
5043 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5044 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed unless both C and B are padded.
5046 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5048 const size_t M( A.rows() );
5049 const size_t N( B.columns() );
5050 const size_t K( A.columns() );
// With a remainder, jpos is N masked with -SIMDSIZE (rounds N down to a
// multiple of SIMDSIZE, assuming SIMDSIZE is a power of two - TODO confirm).
5054 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Strip of 8 SIMD vectors per row; only taken for integral element types.
5059 if( IsIntegral_v<ElementType> )
5062 for(
size_t i=0UL; i<M; ++i )
5064 const size_t kbegin( ( IsUpper_v<MT4> )
5065 ?( ( IsLower_v<MT5> )
5066 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5067 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5068 :( IsLower_v<MT5> ? j : 0UL ) );
5069 const size_t kend( ( IsLower_v<MT4> )
5070 ?( ( IsUpper_v<MT5> )
5071 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
5072 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5073 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
5084 for(
size_t k=kbegin; k<kend; ++k ) {
5085 const SIMDType a1(
set( A(i,k) ) );
5086 xmm1 -= a1 * B.load(k,j );
5087 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5088 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5089 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5090 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
5091 xmm6 -= a1 * B.load(k,j+
SIMDSIZE*5UL);
5092 xmm7 -= a1 * B.load(k,j+
SIMDSIZE*6UL);
5093 xmm8 -= a1 * B.load(k,j+
SIMDSIZE*7UL);
5096 C.store( i, j , xmm1 );
5098 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5099 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
5100 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
5101 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
5102 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
5103 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Strip of 5 SIMD vectors, two rows (i and i+1) per iteration.
5112 for( ; (i+2UL) <= M; i+=2UL )
5114 const size_t kbegin( ( IsUpper_v<MT4> )
5115 ?( ( IsLower_v<MT5> )
5116 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5117 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5118 :( IsLower_v<MT5> ? j : 0UL ) );
5119 const size_t kend( ( IsLower_v<MT4> )
5120 ?( ( IsUpper_v<MT5> )
5121 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
5122 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5123 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// xmm6..xmm10 hold row i+1 (see the matching stores below).
5130 SIMDType xmm6 ( C.load(i+1UL,j ) );
5136 for(
size_t k=kbegin; k<kend; ++k ) {
5137 const SIMDType a1(
set( A(i ,k) ) );
5138 const SIMDType a2(
set( A(i+1UL,k) ) );
5156 C.store( i , j , xmm1 );
5158 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5159 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
5160 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
5161 C.store( i+1UL, j , xmm6 );
5162 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
5163 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
5164 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
5165 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Strip of 5 SIMD vectors, single-row variant (its guarding condition is not
// visible in this extract).
5170 const size_t kbegin( ( IsUpper_v<MT4> )
5171 ?( ( IsLower_v<MT5> )
5172 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5173 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5174 :( IsLower_v<MT5> ? j : 0UL ) );
5175 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
5183 for(
size_t k=kbegin; k<kend; ++k ) {
5184 const SIMDType a1(
set( A(i,k) ) );
5185 xmm1 -= a1 * B.load(k,j );
5186 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5187 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5188 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5189 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
5192 C.store( i, j , xmm1 );
5194 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5195 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
5196 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Strip of 4 SIMD vectors, two rows per iteration.
5204 for( ; (i+2UL) <= M; i+=2UL )
5206 const size_t kbegin( ( IsUpper_v<MT4> )
5207 ?( ( IsLower_v<MT5> )
5208 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5209 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5210 :( IsLower_v<MT5> ? j : 0UL ) );
5211 const size_t kend( ( IsLower_v<MT4> )
5212 ?( ( IsUpper_v<MT5> )
5213 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
5214 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5215 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
5226 for(
size_t k=kbegin; k<kend; ++k ) {
5227 const SIMDType a1(
set( A(i ,k) ) );
5228 const SIMDType a2(
set( A(i+1UL,k) ) );
5243 C.store( i , j , xmm1 );
5245 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5246 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
5247 C.store( i+1UL, j , xmm5 );
5248 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
5249 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
5250 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Strip of 4 SIMD vectors, single-row variant.
5255 const size_t kbegin( ( IsUpper_v<MT4> )
5256 ?( ( IsLower_v<MT5> )
5257 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5258 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5259 :( IsLower_v<MT5> ? j : 0UL ) );
5260 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
5267 for(
size_t k=kbegin; k<kend; ++k ) {
5268 const SIMDType a1(
set( A(i,k) ) );
5269 xmm1 -= a1 * B.load(k,j );
5270 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5271 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5272 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5275 C.store( i, j , xmm1 );
5277 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5278 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Strip of 3 SIMD vectors, two rows per iteration.
5286 for( ; (i+2UL) <= M; i+=2UL )
5288 const size_t kbegin( ( IsUpper_v<MT4> )
5289 ?( ( IsLower_v<MT5> )
5290 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5291 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5292 :( IsLower_v<MT5> ? j : 0UL ) );
5293 const size_t kend( ( IsLower_v<MT4> )
5294 ?( ( IsUpper_v<MT5> )
5295 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
5296 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5297 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
5306 for(
size_t k=kbegin; k<kend; ++k ) {
5307 const SIMDType a1(
set( A(i ,k) ) );
5308 const SIMDType a2(
set( A(i+1UL,k) ) );
5320 C.store( i , j , xmm1 );
5322 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5323 C.store( i+1UL, j , xmm4 );
5324 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
5325 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Strip of 3 SIMD vectors, single-row variant.
5330 const size_t kbegin( ( IsUpper_v<MT4> )
5331 ?( ( IsLower_v<MT5> )
5332 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5333 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5334 :( IsLower_v<MT5> ? j : 0UL ) );
5335 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
5341 for(
size_t k=kbegin; k<kend; ++k ) {
5342 const SIMDType a1(
set( A(i,k) ) );
5343 xmm1 -= a1 * B.load(k,j );
5344 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5345 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5348 C.store( i, j , xmm1 );
5350 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Strip of 2 SIMD vectors. The row loop starts at j when the result is
// declared lower (LOW) and at 0 otherwise; rows are consumed 4, 3, 2 and then
// 1 at a time. The definition of iend for this strip is not visible here.
5357 size_t i(
LOW ? j : 0UL );
5359 for( ; (i+4UL) <= iend; i+=4UL )
5361 const size_t kbegin( ( IsUpper_v<MT4> )
5362 ?( ( IsLower_v<MT5> )
5363 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5364 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5365 :( IsLower_v<MT5> ? j : 0UL ) );
5366 const size_t kend( ( IsLower_v<MT4> )
5367 ?( ( IsUpper_v<MT5> )
5368 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
5369 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5370 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5381 for(
size_t k=kbegin; k<kend; ++k ) {
5382 const SIMDType a1(
set( A(i ,k) ) );
5383 const SIMDType a2(
set( A(i+1UL,k) ) );
5384 const SIMDType a3(
set( A(i+2UL,k) ) );
5385 const SIMDType a4(
set( A(i+3UL,k) ) );
5398 C.store( i , j , xmm1 );
5400 C.store( i+1UL, j , xmm3 );
5401 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
5402 C.store( i+2UL, j , xmm5 );
5403 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
5404 C.store( i+3UL, j , xmm7 );
5405 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// 2-vector strip, three rows per iteration.
5408 for( ; (i+3UL) <= iend; i+=3UL )
5410 const size_t kbegin( ( IsUpper_v<MT4> )
5411 ?( ( IsLower_v<MT5> )
5412 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5413 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5414 :( IsLower_v<MT5> ? j : 0UL ) );
5415 const size_t kend( ( IsLower_v<MT4> )
5416 ?( ( IsUpper_v<MT5> )
5417 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
5418 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5419 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5428 for(
size_t k=kbegin; k<kend; ++k ) {
5429 const SIMDType a1(
set( A(i ,k) ) );
5430 const SIMDType a2(
set( A(i+1UL,k) ) );
5431 const SIMDType a3(
set( A(i+2UL,k) ) );
5442 C.store( i , j , xmm1 );
5444 C.store( i+1UL, j , xmm3 );
5445 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
5446 C.store( i+2UL, j , xmm5 );
5447 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// 2-vector strip, two rows per iteration. The k loop is unrolled by two with
// a second accumulator set (xmm5..xmm8) that is added in on store.
5450 for( ; (i+2UL) <= iend; i+=2UL )
5452 const size_t kbegin( ( IsUpper_v<MT4> )
5453 ?( ( IsLower_v<MT5> )
5454 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5455 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5456 :( IsLower_v<MT5> ? j : 0UL ) );
5457 const size_t kend( ( IsLower_v<MT4> )
5458 ?( ( IsUpper_v<MT5> )
5459 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
5460 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5461 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5470 for( ; (k+2UL) <= kend; k+=2UL ) {
5471 const SIMDType a1(
set( A(i ,k ) ) );
5472 const SIMDType a2(
set( A(i+1UL,k ) ) );
5473 const SIMDType a3(
set( A(i ,k+1UL) ) );
5474 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
5475 const SIMDType b1( B.load(k ,j ) );
5477 const SIMDType b3( B.load(k+1UL,j ) );
// Leftover single k iteration after the unrolled loop.
5489 for( ; k<kend; ++k ) {
5490 const SIMDType a1(
set( A(i ,k) ) );
5491 const SIMDType a2(
set( A(i+1UL,k) ) );
5500 C.store( i , j , xmm1+xmm5 );
5501 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
5502 C.store( i+1UL, j , xmm3+xmm7 );
5503 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// 2-vector strip, single-row variant (k unrolled by two).
5508 const size_t kbegin( ( IsUpper_v<MT4> )
5509 ?( ( IsLower_v<MT5> )
5510 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5511 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5512 :( IsLower_v<MT5> ? j : 0UL ) );
5513 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
5520 for( ; (k+2UL) <= kend; k+=2UL ) {
5521 const SIMDType a1(
set( A(i,k ) ) );
5522 const SIMDType a2(
set( A(i,k+1UL) ) );
5523 xmm1 -= a1 * B.load(k ,j );
5524 xmm2 -= a1 * B.load(k ,j+
SIMDSIZE);
5525 xmm3 -= a2 * B.load(k+1UL,j );
5526 xmm4 -= a2 * B.load(k+1UL,j+
SIMDSIZE);
5529 for( ; k<kend; ++k ) {
5530 const SIMDType a1(
set( A(i,k) ) );
5531 xmm1 -= a1 * B.load(k,j );
5535 C.store( i, j , xmm1+xmm3 );
5536 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Strip of a single SIMD vector; rows are consumed 4, 3, 2 and then 1 at a
// time, each with the k loop unrolled by two.
5543 size_t i(
LOW ? j : 0UL );
5545 for( ; (i+4UL) <= iend; i+=4UL )
5547 const size_t kbegin( ( IsUpper_v<MT4> )
5548 ?( ( IsLower_v<MT5> )
5549 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5550 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5551 :( IsLower_v<MT5> ? j : 0UL ) );
5552 const size_t kend( ( IsLower_v<MT4> )
5553 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5563 for( ; (k+2UL) <= kend; k+=2UL ) {
5565 const SIMDType b2( B.load(k+1UL,j) );
5566 xmm1 -=
set( A(i ,k ) ) * b1;
5567 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5568 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5569 xmm4 -=
set( A(i+3UL,k ) ) * b1;
5570 xmm5 -=
set( A(i ,k+1UL) ) * b2;
5571 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
5572 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
5573 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
5576 for( ; k<kend; ++k ) {
5578 xmm1 -=
set( A(i ,k) ) * b1;
5579 xmm2 -=
set( A(i+1UL,k) ) * b1;
5580 xmm3 -=
set( A(i+2UL,k) ) * b1;
5581 xmm4 -=
set( A(i+3UL,k) ) * b1;
5584 C.store( i , j, xmm1+xmm5 );
5585 C.store( i+1UL, j, xmm2+xmm6 );
5586 C.store( i+2UL, j, xmm3+xmm7 );
5587 C.store( i+3UL, j, xmm4+xmm8 );
// 1-vector strip, three rows per iteration.
5590 for( ; (i+3UL) <= iend; i+=3UL )
5592 const size_t kbegin( ( IsUpper_v<MT4> )
5593 ?( ( IsLower_v<MT5> )
5594 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5595 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5596 :( IsLower_v<MT5> ? j : 0UL ) );
5597 const size_t kend( ( IsLower_v<MT4> )
5598 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
5607 for( ; (k+2UL) <= kend; k+=2UL ) {
5609 const SIMDType b2( B.load(k+1UL,j) );
5610 xmm1 -=
set( A(i ,k ) ) * b1;
5611 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5612 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5613 xmm4 -=
set( A(i ,k+1UL) ) * b2;
5614 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
5615 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
5618 for( ; k<kend; ++k ) {
5620 xmm1 -=
set( A(i ,k) ) * b1;
5621 xmm2 -=
set( A(i+1UL,k) ) * b1;
5622 xmm3 -=
set( A(i+2UL,k) ) * b1;
5625 C.store( i , j, xmm1+xmm4 );
5626 C.store( i+1UL, j, xmm2+xmm5 );
5627 C.store( i+2UL, j, xmm3+xmm6 );
// 1-vector strip, two rows per iteration.
5630 for( ; (i+2UL) <= iend; i+=2UL )
5632 const size_t kbegin( ( IsUpper_v<MT4> )
5633 ?( ( IsLower_v<MT5> )
5634 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5635 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5636 :( IsLower_v<MT5> ? j : 0UL ) );
5637 const size_t kend( ( IsLower_v<MT4> )
5638 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5646 for( ; (k+2UL) <= kend; k+=2UL ) {
5648 const SIMDType b2( B.load(k+1UL,j) );
5649 xmm1 -=
set( A(i ,k ) ) * b1;
5650 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5651 xmm3 -=
set( A(i ,k+1UL) ) * b2;
5652 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
5655 for( ; k<kend; ++k ) {
5657 xmm1 -=
set( A(i ,k) ) * b1;
5658 xmm2 -=
set( A(i+1UL,k) ) * b1;
5661 C.store( i , j, xmm1+xmm3 );
5662 C.store( i+1UL, j, xmm2+xmm4 );
// 1-vector strip, single-row variant; here the k loop runs up to K.
5667 const size_t kbegin( ( IsUpper_v<MT4> )
5668 ?( ( IsLower_v<MT5> )
5669 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5670 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5671 :( IsLower_v<MT5> ? j : 0UL ) );
5677 for( ; (k+2UL) <= K; k+=2UL ) {
5678 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
5679 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
5683 xmm1 -=
set( A(i,k) ) * B.load(k,j);
5686 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: leftover columns are handled element-wise (only when
// `remainder` is true), two rows at a time and then a final single row.
5690 for( ; remainder && j<N; ++j )
5692 const size_t iend(
UPP ? j+1UL : M );
5693 size_t i(
LOW ? j : 0UL );
5695 for( ; (i+2UL) <= iend; i+=2UL )
5697 const size_t kbegin( ( IsUpper_v<MT4> )
5698 ?( ( IsLower_v<MT5> )
5699 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5700 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5701 :( IsLower_v<MT5> ? j : 0UL ) );
5702 const size_t kend( ( IsLower_v<MT4> )
5703 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5709 for(
size_t k=kbegin; k<kend; ++k ) {
5710 value1 -= A(i ,k) * B(k,j);
5711 value2 -= A(i+1UL,k) * B(k,j);
5715 C(i+1UL,j) = value2;
5720 const size_t kbegin( ( IsUpper_v<MT4> )
5721 ?( ( IsLower_v<MT5> )
5722 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5723 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5724 :( IsLower_v<MT5> ? j : 0UL ) );
5728 for(
size_t k=kbegin; k<K; ++k ) {
5729 value -= A(i,k) * B(k,j);
// Vectorized small-matrix subtraction-assignment kernel for a column-major
// target (enabled when IsColumnMajorMatrix_v<MT3> and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> both hold).
//
// Visible scheme: SIMD vectors are loaded from column k of A and multiplied
// with a broadcast (set()) scalar B(k,j); the products are subtracted from
// accumulators that are finally written back with C.store(). The rows of C
// are processed in strips of 8, 5, 4, 3, 2 and 1 SIMD vectors, followed by a
// scalar loop for leftover rows.
//
// In every section kbegin/kend narrow the k range according to the
// upper/lower (strict or non-strict) structure of A (MT4) and B (MT5).
//
// NOTE(review): this extract omits many lines (accumulator declarations, the
// row-strip loop headers, several loop bodies and all closing braces); the
// comments below describe only what is visible.
5754 template<
typename MT3
5757 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5758 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed unless both C and A are padded.
5760 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5762 const size_t M( A.rows() );
5763 const size_t N( B.columns() );
5764 const size_t K( A.columns() );
// With a remainder, ipos is M masked with -SIMDSIZE (rounds M down to a
// multiple of SIMDSIZE, assuming SIMDSIZE is a power of two - TODO confirm).
5768 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Strip of 8 SIMD vectors per column; only taken for integral element types.
5773 if( IsIntegral_v<ElementType> )
5776 for(
size_t j=0UL; j<N; ++j )
5778 const size_t kbegin( ( IsLower_v<MT5> )
5779 ?( ( IsUpper_v<MT4> )
5780 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5781 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5782 :( IsUpper_v<MT4> ? i : 0UL ) );
5783 const size_t kend( ( IsUpper_v<MT5> )
5784 ?( ( IsLower_v<MT4> )
5785 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5786 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5787 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
5798 for(
size_t k=kbegin; k<kend; ++k ) {
5799 const SIMDType b1(
set( B(k,j) ) );
5800 xmm1 -= A.load(i ,k) * b1;
5801 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
5802 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
5803 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
5804 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
5805 xmm6 -= A.load(i+
SIMDSIZE*5UL,k) * b1;
5806 xmm7 -= A.load(i+
SIMDSIZE*6UL,k) * b1;
5807 xmm8 -= A.load(i+
SIMDSIZE*7UL,k) * b1;
5810 C.store( i , j, xmm1 );
5812 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
5813 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
5814 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
5815 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
5816 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
5817 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Strip of 5 SIMD vectors, two columns (j and j+1) per iteration.
5826 for( ; (j+2UL) <= N; j+=2UL )
5828 const size_t kbegin( ( IsLower_v<MT5> )
5829 ?( ( IsUpper_v<MT4> )
5830 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5831 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5832 :( IsUpper_v<MT4> ? i : 0UL ) );
5833 const size_t kend( ( IsUpper_v<MT5> )
5834 ?( ( IsLower_v<MT4> )
5835 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5836 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5837 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// xmm6..xmm10 hold column j+1 (see the matching stores below).
5844 SIMDType xmm6 ( C.load(i ,j+1UL) );
5850 for(
size_t k=kbegin; k<kend; ++k ) {
5856 const SIMDType b1(
set( B(k,j ) ) );
5857 const SIMDType b2(
set( B(k,j+1UL) ) );
5870 C.store( i , j , xmm1 );
5872 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
5873 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
5874 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
5875 C.store( i , j+1UL, xmm6 );
5876 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
5877 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
5878 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
5879 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Strip of 5 SIMD vectors, single-column variant (its guarding condition is
// not visible in this extract).
5884 const size_t kbegin( ( IsLower_v<MT5> )
5885 ?( ( IsUpper_v<MT4> )
5886 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5887 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5888 :( IsUpper_v<MT4> ? i : 0UL ) );
5889 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
5897 for(
size_t k=kbegin; k<kend; ++k ) {
5898 const SIMDType b1(
set( B(k,j) ) );
5899 xmm1 -= A.load(i ,k) * b1;
5900 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
5901 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
5902 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
5903 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
5906 C.store( i , j, xmm1 );
5908 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
5909 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
5910 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Strip of 4 SIMD vectors, two columns per iteration.
5918 for( ; (j+2UL) <= N; j+=2UL )
5920 const size_t kbegin( ( IsLower_v<MT5> )
5921 ?( ( IsUpper_v<MT4> )
5922 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5923 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5924 :( IsUpper_v<MT4> ? i : 0UL ) );
5925 const size_t kend( ( IsUpper_v<MT5> )
5926 ?( ( IsLower_v<MT4> )
5927 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5928 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5929 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
5940 for(
size_t k=kbegin; k<kend; ++k ) {
5945 const SIMDType b1(
set( B(k,j ) ) );
5946 const SIMDType b2(
set( B(k,j+1UL) ) );
5957 C.store( i , j , xmm1 );
5959 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
5960 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
5961 C.store( i , j+1UL, xmm5 );
5962 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
5963 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
5964 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Strip of 4 SIMD vectors, single-column variant.
5969 const size_t kbegin( ( IsLower_v<MT5> )
5970 ?( ( IsUpper_v<MT4> )
5971 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5972 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5973 :( IsUpper_v<MT4> ? i : 0UL ) );
5974 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
5981 for(
size_t k=kbegin; k<kend; ++k ) {
5982 const SIMDType b1(
set( B(k,j) ) );
5983 xmm1 -= A.load(i ,k) * b1;
5984 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
5985 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
5986 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
5989 C.store( i , j, xmm1 );
5991 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
5992 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Strip of 3 SIMD vectors, two columns per iteration.
6000 for( ; (j+2UL) <= N; j+=2UL )
6002 const size_t kbegin( ( IsLower_v<MT5> )
6003 ?( ( IsUpper_v<MT4> )
6004 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6005 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6006 :( IsUpper_v<MT4> ? i : 0UL ) );
6007 const size_t kend( ( IsUpper_v<MT5> )
6008 ?( ( IsLower_v<MT4> )
6009 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6010 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6011 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
6020 for(
size_t k=kbegin; k<kend; ++k ) {
6024 const SIMDType b1(
set( B(k,j ) ) );
6025 const SIMDType b2(
set( B(k,j+1UL) ) );
6034 C.store( i , j , xmm1 );
6036 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6037 C.store( i , j+1UL, xmm4 );
6038 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
6039 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Strip of 3 SIMD vectors, single-column variant.
6044 const size_t kbegin( ( IsLower_v<MT5> )
6045 ?( ( IsUpper_v<MT4> )
6046 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6047 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6048 :( IsUpper_v<MT4> ? i : 0UL ) );
6049 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
6055 for(
size_t k=kbegin; k<kend; ++k ) {
6056 const SIMDType b1(
set( B(k,j) ) );
6057 xmm1 -= A.load(i ,k) * b1;
6058 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6059 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6062 C.store( i , j, xmm1 );
6064 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Strip of 2 SIMD vectors. The column loop starts at i when the result is
// declared upper (UPP) and at 0 otherwise; columns are consumed 4, 3, 2 and
// then 1 at a time. The definition of jend for this strip is not visible here.
6071 size_t j(
UPP ? i : 0UL );
6073 for( ; (j+4UL) <= jend; j+=4UL )
6075 const size_t kbegin( ( IsLower_v<MT5> )
6076 ?( ( IsUpper_v<MT4> )
6077 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6078 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6079 :( IsUpper_v<MT4> ? i : 0UL ) );
6080 const size_t kend( ( IsUpper_v<MT5> )
6081 ?( ( IsLower_v<MT4> )
6082 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6083 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6084 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6095 for(
size_t k=kbegin; k<kend; ++k ) {
6098 const SIMDType b1(
set( B(k,j ) ) );
6099 const SIMDType b2(
set( B(k,j+1UL) ) );
6100 const SIMDType b3(
set( B(k,j+2UL) ) );
6101 const SIMDType b4(
set( B(k,j+3UL) ) );
6112 C.store( i , j , xmm1 );
6114 C.store( i , j+1UL, xmm3 );
6115 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
6116 C.store( i , j+2UL, xmm5 );
6117 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
6118 C.store( i , j+3UL, xmm7 );
6119 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// 2-vector strip, three columns per iteration.
6122 for( ; (j+3UL) <= jend; j+=3UL )
6124 const size_t kbegin( ( IsLower_v<MT5> )
6125 ?( ( IsUpper_v<MT4> )
6126 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6127 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6128 :( IsUpper_v<MT4> ? i : 0UL ) );
6129 const size_t kend( ( IsUpper_v<MT5> )
6130 ?( ( IsLower_v<MT4> )
6131 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6132 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6133 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6142 for(
size_t k=kbegin; k<kend; ++k ) {
6145 const SIMDType b1(
set( B(k,j ) ) );
6146 const SIMDType b2(
set( B(k,j+1UL) ) );
6147 const SIMDType b3(
set( B(k,j+2UL) ) );
6156 C.store( i , j , xmm1 );
6158 C.store( i , j+1UL, xmm3 );
6159 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
6160 C.store( i , j+2UL, xmm5 );
6161 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// 2-vector strip, two columns per iteration. The k loop is unrolled by two
// with a second accumulator set (xmm5..xmm8) that is added in on store.
6164 for( ; (j+2UL) <= jend; j+=2UL )
6166 const size_t kbegin( ( IsLower_v<MT5> )
6167 ?( ( IsUpper_v<MT4> )
6168 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6169 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6170 :( IsUpper_v<MT4> ? i : 0UL ) );
6171 const size_t kend( ( IsUpper_v<MT5> )
6172 ?( ( IsLower_v<MT4> )
6173 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6174 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6175 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6184 for( ; (k+2UL) <= kend; k+=2UL ) {
6185 const SIMDType a1( A.load(i ,k ) );
6187 const SIMDType a3( A.load(i ,k+1UL) );
6189 const SIMDType b1(
set( B(k ,j ) ) );
6190 const SIMDType b2(
set( B(k ,j+1UL) ) );
6191 const SIMDType b3(
set( B(k+1UL,j ) ) );
6192 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover single k iteration after the unrolled loop.
6203 for( ; k<kend; ++k ) {
6206 const SIMDType b1(
set( B(k,j ) ) );
6207 const SIMDType b2(
set( B(k,j+1UL) ) );
6214 C.store( i , j , xmm1+xmm5 );
6215 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
6216 C.store( i , j+1UL, xmm3+xmm7 );
6217 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// 2-vector strip, single-column variant (k unrolled by two).
6222 const size_t kbegin( ( IsLower_v<MT5> )
6223 ?( ( IsUpper_v<MT4> )
6224 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6225 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6226 :( IsUpper_v<MT4> ? i : 0UL ) );
6227 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
6234 for( ; (k+2UL) <= kend; k+=2UL ) {
6235 const SIMDType b1(
set( B(k ,j) ) );
6236 const SIMDType b2(
set( B(k+1UL,j) ) );
6237 xmm1 -= A.load(i ,k ) * b1;
6238 xmm2 -= A.load(i+
SIMDSIZE,k ) * b1;
6239 xmm3 -= A.load(i ,k+1UL) * b2;
6240 xmm4 -= A.load(i+
SIMDSIZE,k+1UL) * b2;
6243 for( ; k<kend; ++k ) {
6244 const SIMDType b1(
set( B(k,j) ) );
6245 xmm1 -= A.load(i ,k) * b1;
6249 C.store( i , j, xmm1+xmm3 );
6250 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Strip of a single SIMD vector; columns are consumed 4, 3, 2 and then 1 at a
// time, each with the k loop unrolled by two.
6257 size_t j(
UPP ? i : 0UL );
6259 for( ; (j+4UL) <= jend; j+=4UL )
6261 const size_t kbegin( ( IsLower_v<MT5> )
6262 ?( ( IsUpper_v<MT4> )
6263 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6264 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6265 :( IsUpper_v<MT4> ? i : 0UL ) );
6266 const size_t kend( ( IsUpper_v<MT5> )
6267 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6277 for( ; (k+2UL) <= kend; k+=2UL ) {
6279 const SIMDType a2( A.load(i,k+1UL) );
6280 xmm1 -= a1 *
set( B(k ,j ) );
6281 xmm2 -= a1 *
set( B(k ,j+1UL) );
6282 xmm3 -= a1 *
set( B(k ,j+2UL) );
6283 xmm4 -= a1 *
set( B(k ,j+3UL) );
6284 xmm5 -= a2 *
set( B(k+1UL,j ) );
6285 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
6286 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
6287 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
6290 for( ; k<kend; ++k ) {
6292 xmm1 -= a1 *
set( B(k,j ) );
6293 xmm2 -= a1 *
set( B(k,j+1UL) );
6294 xmm3 -= a1 *
set( B(k,j+2UL) );
6295 xmm4 -= a1 *
set( B(k,j+3UL) );
6298 C.store( i, j , xmm1+xmm5 );
6299 C.store( i, j+1UL, xmm2+xmm6 );
6300 C.store( i, j+2UL, xmm3+xmm7 );
6301 C.store( i, j+3UL, xmm4+xmm8 );
// 1-vector strip, three columns per iteration.
6304 for( ; (j+3UL) <= jend; j+=3UL )
6306 const size_t kbegin( ( IsLower_v<MT5> )
6307 ?( ( IsUpper_v<MT4> )
6308 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6309 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6310 :( IsUpper_v<MT4> ? i : 0UL ) );
6311 const size_t kend( ( IsUpper_v<MT5> )
6312 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6321 for( ; (k+2UL) <= kend; k+=2UL ) {
6323 const SIMDType a2( A.load(i,k+1UL) );
6324 xmm1 -= a1 *
set( B(k ,j ) );
6325 xmm2 -= a1 *
set( B(k ,j+1UL) );
6326 xmm3 -= a1 *
set( B(k ,j+2UL) );
6327 xmm4 -= a2 *
set( B(k+1UL,j ) );
6328 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
6329 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
6332 for( ; k<kend; ++k ) {
6334 xmm1 -= a1 *
set( B(k,j ) );
6335 xmm2 -= a1 *
set( B(k,j+1UL) );
6336 xmm3 -= a1 *
set( B(k,j+2UL) );
6339 C.store( i, j , xmm1+xmm4 );
6340 C.store( i, j+1UL, xmm2+xmm5 );
6341 C.store( i, j+2UL, xmm3+xmm6 );
// 1-vector strip, two columns per iteration.
6344 for( ; (j+2UL) <= jend; j+=2UL )
6346 const size_t kbegin( ( IsLower_v<MT5> )
6347 ?( ( IsUpper_v<MT4> )
6348 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6349 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6350 :( IsUpper_v<MT4> ? i : 0UL ) );
6351 const size_t kend( ( IsUpper_v<MT5> )
6352 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6360 for( ; (k+2UL) <= kend; k+=2UL ) {
6362 const SIMDType a2( A.load(i,k+1UL) );
6363 xmm1 -= a1 *
set( B(k ,j ) );
6364 xmm2 -= a1 *
set( B(k ,j+1UL) );
6365 xmm3 -= a2 *
set( B(k+1UL,j ) );
6366 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
6369 for( ; k<kend; ++k ) {
6371 xmm1 -= a1 *
set( B(k,j ) );
6372 xmm2 -= a1 *
set( B(k,j+1UL) );
6375 C.store( i, j , xmm1+xmm3 );
6376 C.store( i, j+1UL, xmm2+xmm4 );
// 1-vector strip, single-column variant; here the k loop runs up to K.
6381 const size_t kbegin( ( IsLower_v<MT5> )
6382 ?( ( IsUpper_v<MT4> )
6383 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6384 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6385 :( IsUpper_v<MT4> ? i : 0UL ) );
6391 for( ; (k+2UL) <= K; k+=2UL ) {
6392 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
6393 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
6397 xmm1 -= A.load(i,k) *
set( B(k,j) );
6400 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: leftover rows are handled element-wise (only when
// `remainder` is true), two columns at a time and then a final single column.
6404 for( ; remainder && i<M; ++i )
6406 const size_t jend(
LOW ? i+1UL : N );
6407 size_t j(
UPP ? i : 0UL );
6409 for( ; (j+2UL) <= jend; j+=2UL )
6411 const size_t kbegin( ( IsLower_v<MT5> )
6412 ?( ( IsUpper_v<MT4> )
6413 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6414 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6415 :( IsUpper_v<MT4> ? i : 0UL ) );
6416 const size_t kend( ( IsUpper_v<MT5> )
6417 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6423 for(
size_t k=kbegin; k<kend; ++k ) {
6424 value1 -= A(i,k) * B(k,j );
6425 value2 -= A(i,k) * B(k,j+1UL);
6429 C(i,j+1UL) = value2;
6434 const size_t kbegin( ( IsLower_v<MT5> )
6435 ?( ( IsUpper_v<MT4> )
6436 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6437 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6438 :( IsUpper_v<MT4> ? i : 0UL ) );
6442 for(
size_t k=kbegin; k<K; ++k ) {
6443 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false and simply forwards to
// selectDefaultSubAssignKernel( C, A, B ).
6467 template<
typename MT3
6470 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6471 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6473 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction-assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not visible in this extract, so the
// actual kernel it dispatches to cannot be documented from here.
6493 template<
typename MT3
6496 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6497 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS-based subtraction-assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseBlasKernel_v<MT3,MT4,MT5> is false and simply forwards to
// selectLargeSubAssignKernel( C, A, B ).
6523 template<
typename MT3
6526 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6527 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6529 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction-assignment kernel, overload enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true. Only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set.
//
// Three visible paths:
//  - A triangular: B is copied into a temporary of C's result type, trmm() is
//    applied with A from the left, and the temporary is subtracted from C.
//  - B triangular: A is copied into the temporary, trmm() is applied with B
//    from the right, and the temporary is subtracted from C.
//  - otherwise: a single gemm() call with the scalars ET(-1) and ET(1)
//    (presumably alpha = -1 and beta = 1, i.e. C = C - A*B; confirm against
//    the gemm wrapper's parameter order).
// NOTE(review): the matching #endif and the braces are not visible in this
// extract.
6535 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6549 template<
typename MT3
6552 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6553 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6555 using ET = ElementType_t<MT3>;
// Triangular left operand: tmp starts as a (serially evaluated) copy of B.
6557 if( IsTriangular_v<MT4> ) {
6558 ResultType_t<MT3> tmp(
serial( B ) );
6559 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6560 subAssign( C, tmp );
// Triangular right operand: tmp starts as a (serially evaluated) copy of A.
6562 else if( IsTriangular_v<MT5> ) {
6563 ResultType_t<MT3> tmp(
serial( A ) );
6564 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6565 subAssign( C, tmp );
// General case: one gemm call on C, A and B.
6568 gemm( C, A, B, ET(-1), ET(1) );
// Schur-product assignment of a TDMatDMatMultExpr to a dense matrix.
// The visible part forwards to schurAssign() on the unwrapped target (~lhs)
// with a local `tmp`.
// NOTE(review): the declaration of `tmp` is not visible in this extract;
// presumably it holds the evaluated product `rhs` - confirm.
6592 template<
typename MT
6594 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
6606 schurAssign( ~lhs, tmp );
// Function template restricted to the case
// IsEvaluationRequired_v<MT,MT1,MT2> (see the EnableIf_t return type).
// NOTE(review): the line carrying the function name and parameters is missing
// from this extract; presumably this is the SMP assignment to a dense matrix
// - confirm against the full file.
// Visible control flow: one branch for an empty target (zero rows or zero
// columns) and a separate branch for a left operand with zero columns.
6639 template<
typename MT
6642 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6649 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
6652 else if( rhs.lhs_.columns() == 0UL ) {
// Function template restricted to the case
// IsEvaluationRequired_v<MT,MT1,MT2> (see the EnableIf_t return type).
// NOTE(review): the line carrying the function name and parameters is missing
// from this extract; presumably this is the SMP assignment to a sparse matrix
// - confirm against the full file.
// Visible behavior: the expression `rhs` is evaluated into a temporary whose
// type is ResultType when SO is true and OppositeType otherwise; a
// ForwardFunctor instance is also created.
6688 template<
typename MT
6691 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6695 using TmpType = If_t< SO, ResultType, OppositeType >;
6707 const ForwardFunctor fwd;
6709 const TmpType tmp( rhs );
// Function template restricted to the case
// IsEvaluationRequired_v<MT,MT1,MT2> (see the EnableIf_t return type).
// NOTE(review): the line carrying the function name and parameters is missing
// from this extract; presumably this is the SMP addition assignment - confirm
// against the full file.
// Visible control flow: a single guard that triggers when the target has zero
// rows or columns, or when the left operand has zero columns.
6731 template<
typename MT
6734 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6741 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Function template restricted to the case
// IsEvaluationRequired_v<MT,MT1,MT2> (see the EnableIf_t return type).
// NOTE(review): the line carrying the function name and parameters is missing
// from this extract; presumably this is the SMP subtraction assignment -
// confirm against the full file.
// Visible control flow: a single guard that triggers when the target has zero
// rows or columns, or when the left operand has zero columns.
6780 template<
typename MT
6783 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6790 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6826 template<
typename MT
6886 template<
typename MT1
6893 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
6894 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
6895 ,
private Computation
6900 using MMM = TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
6902 using RES = ResultType_t<MMM>;
6903 using RT1 = ResultType_t<MT1>;
6904 using RT2 = ResultType_t<MT2>;
6905 using ET1 = ElementType_t<RT1>;
6906 using ET2 = ElementType_t<RT2>;
6907 using CT1 = CompositeType_t<MT1>;
6908 using CT2 = CompositeType_t<MT2>;
6913 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
6918 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
6922 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
6923 static constexpr
bool HERM = ( HF && !( LF || UF ) );
6924 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
6925 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
6933 template<
typename T1,
typename T2,
typename T3 >
6934 static constexpr
bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
// Compile-time predicate deciding whether the BLAS kernel may be used.
// T1 is tested for mutable data access, T2/T3 for const data access, and T4 is
// the scalar type (only used in the last condition).
// NOTE(review): the first line(s) of the condition are not visible in this
// excerpt; only the conjuncts listed below can be confirmed from here.
6941 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6942 static constexpr
bool UseBlasKernel_v =
// No declared structure (symmetric/Hermitian/lower/upper) on the product.
6944 !SYM && !HERM && !LOW && !UPP &&
// All three matrices must be contiguous; T1 with mutable, T2/T3 with const data access.
6945 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
6946 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
6947 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
// Neither operand may be diagonal.
6948 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
6949 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All element types must be BLAS compatible and identical to the element type of T1.
6950 IsBLASCompatible_v< ElementType_t<T1> > &&
6951 IsBLASCompatible_v< ElementType_t<T2> > &&
6952 IsBLASCompatible_v< ElementType_t<T3> > &&
6953 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
6954 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
// Excluded combination: built-in element type together with a complex scalar type.
6955 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time predicate deciding whether the vectorized default kernel may be
// used for target T1, operands T2/T3 and scalar type T4.
// NOTE(review): the conjuncts between the simdEnabled test and HasSIMDAdd_v, and
// everything after HasSIMDAdd_v, are not visible in this excerpt.
6962 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6963 static constexpr
bool UseVectorizedDefaultKernel_v =
6964 ( useOptimizedKernels &&
// Excluded: both operands diagonal, a diagonal left operand with a column-major
// target, and a diagonal right operand with a row-major target.
6965 !( IsDiagonal_v<T2> && IsDiagonal_v<T3> ) &&
6966 !( IsDiagonal_v<T2> && IsColumnMajorMatrix_v<T1> ) &&
6967 !( IsDiagonal_v<T3> && IsRowMajorMatrix_v<T1> ) &&
6968 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6973 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
6981 using ForwardFunctor =
If_t< HERM
6997 using This = DMatScalarMultExpr<MMM,ST,true>;
7000 using BaseType = DenseMatrix<This,true>;
7004 , DeclHermTrait< MultTrait_t<RES,ST> >
7006 , DeclSymTrait< MultTrait_t<RES,ST> >
7009 , DeclDiagTrait< MultTrait_t<RES,ST> >
7010 , DeclLowTrait< MultTrait_t<RES,ST> > >
7012 , DeclUppTrait< MultTrait_t<RES,ST> >
7013 , MultTrait<RES,ST> > > > >::Type;
7018 using SIMDType = SIMDTrait_t<ElementType>;
7023 using LeftOperand =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7029 using LT = If_t< evaluateLeft, const RT1, CT1 >;
7032 using RT = If_t< evaluateRight, const RT2, CT2 >;
7038 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
7039 MT1::simdEnabled && MT2::simdEnabled &&
7040 IsSIMDCombinable_v<ET1,ET2,ST> &&
7041 HasSIMDAdd_v<ET1,ET2> &&
7042 HasSIMDMult_v<ET1,ET2> );
7092 if( j >=
matrix_.columns() ) {
7095 return (*
this)(i,j);
7104 inline size_t rows()
const {
7114 inline size_t columns()
const {
// Returns whether this expression can alias with the given address.
// The query is forwarded unchanged to the wrapped multiplication expression
// matrix_.
7145 template<
typename T >
7146 inline bool canAlias(
const T* alias )
const {
7147 return matrix_.canAlias( alias );
// Returns whether this expression is aliased with the given address.
// The query is forwarded unchanged to the wrapped multiplication expression
// matrix_.
7157 template<
typename T >
7158 inline bool isAliased(
const T* alias )
const {
7159 return matrix_.isAliased( alias );
7180 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7182 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7183 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
7205 template<
typename MT
7214 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7215 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7217 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7220 else if( left.columns() == 0UL ) {
7235 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Dispatches the scaled assignment of A*B to C to one of two kernels.
// The small kernel is chosen when any of the following holds:
//   - both operands are diagonal,
//   - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// NOTE(review): the line between the two calls (presumably `else`) is not
// visible in this excerpt; the BLAS kernel appears to be the alternative branch.
7250 template<
typename MT3
7254 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7256 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
7257 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
7258 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
7259 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7260 selectSmallAssignKernel( C, A, B, scalar );
7262 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) assignment kernel for a row-major target C when
// neither A nor B is diagonal (see the EnableIf_t condition).
// C is filled row by row: the first contributing k initializes the row, the
// remaining k values are accumulated, and a final pass runs over the row again.
// NOTE(review): several lines (else-branches of the index computations, the
// bodies of the reset loops and of the final pass) are not visible in this
// excerpt; presumably the final pass applies `scalar` - confirm against the
// full source.
7280 template<
typename MT3
7284 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7285 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7287 const size_t M( A.rows() );
7288 const size_t N( B.columns() );
7289 const size_t K( A.columns() );
7293 for(
size_t i=0UL; i<M; ++i )
// k-range for row i: an upper A starts at i (i+1 if strictly upper), a lower A
// ends at i+1 (i if strictly lower).
7295 const size_t kbegin( ( IsUpper_v<MT4> )
7296 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7298 const size_t kend( ( IsLower_v<MT4> )
7299 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row is visited
// (loop body not visible here).
7303 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
7304 for(
size_t j=0UL; j<N; ++j ) {
// Column range [jbegin,jend) touched by the first k (k == kbegin); it is
// narrowed by the structure of B and by the UPP/LOW flags.
7311 const size_t jbegin( ( IsUpper_v<MT5> )
7312 ?( ( IsStrictlyUpper_v<MT5> )
7313 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
7314 :( UPP ?
max(i,kbegin) : kbegin ) )
7315 :( UPP ? i : 0UL ) );
7316 const size_t jend( ( IsLower_v<MT5> )
7317 ?( ( IsStrictlyLower_v<MT5> )
7318 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
7319 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
7320 :( LOW ? i+1UL : N ) );
// Columns in front of jbegin are visited separately (loop body not visible here).
7322 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7323 for(
size_t j=0UL; j<jbegin; ++j ) {
7327 else if( IsStrictlyUpper_v<MT5> ) {
// First k: plain assignment initializes the elements of the row.
7330 for(
size_t j=jbegin; j<jend; ++j ) {
7331 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend on are visited separately (loop body not visible here).
7333 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7334 for(
size_t j=jend; j<N; ++j ) {
7338 else if( IsStrictlyLower_v<MT5> ) {
7339 reset( C(i,N-1UL) );
// Remaining k values: accumulate into the already initialized elements.
7343 for(
size_t k=kbegin+1UL; k<kend; ++k )
7345 const size_t jbegin( ( IsUpper_v<MT5> )
7346 ?( ( IsStrictlyUpper_v<MT5> )
7347 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
7348 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
7349 :( SYM || HERM || UPP ? i : 0UL ) );
7350 const size_t jend( ( IsLower_v<MT5> )
7351 ?( ( IsStrictlyLower_v<MT5> )
7352 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
7353 :( LOW ?
min(i+1UL,k) : k ) )
7354 :( LOW ? i+1UL : N ) );
// With a declared structure the range can be empty/inverted; skip this k then.
7356 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7359 for(
size_t j=jbegin; j<jend; ++j ) {
7360 C(i,j) += A(i,k) * B(k,j);
// For a lower B, column jend is written by assignment rather than by
// accumulation for this k.
7362 if( IsLower_v<MT5> ) {
7363 C(i,jend) = A(i,k) * B(k,jend);
// Final pass over the row (body not visible here - presumably the scaling).
7368 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7369 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
7370 :( SYM || HERM || UPP ? i : 0UL ) );
7371 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7372 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
7373 :( LOW ? i+1UL : N ) );
7375 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7378 for(
size_t j=jbegin; j<jend; ++j ) {
// Mirror step: every element below the diagonal is copied from its transposed
// position (conjugated when HERM). NOTE(review): the guarding condition of this
// step is not visible in this excerpt.
7385 for(
size_t i=1UL; i<M; ++i ) {
7386 for(
size_t j=0UL; j<i; ++j ) {
7387 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default (non-vectorized) assignment kernel for a column-major target C when
// neither A nor B is diagonal (see the EnableIf_t condition).
// C is filled column by column: the first contributing k initializes the
// column, the remaining k values are accumulated, and a final pass runs over
// the column again.
// NOTE(review): several lines (else-branches of the index computations, the
// bodies of the reset loops and of the final pass) are not visible in this
// excerpt; presumably the final pass applies `scalar` - confirm against the
// full source.
7408 template<
typename MT3
7412 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7413 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7415 const size_t M( A.rows() );
7416 const size_t N( B.columns() );
7417 const size_t K( A.columns() );
7421 for(
size_t j=0UL; j<N; ++j )
// k-range for column j: a lower B starts at j (j+1 if strictly lower), an upper
// B ends at j+1 (j if strictly upper).
7423 const size_t kbegin( ( IsLower_v<MT5> )
7424 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7426 const size_t kend( ( IsUpper_v<MT5> )
7427 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Strictly triangular B with an empty k-range: the whole column is visited
// (loop body not visible here).
7431 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
7432 for(
size_t i=0UL; i<M; ++i ) {
// Row range [ibegin,iend) touched by the first k (k == kbegin); it is narrowed
// by the structure of A and by the LOW/UPP flags.
7439 const size_t ibegin( ( IsLower_v<MT4> )
7440 ?( ( IsStrictlyLower_v<MT4> )
7441 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
7442 :( LOW ?
max(j,kbegin) : kbegin ) )
7443 :( LOW ? j : 0UL ) );
7444 const size_t iend( ( IsUpper_v<MT4> )
7445 ?( ( IsStrictlyUpper_v<MT4> )
7446 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
7447 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
7448 :( UPP ? j+1UL : M ) );
// Rows in front of ibegin are visited separately (loop body not visible here).
7450 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7451 for(
size_t i=0UL; i<ibegin; ++i ) {
7455 else if( IsStrictlyLower_v<MT4> ) {
// First k: plain assignment initializes the elements of the column.
7458 for(
size_t i=ibegin; i<iend; ++i ) {
7459 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on are visited separately (loop body not visible here).
7461 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7462 for(
size_t i=iend; i<M; ++i ) {
7466 else if( IsStrictlyUpper_v<MT4> ) {
7467 reset( C(M-1UL,j) );
// Remaining k values: accumulate into the already initialized elements.
7471 for(
size_t k=kbegin+1UL; k<kend; ++k )
7473 const size_t ibegin( ( IsLower_v<MT4> )
7474 ?( ( IsStrictlyLower_v<MT4> )
7475 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
7476 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
7477 :( SYM || HERM || LOW ? j : 0UL ) );
7478 const size_t iend( ( IsUpper_v<MT4> )
7479 ?( ( IsStrictlyUpper_v<MT4> )
7480 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
7481 :( UPP ?
min(j+1UL,k) : k ) )
7482 :( UPP ? j+1UL : M ) );
// With a declared structure the range can be empty/inverted; skip this k then.
7484 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7487 for(
size_t i=ibegin; i<iend; ++i ) {
7488 C(i,j) += A(i,k) * B(k,j);
// For an upper A, row iend is written by assignment rather than by accumulation
// for this k.
7490 if( IsUpper_v<MT4> ) {
7491 C(iend,j) = A(iend,k) * B(k,j);
// Final pass over the column (body not visible here - presumably the scaling).
7496 const size_t ibegin( ( ( IsLower_v<MT4> && IsLower_v<MT5> ) )
7497 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
7498 :( SYM || HERM || LOW ? j : 0UL ) );
7499 const size_t iend( ( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) )
7500 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
7501 :( UPP ? j+1UL : M ) );
7503 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7506 for(
size_t i=ibegin; i<iend; ++i ) {
// Mirror step: every element above the diagonal is copied from its transposed
// position (conjugated when HERM). NOTE(review): the guarding condition of this
// step is not visible in this excerpt.
7513 for(
size_t j=1UL; j<N; ++j ) {
7514 for(
size_t i=0UL; i<j; ++i ) {
7515 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel for a row-major target C, a non-diagonal A and a
// diagonal B (see the EnableIf_t condition).
// Since B is diagonal only B(j,j) contributes, so each element is
// A(i,j) * B(j,j) * scalar. The matrix is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
// NOTE(review): the else-branches of jbegin/jpos and the bodies of the two
// guarded loops are not visible in this excerpt (presumably they reset the
// elements outside the triangular part of A).
7536 template<
typename MT3
7540 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7541 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7543 constexpr
size_t block( BLOCK_SIZE );
7545 const size_t M( A.rows() );
7546 const size_t N( B.columns() );
// Tile loops: rows [ii,iend) x columns [jj,jend), both clamped to the matrix size.
7548 for(
size_t ii=0UL; ii<M; ii+=block ) {
7549 const size_t iend(
min( M, ii+block ) );
7550 for(
size_t jj=0UL; jj<N; jj+=block ) {
7551 const size_t jend(
min( N, jj+block ) );
7552 for(
size_t i=ii; i<iend; ++i )
// Column sub-range of the tile that is computed for row i: an upper A starts at
// i (i+1 if strictly upper), a lower A ends at i+1 (i if strictly lower), both
// clamped to the tile.
7554 const size_t jbegin( ( IsUpper_v<MT4> )
7555 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
7557 const size_t jpos( ( IsLower_v<MT4> )
7558 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
// Tile columns in front of jbegin (upper A only).
7561 if( IsUpper_v<MT4> ) {
7562 for(
size_t j=jj; j<jbegin; ++j ) {
// Computed part of the row.
7566 for(
size_t j=jbegin; j<jpos; ++j ) {
7567 C(i,j) = A(i,j) * B(j,j) * scalar;
// Tile columns from jpos on (lower A only).
7569 if( IsLower_v<MT4> ) {
7570 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel for a column-major target C, a non-diagonal A and a
// diagonal B (see the EnableIf_t condition).
// Since B is diagonal only B(j,j) contributes, so each element of column j is
// A(i,j) * B(j,j) * scalar.
// NOTE(review): the else-branches of ibegin/iend and the bodies of the two
// guarded loops are not visible in this excerpt (presumably they reset the
// elements outside the triangular part of A).
7594 template<
typename MT3
7598 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7599 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7601 const size_t M( A.rows() );
7602 const size_t N( B.columns() );
7604 for(
size_t j=0UL; j<N; ++j )
// Row range computed for column j: a lower A starts at j (j+1 if strictly
// lower), an upper A ends at j+1 (j if strictly upper).
7606 const size_t ibegin( ( IsLower_v<MT4> )
7607 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7609 const size_t iend( ( IsUpper_v<MT4> )
7610 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows in front of ibegin (lower A only).
7614 if( IsLower_v<MT4> ) {
7615 for(
size_t i=0UL; i<ibegin; ++i ) {
// Computed part of the column.
7619 for(
size_t i=ibegin; i<iend; ++i ) {
7620 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend on (upper A only).
7622 if( IsUpper_v<MT4> ) {
7623 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for a row-major target C, a diagonal A and a
// non-diagonal B (see the EnableIf_t condition).
// Since A is diagonal only A(i,i) contributes, so each element of row i is
// A(i,i) * B(i,j) * scalar.
// NOTE(review): the else-branches of jbegin/jend and the bodies of the two
// guarded loops are not visible in this excerpt (presumably they reset the
// elements outside the triangular part of B).
7645 template<
typename MT3
7649 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7650 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7652 const size_t M( A.rows() );
7653 const size_t N( B.columns() );
7655 for(
size_t i=0UL; i<M; ++i )
// Column range computed for row i: an upper B starts at i (i+1 if strictly
// upper), a lower B ends at i+1 (i if strictly lower).
7657 const size_t jbegin( ( IsUpper_v<MT5> )
7658 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7660 const size_t jend( ( IsLower_v<MT5> )
7661 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Columns in front of jbegin (upper B only).
7665 if( IsUpper_v<MT5> ) {
7666 for(
size_t j=0UL; j<jbegin; ++j ) {
// Computed part of the row.
7670 for(
size_t j=jbegin; j<jend; ++j ) {
7671 C(i,j) = A(i,i) * B(i,j) * scalar;
// Columns from jend on (lower B only).
7673 if( IsLower_v<MT5> ) {
7674 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for a column-major target C, a diagonal A and a
// non-diagonal B (see the EnableIf_t condition).
// Since A is diagonal only A(i,i) contributes, so each element is
// A(i,i) * B(i,j) * scalar. The matrix is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
// NOTE(review): the else-branches of ibegin/ipos and the bodies of the two
// guarded loops are not visible in this excerpt (presumably they reset the
// elements outside the triangular part of B).
7696 template<
typename MT3
7700 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7701 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7703 constexpr
size_t block( BLOCK_SIZE );
7705 const size_t M( A.rows() );
7706 const size_t N( B.columns() );
// Tile loops: columns [jj,jend) x rows [ii,iend), both clamped to the matrix size.
7708 for(
size_t jj=0UL; jj<N; jj+=block ) {
7709 const size_t jend(
min( N, jj+block ) );
7710 for(
size_t ii=0UL; ii<M; ii+=block ) {
7711 const size_t iend(
min( M, ii+block ) );
7712 for(
size_t j=jj; j<jend; ++j )
// Row sub-range of the tile that is computed for column j: a lower B starts at
// j (j+1 if strictly lower), an upper B ends at j+1 (j if strictly upper), both
// clamped to the tile.
7714 const size_t ibegin( ( IsLower_v<MT5> )
7715 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
7717 const size_t ipos( ( IsUpper_v<MT5> )
7718 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
// Tile rows in front of ibegin (lower B only).
7721 if( IsLower_v<MT5> ) {
7722 for(
size_t i=ii; i<ibegin; ++i ) {
// Computed part of the column.
7726 for(
size_t i=ibegin; i<ipos; ++i ) {
7727 C(i,j) = A(i,i) * B(i,j) * scalar;
// Tile rows from ipos on (upper B only).
7729 if( IsUpper_v<MT5> ) {
7730 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel for two diagonal operands (see the EnableIf_t
// condition; the storage order of C is not constrained here).
// Only the diagonal of C is written: C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): the statement between the signature and the loop is not visible
// in this excerpt (presumably C is reset first so that the off-diagonal
// elements are zero) - confirm against the full source.
7754 template<
typename MT3
7758 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7759 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7763 for(
size_t i=0UL; i<A.rows(); ++i ) {
7764 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel used when the vectorized default kernel is not
// applicable for this combination of types (DisableIf_t on
// UseVectorizedDefaultKernel_v). It simply forwards to the default kernel.
7783 template<
typename MT3
7787 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7788 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7790 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel for a row-major target C (enabled
// when UseVectorizedDefaultKernel_v holds for these types).
//
// The columns of C are processed in panels of decreasing width: 8, 5, 4, 3, 2
// and 1 SIMD vectors per row, followed by a scalar loop for the remaining
// columns. Inside a panel several rows of C are accumulated at once in SIMD
// accumulators (xmm*), which are multiplied by `factor` when stored.
//
// The k-range of every tile follows the same scheme:
//   kbegin - for an upper A it starts at the row index (plus one if strictly
//            upper); for a lower B it starts at the first column j of the
//            panel; if both apply the larger value is taken; otherwise 0.
//   kend   - for a lower A it ends behind the last row of the tile (one less if
//            strictly lower); for an upper B it ends at the end of the panel,
//            clamped to K; if both apply the smaller value is taken;
//            otherwise K.
//
// NOTE(review): this excerpt omits a number of original lines (declarations of
// i/j/k, several loop and branch headers, the accumulation statements of the
// multi-row tiles, closing braces). Comments below only describe what the
// visible lines establish; everything else is marked as an assumption.
7809 template<
typename MT3
7813 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7814 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both C and B are padded.
7816 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7818 const size_t M( A.rows() );
7819 const size_t N( B.columns() );
7820 const size_t K( A.columns() );
// End of the vectorized column range. With `remainder` the low bits of N are
// masked off (for a power-of-two SIMDSIZE this rounds N down to a multiple of
// SIMDSIZE); otherwise all N columns are vectorized.
7824 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// SIMD value created from the scalar; applied to every accumulator on store.
7827 const SIMDType factor(
set( scalar ) );
// NOTE(review): the body of this branch is not visible here (presumably C is
// reset when both LOW and UPP are set).
7829 if( LOW && UPP && N >
SIMDSIZE*3UL ) {
// ---- Panel of 8 SIMD vectors, one row at a time ------------------------------
// Only taken without any declared structure. NOTE(review): the relation of the
// IsIntegral_v test to the loop below (brace structure) is not visible here.
7836 if( IsIntegral_v<ElementType> )
7838 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*7UL) < jpos; j+=
SIMDSIZE*8UL ) {
7839 for(
size_t i=0UL; i<M; ++i )
7841 const size_t kbegin( ( IsUpper_v<MT4> )
7842 ?( ( IsLower_v<MT5> )
7843 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7844 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7845 :( IsLower_v<MT5> ? j : 0UL ) );
7846 const size_t kend( ( IsLower_v<MT4> )
7847 ?( ( IsUpper_v<MT5> )
7848 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
7849 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7850 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
7852 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// A(i,k) is expanded to a SIMD value and multiplied with eight consecutive
// vectors of row k of B.
7854 for(
size_t k=kbegin; k<kend; ++k ) {
7855 const SIMDType a1(
set( A(i,k) ) );
7856 xmm1 += a1 * B.load(k,j );
7857 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7858 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7859 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7860 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7861 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
7862 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
7863 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
7866 C.store( i, j , xmm1 * factor );
7867 C.store( i, j+
SIMDSIZE , xmm2 * factor );
7868 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
7869 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
7870 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
7871 C.store( i, j+
SIMDSIZE*5UL, xmm6 * factor );
7872 C.store( i, j+
SIMDSIZE*6UL, xmm7 * factor );
7873 C.store( i, j+
SIMDSIZE*7UL, xmm8 * factor );
// ---- Panel of 5 SIMD vectors --------------------------------------------------
// Only taken without any declared structure. Rows are processed in pairs.
7878 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*4UL) < jpos; j+=
SIMDSIZE*5UL )
7882 for( ; (i+2UL) <= M; i+=2UL )
7884 const size_t kbegin( ( IsUpper_v<MT4> )
7885 ?( ( IsLower_v<MT5> )
7886 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7887 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7888 :( IsLower_v<MT5> ? j : 0UL ) );
7889 const size_t kend( ( IsLower_v<MT4> )
7890 ?( ( IsUpper_v<MT5> )
7891 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
7892 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7893 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
7895 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
// NOTE(review): the accumulation statements combining a1/a2 with b1..b5 are not
// visible in this excerpt.
7897 for(
size_t k=kbegin; k<kend; ++k ) {
7898 const SIMDType a1(
set( A(i ,k) ) );
7899 const SIMDType a2(
set( A(i+1UL,k) ) );
7900 const SIMDType b1( B.load(k,j ) );
7901 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7902 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7903 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
7904 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
// xmm1..xmm5 go to row i, xmm6..xmm10 to row i+1.
7917 C.store( i , j , xmm1 * factor );
7918 C.store( i , j+
SIMDSIZE , xmm2 * factor );
7919 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
7920 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
7921 C.store( i , j+
SIMDSIZE*4UL, xmm5 * factor );
7922 C.store( i+1UL, j , xmm6 * factor );
7923 C.store( i+1UL, j+
SIMDSIZE , xmm7 * factor );
7924 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 * factor );
7925 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 * factor );
7926 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 * factor );
// Single-row tile of the 5-vector panel. NOTE(review): its guarding header is
// not visible here (presumably handles a leftover row when M is odd).
7931 const size_t kbegin( ( IsUpper_v<MT4> )
7932 ?( ( IsLower_v<MT5> )
7933 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7934 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7935 :( IsLower_v<MT5> ? j : 0UL ) );
7936 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
7938 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7940 for(
size_t k=kbegin; k<kend; ++k ) {
7941 const SIMDType a1(
set( A(i,k) ) );
7942 xmm1 += a1 * B.load(k,j );
7943 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7944 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7945 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7946 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7949 C.store( i, j , xmm1 * factor );
7950 C.store( i, j+
SIMDSIZE , xmm2 * factor );
7951 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
7952 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
7953 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
// ---- Panel of 4 SIMD vectors --------------------------------------------------
// NOTE(review): the j-loop header of this panel is not visible here.
// Row range of the panel: with SYM/HERM/UPP it ends at the end of the panel
// (clamped to M); with LOW it starts at row j.
7959 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE*4UL,M) : M );
7960 size_t i( LOW ? j : 0UL );
7962 for( ; (i+2UL) <= iend; i+=2UL )
7964 const size_t kbegin( ( IsUpper_v<MT4> )
7965 ?( ( IsLower_v<MT5> )
7966 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7967 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7968 :( IsLower_v<MT5> ? j : 0UL ) );
7969 const size_t kend( ( IsLower_v<MT4> )
7970 ?( ( IsUpper_v<MT5> )
7971 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
7972 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7973 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
7975 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// NOTE(review): the accumulation statements combining a1/a2 with b1..b4 are not
// visible in this excerpt.
7977 for(
size_t k=kbegin; k<kend; ++k ) {
7978 const SIMDType a1(
set( A(i ,k) ) );
7979 const SIMDType a2(
set( A(i+1UL,k) ) );
7980 const SIMDType b1( B.load(k,j ) );
7981 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7982 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7983 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
// xmm1..xmm4 go to row i, xmm5..xmm8 to row i+1.
7994 C.store( i , j , xmm1 * factor );
7995 C.store( i , j+
SIMDSIZE , xmm2 * factor );
7996 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
7997 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
7998 C.store( i+1UL, j , xmm5 * factor );
7999 C.store( i+1UL, j+
SIMDSIZE , xmm6 * factor );
8000 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 * factor );
8001 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 * factor );
// Single-row tile of the 4-vector panel (guarding header not visible here).
8006 const size_t kbegin( ( IsUpper_v<MT4> )
8007 ?( ( IsLower_v<MT5> )
8008 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8009 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8010 :( IsLower_v<MT5> ? j : 0UL ) );
8011 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
8013 SIMDType xmm1, xmm2, xmm3, xmm4;
8015 for(
size_t k=kbegin; k<kend; ++k ) {
8016 const SIMDType a1(
set( A(i,k) ) );
8017 xmm1 += a1 * B.load(k,j );
8018 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8019 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8020 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8023 C.store( i, j , xmm1 * factor );
8024 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8025 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8026 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
// ---- Panel of 3 SIMD vectors --------------------------------------------------
// NOTE(review): the j-loop header of this panel is not visible here.
8032 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE*3UL,M) : M );
8033 size_t i( LOW ? j : 0UL );
8035 for( ; (i+2UL) <= iend; i+=2UL )
8037 const size_t kbegin( ( IsUpper_v<MT4> )
8038 ?( ( IsLower_v<MT5> )
8039 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8040 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8041 :( IsLower_v<MT5> ? j : 0UL ) );
8042 const size_t kend( ( IsLower_v<MT4> )
8043 ?( ( IsUpper_v<MT5> )
8044 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
8045 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8046 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
8048 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
// NOTE(review): the accumulation statements combining a1/a2 with b1..b3 are not
// visible in this excerpt.
8050 for(
size_t k=kbegin; k<kend; ++k ) {
8051 const SIMDType a1(
set( A(i ,k) ) );
8052 const SIMDType a2(
set( A(i+1UL,k) ) );
8053 const SIMDType b1( B.load(k,j ) );
8054 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8055 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
// xmm1..xmm3 go to row i, xmm4..xmm6 to row i+1.
8064 C.store( i , j , xmm1 * factor );
8065 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8066 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8067 C.store( i+1UL, j , xmm4 * factor );
8068 C.store( i+1UL, j+
SIMDSIZE , xmm5 * factor );
8069 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 * factor );
// Single-row tile of the 3-vector panel (guarding header not visible here).
8074 const size_t kbegin( ( IsUpper_v<MT4> )
8075 ?( ( IsLower_v<MT5> )
8076 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8077 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8078 :( IsLower_v<MT5> ? j : 0UL ) );
8079 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
8081 SIMDType xmm1, xmm2, xmm3;
8083 for(
size_t k=kbegin; k<kend; ++k ) {
8084 const SIMDType a1(
set( A(i,k) ) );
8085 xmm1 += a1 * B.load(k,j );
8086 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8087 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8090 C.store( i, j , xmm1 * factor );
8091 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8092 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
// ---- Panel of 2 SIMD vectors --------------------------------------------------
// NOTE(review): the j-loop header of this panel is not visible here.
// Rows are processed in tiles of 4, then 3, then 2 rows, then a single row.
8098 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
8099 size_t i( LOW ? j : 0UL );
// 4-row tile.
8101 for( ; (i+4UL) <= iend; i+=4UL )
8103 const size_t kbegin( ( IsUpper_v<MT4> )
8104 ?( ( IsLower_v<MT5> )
8105 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8106 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8107 :( IsLower_v<MT5> ? j : 0UL ) );
8108 const size_t kend( ( IsLower_v<MT4> )
8109 ?( ( IsUpper_v<MT5> )
8110 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
8111 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8112 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8114 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// NOTE(review): the accumulation statements combining a1..a4 with b1/b2 are not
// visible in this excerpt.
8116 for(
size_t k=kbegin; k<kend; ++k ) {
8117 const SIMDType a1(
set( A(i ,k) ) );
8118 const SIMDType a2(
set( A(i+1UL,k) ) );
8119 const SIMDType a3(
set( A(i+2UL,k) ) );
8120 const SIMDType a4(
set( A(i+3UL,k) ) );
8121 const SIMDType b1( B.load(k,j ) );
8122 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
// Two accumulators per row: (xmm1,xmm2) -> row i, ..., (xmm7,xmm8) -> row i+3.
8133 C.store( i , j , xmm1 * factor );
8134 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8135 C.store( i+1UL, j , xmm3 * factor );
8136 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8137 C.store( i+2UL, j , xmm5 * factor );
8138 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
8139 C.store( i+3UL, j , xmm7 * factor );
8140 C.store( i+3UL, j+
SIMDSIZE, xmm8 * factor );
// 3-row tile.
8143 for( ; (i+3UL) <= iend; i+=3UL )
8145 const size_t kbegin( ( IsUpper_v<MT4> )
8146 ?( ( IsLower_v<MT5> )
8147 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8148 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8149 :( IsLower_v<MT5> ? j : 0UL ) );
8150 const size_t kend( ( IsLower_v<MT4> )
8151 ?( ( IsUpper_v<MT5> )
8152 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
8153 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8154 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8156 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
// NOTE(review): the accumulation statements combining a1..a3 with b1/b2 are not
// visible in this excerpt.
8158 for(
size_t k=kbegin; k<kend; ++k ) {
8159 const SIMDType a1(
set( A(i ,k) ) );
8160 const SIMDType a2(
set( A(i+1UL,k) ) );
8161 const SIMDType a3(
set( A(i+2UL,k) ) );
8162 const SIMDType b1( B.load(k,j ) );
8163 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8172 C.store( i , j , xmm1 * factor );
8173 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8174 C.store( i+1UL, j , xmm3 * factor );
8175 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8176 C.store( i+2UL, j , xmm5 * factor );
8177 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
// 2-row tile. The k-loop is additionally unrolled by two; the two partial sums
// per element (xmm1..4 and xmm5..8) are added together when stored.
8180 for( ; (i+2UL) <= iend; i+=2UL )
8182 const size_t kbegin( ( IsUpper_v<MT4> )
8183 ?( ( IsLower_v<MT5> )
8184 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8185 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8186 :( IsLower_v<MT5> ? j : 0UL ) );
8187 const size_t kend( ( IsLower_v<MT4> )
8188 ?( ( IsUpper_v<MT5> )
8189 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
8190 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8191 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8193 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Two k values per iteration (accumulation statements not visible here).
8196 for( ; (k+2UL) <= kend; k+=2UL ) {
8197 const SIMDType a1(
set( A(i ,k ) ) );
8198 const SIMDType a2(
set( A(i+1UL,k ) ) );
8199 const SIMDType a3(
set( A(i ,k+1UL) ) );
8200 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
8201 const SIMDType b1( B.load(k ,j ) );
8202 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
8203 const SIMDType b3( B.load(k+1UL,j ) );
8204 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k value (accumulation statements not visible here).
8215 for( ; k<kend; ++k ) {
8216 const SIMDType a1(
set( A(i ,k) ) );
8217 const SIMDType a2(
set( A(i+1UL,k) ) );
8218 const SIMDType b1( B.load(k,j ) );
8219 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8226 C.store( i , j , (xmm1+xmm5) * factor );
8227 C.store( i , j+
SIMDSIZE, (xmm2+xmm6) * factor );
8228 C.store( i+1UL, j , (xmm3+xmm7) * factor );
8229 C.store( i+1UL, j+
SIMDSIZE, (xmm4+xmm8) * factor );
// Single-row tile of the 2-vector panel (guarding header not visible here);
// the k-loop is unrolled by two as well.
8234 const size_t kbegin( ( IsUpper_v<MT4> )
8235 ?( ( IsLower_v<MT5> )
8236 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8237 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8238 :( IsLower_v<MT5> ? j : 0UL ) );
8239 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
8241 SIMDType xmm1, xmm2, xmm3, xmm4;
8244 for( ; (k+2UL) <= kend; k+=2UL ) {
8245 const SIMDType a1(
set( A(i,k ) ) );
8246 const SIMDType a2(
set( A(i,k+1UL) ) );
8247 xmm1 += a1 * B.load(k ,j );
8248 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
8249 xmm3 += a2 * B.load(k+1UL,j );
8250 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
// Leftover k value. NOTE(review): only the update of xmm1 is visible in this
// excerpt; the matching update for the second vector is presumably on an
// omitted line.
8253 for( ; k<kend; ++k ) {
8254 const SIMDType a1(
set( A(i,k) ) );
8255 xmm1 += a1 * B.load(k,j );
8259 C.store( i, j , (xmm1+xmm3) * factor );
8260 C.store( i, j+
SIMDSIZE, (xmm2+xmm4) * factor );
// ---- Panel of 1 SIMD vector ---------------------------------------------------
// NOTE(review): the j-loop header of this panel is not visible here.
// Rows are processed in tiles of 4, then 3, then 2 rows, then a single row; all
// k-loops are unrolled by two with separate partial sums.
8266 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE,M) : M );
8267 size_t i( LOW ? j : 0UL );
// 4-row tile.
8269 for( ; (i+4UL) <= iend; i+=4UL )
8271 const size_t kbegin( ( IsUpper_v<MT4> )
8272 ?( ( IsLower_v<MT5> )
8273 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8274 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8275 :( IsLower_v<MT5> ? j : 0UL ) );
8276 const size_t kend( ( IsLower_v<MT4> )
8277 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8280 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// xmm1..xmm4 collect the contributions of k, xmm5..xmm8 those of k+1.
8283 for( ; (k+2UL) <= kend; k+=2UL ) {
8284 const SIMDType b1( B.load(k ,j) );
8285 const SIMDType b2( B.load(k+1UL,j) );
8286 xmm1 +=
set( A(i ,k ) ) * b1;
8287 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8288 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8289 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8290 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8291 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8292 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8293 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
// Leftover k value.
8296 for( ; k<kend; ++k ) {
8297 const SIMDType b1( B.load(k,j) );
8298 xmm1 +=
set( A(i ,k) ) * b1;
8299 xmm2 +=
set( A(i+1UL,k) ) * b1;
8300 xmm3 +=
set( A(i+2UL,k) ) * b1;
8301 xmm4 +=
set( A(i+3UL,k) ) * b1;
8304 C.store( i , j, (xmm1+xmm5) * factor );
8305 C.store( i+1UL, j, (xmm2+xmm6) * factor );
8306 C.store( i+2UL, j, (xmm3+xmm7) * factor );
8307 C.store( i+3UL, j, (xmm4+xmm8) * factor );
// 3-row tile.
8310 for( ; (i+3UL) <= iend; i+=3UL )
8312 const size_t kbegin( ( IsUpper_v<MT4> )
8313 ?( ( IsLower_v<MT5> )
8314 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8315 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8316 :( IsLower_v<MT5> ? j : 0UL ) );
8317 const size_t kend( ( IsLower_v<MT4> )
8318 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8321 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
// xmm1..xmm3 collect the contributions of k, xmm4..xmm6 those of k+1.
8324 for( ; (k+2UL) <= kend; k+=2UL ) {
8325 const SIMDType b1( B.load(k ,j) );
8326 const SIMDType b2( B.load(k+1UL,j) );
8327 xmm1 +=
set( A(i ,k ) ) * b1;
8328 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8329 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8330 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8331 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8332 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
// Leftover k value.
8335 for( ; k<kend; ++k ) {
8336 const SIMDType b1( B.load(k,j) );
8337 xmm1 +=
set( A(i ,k) ) * b1;
8338 xmm2 +=
set( A(i+1UL,k) ) * b1;
8339 xmm3 +=
set( A(i+2UL,k) ) * b1;
8342 C.store( i , j, (xmm1+xmm4) * factor );
8343 C.store( i+1UL, j, (xmm2+xmm5) * factor );
8344 C.store( i+2UL, j, (xmm3+xmm6) * factor );
// 2-row tile.
8347 for( ; (i+2UL) <= iend; i+=2UL )
8349 const size_t kbegin( ( IsUpper_v<MT4> )
8350 ?( ( IsLower_v<MT5> )
8351 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8352 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8353 :( IsLower_v<MT5> ? j : 0UL ) );
8354 const size_t kend( ( IsLower_v<MT4> )
8355 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8358 SIMDType xmm1, xmm2, xmm3, xmm4;
// xmm1/xmm2 collect the contributions of k, xmm3/xmm4 those of k+1.
8361 for( ; (k+2UL) <= kend; k+=2UL ) {
8362 const SIMDType b1( B.load(k ,j) );
8363 const SIMDType b2( B.load(k+1UL,j) );
8364 xmm1 +=
set( A(i ,k ) ) * b1;
8365 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8366 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8367 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
// Leftover k value.
8370 for( ; k<kend; ++k ) {
8371 const SIMDType b1( B.load(k,j) );
8372 xmm1 +=
set( A(i ,k) ) * b1;
8373 xmm2 +=
set( A(i+1UL,k) ) * b1;
8376 C.store( i , j, (xmm1+xmm3) * factor );
8377 C.store( i+1UL, j, (xmm2+xmm4) * factor );
// Single-row tile of the 1-vector panel (guarding header not visible here).
// The k-range runs up to K here.
8382 const size_t kbegin( ( IsUpper_v<MT4> )
8383 ?( ( IsLower_v<MT5> )
8384 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8385 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8386 :( IsLower_v<MT5> ? j : 0UL ) );
8388 SIMDType xmm1, xmm2;
8391 for( ; (k+2UL) <= K; k+=2UL ) {
8392 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8393 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
// Leftover k value (its loop header is not visible in this excerpt).
8397 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8400 C.store( i, j, (xmm1+xmm2) * factor );
// ---- Scalar remainder columns -------------------------------------------------
// Only executed when `remainder` is set; handles the columns j < N left over by
// the SIMD panels element by element, scaling with `scalar` instead of `factor`.
8404 for( ; remainder && j<N; ++j )
8406 size_t i( LOW && UPP ? j : 0UL );
// Two rows at a time.
8408 for( ; (i+2UL) <= M; i+=2UL )
8410 const size_t kbegin( ( IsUpper_v<MT4> )
8411 ?( ( IsLower_v<MT5> )
8412 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8413 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8414 :( IsLower_v<MT5> ? j : 0UL ) );
8415 const size_t kend( ( IsLower_v<MT4> )
8416 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
// NOTE(review): the declarations of value1/value2 are not visible here.
8422 for(
size_t k=kbegin; k<kend; ++k ) {
8423 value1 += A(i ,k) * B(k,j);
8424 value2 += A(i+1UL,k) * B(k,j);
8427 C(i ,j) = value1 * scalar;
8428 C(i+1UL,j) = value2 * scalar;
// Single leftover row (guarding header and declaration of `value` not visible
// here).
8433 const size_t kbegin( ( IsUpper_v<MT4> )
8434 ?( ( IsLower_v<MT5> )
8435 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8436 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8437 :( IsLower_v<MT5> ? j : 0UL ) );
8441 for(
size_t k=kbegin; k<K; ++k ) {
8442 value += A(i,k) * B(k,j);
8445 C(i,j) = value * scalar;
// ---- Post-processing for declared structures (only for N > SIMDSIZE*4) --------
// SYM/HERM: elements C(i,j) with j < jend are copied from the transposed
// position (conjugated when HERM). NOTE(review): the definition of jend is not
// visible in this excerpt.
8450 if( ( SYM || HERM ) && ( N >
SIMDSIZE*4UL ) ) {
8451 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
8453 for(
size_t j=0UL; j<jend; ++j ) {
8454 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// LOW only: loop over part of the columns j >= SIMDSIZE*4 (definition of iend
// and the loop body are not visible here; presumably the elements are reset).
8458 else if( LOW && !UPP && N >
SIMDSIZE*4UL ) {
8459 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
8461 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: loop over part of the rows i >= SIMDSIZE*4 (definition of jend and
// the loop body are not visible here; presumably the elements are reset).
8466 else if( !LOW && UPP && N >
SIMDSIZE*4UL ) {
8467 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
8469 for(
size_t j=0UL; j<jend; ++j ) {
// Vectorized "small" assignment kernel for a column-major target matrix.
// Every visible store writes (sum over k of A(.,k) * B(k,j)) * scalar into C, i.e. the
// kernel computes C = scalar * ( A * B ).
//
// Selected (via EnableIf_t) only when MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
//
// Parameters: C - target matrix, A - left-hand operand, B - right-hand operand,
//             scalar - factor every accumulated value is multiplied with before it is stored.
//
// NOTE(review): this excerpt omits a number of original lines (several loop headers, braces and
// accumulator updates). The comments below describe only what the visible statements show.
8492 template<
typename MT3
8496 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8497 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is required unless both C (MT3) and A (MT4) are padded.
8499 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
// Dimensions: rows of A, columns of B, and the shared inner dimension.
8501 const size_t M( A.rows() );
8502 const size_t N( B.columns() );
8503 const size_t K( A.columns() );
// ipos is the row bound for the SIMD panels: M masked with -SIMDSIZE when a remainder
// exists, otherwise M itself. Rows from the last panel up to M go to the scalar loop below.
8507 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// SIMD operand built once from the scalar; every store multiplies its accumulator by it.
8510 const SIMDType factor(
set( scalar ) );
// Special case when the result is flagged both LOW and UPP and has more than three SIMD
// vectors of rows; the guarded statement is not visible in this excerpt.
8512 if( LOW && UPP && M >
SIMDSIZE*3UL ) {
// Integral element types additionally get a panel that is eight SIMD vectors tall.
8519 if( IsIntegral_v<ElementType> )
// 8-vector panel: only for results without SYM/HERM/LOW/UPP structure.
8521 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*7UL) < ipos; i+=
SIMDSIZE*8UL ) {
8522 for(
size_t j=0UL; j<N; ++j )
// Narrow [kbegin,kend) according to the triangular structure flags of A (MT4) and B (MT5).
8524 const size_t kbegin( ( IsLower_v<MT5> )
8525 ?( ( IsUpper_v<MT4> )
8526 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8527 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8528 :( IsUpper_v<MT4> ? i : 0UL ) );
8529 const size_t kend( ( IsUpper_v<MT5> )
8530 ?( ( IsLower_v<MT4> )
8531 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
8532 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
8533 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMD vector of rows in the panel.
8535 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8537 for(
size_t k=kbegin; k<kend; ++k ) {
// B(k,j) is turned into a SIMD operand once and reused for all eight row vectors.
8538 const SIMDType b1(
set( B(k,j) ) );
8539 xmm1 += A.load(i ,k) * b1;
8540 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8541 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8542 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8543 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
8544 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
8545 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
8546 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Scale the accumulators and write the eight row vectors of column j.
8549 C.store( i , j, xmm1 * factor );
8550 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8551 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
8552 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
8553 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
8554 C.store( i+
SIMDSIZE*5UL, j, xmm6 * factor );
8555 C.store( i+
SIMDSIZE*6UL, j, xmm7 * factor );
8556 C.store( i+
SIMDSIZE*7UL, j, xmm8 * factor );
// 5-vector panel: again only for results without SYM/HERM/LOW/UPP structure.
8561 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*4UL) < ipos; i+=
SIMDSIZE*5UL )
// Columns are processed in pairs first.
8565 for( ; (j+2UL) <= N; j+=2UL )
8567 const size_t kbegin( ( IsLower_v<MT5> )
8568 ?( ( IsUpper_v<MT4> )
8569 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8570 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8571 :( IsUpper_v<MT4> ? i : 0UL ) );
8572 const size_t kend( ( IsUpper_v<MT5> )
8573 ?( ( IsLower_v<MT4> )
8574 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8575 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8576 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// Ten accumulators: xmm1..xmm5 are stored to column j, xmm6..xmm10 to column j+1.
8578 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8580 for(
size_t k=kbegin; k<kend; ++k ) {
8581 const SIMDType a1( A.load(i ,k) );
8582 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8583 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8584 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8585 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
8586 const SIMDType b1(
set( B(k,j ) ) );
8587 const SIMDType b2(
set( B(k,j+1UL) ) );
// (The accumulator updates of this loop are not visible in this excerpt.)
8600 C.store( i , j , xmm1 * factor );
8601 C.store( i+
SIMDSIZE , j , xmm2 * factor );
8602 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
8603 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
8604 C.store( i+
SIMDSIZE*4UL, j , xmm5 * factor );
8605 C.store( i , j+1UL, xmm6 * factor );
8606 C.store( i+
SIMDSIZE , j+1UL, xmm7 * factor );
8607 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8608 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8609 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 * factor );
// Single-column variant of the 5-vector panel (presumably the leftover column; its guard
// is not visible in this excerpt).
8614 const size_t kbegin( ( IsLower_v<MT5> )
8615 ?( ( IsUpper_v<MT4> )
8616 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8617 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8618 :( IsUpper_v<MT4> ? i : 0UL ) );
8619 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
8621 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8623 for(
size_t k=kbegin; k<kend; ++k ) {
8624 const SIMDType b1(
set( B(k,j) ) );
8625 xmm1 += A.load(i ,k) * b1;
8626 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8627 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8628 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8629 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
8632 C.store( i , j, xmm1 * factor );
8633 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8634 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
8635 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
8636 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
// Section working on four SIMD vectors of rows (presumably the 4-vector panel; its loop
// header is not visible). Column range: SYM/HERM/LOW results stop at min(i+SIMDSIZE*4, N),
// UPP results start at column i.
8642 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE*4UL,N) : N );
8643 size_t j( UPP ? i : 0UL );
8645 for( ; (j+2UL) <= jend; j+=2UL )
8647 const size_t kbegin( ( IsLower_v<MT5> )
8648 ?( ( IsUpper_v<MT4> )
8649 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8650 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8651 :( IsUpper_v<MT4> ? i : 0UL ) );
8652 const size_t kend( ( IsUpper_v<MT5> )
8653 ?( ( IsLower_v<MT4> )
8654 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8655 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8656 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 are stored to column j, xmm5..xmm8 to column j+1.
8658 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8660 for(
size_t k=kbegin; k<kend; ++k ) {
8661 const SIMDType a1( A.load(i ,k) );
8662 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8663 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8664 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8665 const SIMDType b1(
set( B(k,j ) ) );
8666 const SIMDType b2(
set( B(k,j+1UL) ) );
8677 C.store( i , j , xmm1 * factor );
8678 C.store( i+
SIMDSIZE , j , xmm2 * factor );
8679 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
8680 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
8681 C.store( i , j+1UL, xmm5 * factor );
8682 C.store( i+
SIMDSIZE , j+1UL, xmm6 * factor );
8683 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8684 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 * factor );
// Single-column variant for four SIMD vectors of rows.
8689 const size_t kbegin( ( IsLower_v<MT5> )
8690 ?( ( IsUpper_v<MT4> )
8691 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8692 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8693 :( IsUpper_v<MT4> ? i : 0UL ) );
8694 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
8696 SIMDType xmm1, xmm2, xmm3, xmm4;
8698 for(
size_t k=kbegin; k<kend; ++k ) {
8699 const SIMDType b1(
set( B(k,j) ) );
8700 xmm1 += A.load(i ,k) * b1;
8701 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8702 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8703 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8706 C.store( i , j, xmm1 * factor );
8707 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8708 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
8709 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
// Section working on three SIMD vectors of rows (loop header not visible); same column
// range rule as above with i+SIMDSIZE*3.
8715 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE*3UL,N) : N );
8716 size_t j( UPP ? i : 0UL );
8718 for( ; (j+2UL) <= jend; j+=2UL )
8720 const size_t kbegin( ( IsLower_v<MT5> )
8721 ?( ( IsUpper_v<MT4> )
8722 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8723 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8724 :( IsUpper_v<MT4> ? i : 0UL ) );
8725 const size_t kend( ( IsUpper_v<MT5> )
8726 ?( ( IsLower_v<MT4> )
8727 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8728 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8729 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
// xmm1..xmm3 are stored to column j, xmm4..xmm6 to column j+1.
8731 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8733 for(
size_t k=kbegin; k<kend; ++k ) {
8734 const SIMDType a1( A.load(i ,k) );
8735 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8736 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8737 const SIMDType b1(
set( B(k,j ) ) );
8738 const SIMDType b2(
set( B(k,j+1UL) ) );
8747 C.store( i , j , xmm1 * factor );
8748 C.store( i+
SIMDSIZE , j , xmm2 * factor );
8749 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
8750 C.store( i , j+1UL, xmm4 * factor );
8751 C.store( i+
SIMDSIZE , j+1UL, xmm5 * factor );
8752 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 * factor );
// Single-column variant for three SIMD vectors of rows.
8757 const size_t kbegin( ( IsLower_v<MT5> )
8758 ?( ( IsUpper_v<MT4> )
8759 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8760 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8761 :( IsUpper_v<MT4> ? i : 0UL ) );
8762 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
8764 SIMDType xmm1, xmm2, xmm3;
8766 for(
size_t k=kbegin; k<kend; ++k ) {
8767 const SIMDType b1(
set( B(k,j) ) );
8768 xmm1 += A.load(i ,k) * b1;
8769 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8770 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8773 C.store( i , j, xmm1 * factor );
8774 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8775 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
// Section working on two SIMD vectors of rows (loop header not visible). Columns are
// consumed four, then three, then two at a time, followed by a single-column variant.
8781 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
8782 size_t j( UPP ? i : 0UL );
8784 for( ; (j+4UL) <= jend; j+=4UL )
8786 const size_t kbegin( ( IsLower_v<MT5> )
8787 ?( ( IsUpper_v<MT4> )
8788 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8789 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8790 :( IsUpper_v<MT4> ? i : 0UL ) );
8791 const size_t kend( ( IsUpper_v<MT5> )
8792 ?( ( IsLower_v<MT4> )
8793 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
8794 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
8795 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Two accumulators per column: (xmm1,xmm2) -> j, (xmm3,xmm4) -> j+1, (xmm5,xmm6) -> j+2,
// (xmm7,xmm8) -> j+3.
8797 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8799 for(
size_t k=kbegin; k<kend; ++k ) {
8800 const SIMDType a1( A.load(i ,k) );
8801 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8802 const SIMDType b1(
set( B(k,j ) ) );
8803 const SIMDType b2(
set( B(k,j+1UL) ) );
8804 const SIMDType b3(
set( B(k,j+2UL) ) );
8805 const SIMDType b4(
set( B(k,j+3UL) ) );
8816 C.store( i , j , xmm1 * factor );
8817 C.store( i+
SIMDSIZE, j , xmm2 * factor );
8818 C.store( i , j+1UL, xmm3 * factor );
8819 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
8820 C.store( i , j+2UL, xmm5 * factor );
8821 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
8822 C.store( i , j+3UL, xmm7 * factor );
8823 C.store( i+
SIMDSIZE, j+3UL, xmm8 * factor );
// Three columns at a time.
8826 for( ; (j+3UL) <= jend; j+=3UL )
8828 const size_t kbegin( ( IsLower_v<MT5> )
8829 ?( ( IsUpper_v<MT4> )
8830 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8831 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8832 :( IsUpper_v<MT4> ? i : 0UL ) );
8833 const size_t kend( ( IsUpper_v<MT5> )
8834 ?( ( IsLower_v<MT4> )
8835 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8836 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8837 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8839 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8841 for(
size_t k=kbegin; k<kend; ++k ) {
8842 const SIMDType a1( A.load(i ,k) );
8843 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8844 const SIMDType b1(
set( B(k,j ) ) );
8845 const SIMDType b2(
set( B(k,j+1UL) ) );
8846 const SIMDType b3(
set( B(k,j+2UL) ) );
8855 C.store( i , j , xmm1 * factor );
8856 C.store( i+
SIMDSIZE, j , xmm2 * factor );
8857 C.store( i , j+1UL, xmm3 * factor );
8858 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
8859 C.store( i , j+2UL, xmm5 * factor );
8860 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
// Two columns at a time; here the k loop itself is unrolled by two as well.
8863 for( ; (j+2UL) <= jend; j+=2UL )
8865 const size_t kbegin( ( IsLower_v<MT5> )
8866 ?( ( IsUpper_v<MT4> )
8867 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8868 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8869 :( IsUpper_v<MT4> ? i : 0UL ) );
8870 const size_t kend( ( IsUpper_v<MT5> )
8871 ?( ( IsLower_v<MT4> )
8872 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8873 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8874 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Two accumulator sets that are summed pairwise at the store below (xmm1+xmm5, ...).
8876 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Two k steps per iteration.
8879 for( ; (k+2UL) <= kend; k+=2UL ) {
8880 const SIMDType a1( A.load(i ,k ) );
8881 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
8882 const SIMDType a3( A.load(i ,k+1UL) );
8883 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
8884 const SIMDType b1(
set( B(k ,j ) ) );
8885 const SIMDType b2(
set( B(k ,j+1UL) ) );
8886 const SIMDType b3(
set( B(k+1UL,j ) ) );
8887 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remaining k steps, one at a time.
8898 for( ; k<kend; ++k ) {
8899 const SIMDType a1( A.load(i ,k) );
8900 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8901 const SIMDType b1(
set( B(k,j ) ) );
8902 const SIMDType b2(
set( B(k,j+1UL) ) );
8909 C.store( i , j , (xmm1+xmm5) * factor );
8910 C.store( i+
SIMDSIZE, j , (xmm2+xmm6) * factor );
8911 C.store( i , j+1UL, (xmm3+xmm7) * factor );
8912 C.store( i+
SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
// Single-column variant for two SIMD vectors of rows, k unrolled by two.
8917 const size_t kbegin( ( IsLower_v<MT5> )
8918 ?( ( IsUpper_v<MT4> )
8919 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8920 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8921 :( IsUpper_v<MT4> ? i : 0UL ) );
8922 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
8924 SIMDType xmm1, xmm2, xmm3, xmm4;
8927 for( ; (k+2UL) <= kend; k+=2UL ) {
8928 const SIMDType b1(
set( B(k ,j) ) );
8929 const SIMDType b2(
set( B(k+1UL,j) ) );
8930 xmm1 += A.load(i ,k ) * b1;
8931 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
8932 xmm3 += A.load(i ,k+1UL) * b2;
8933 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
8936 for( ; k<kend; ++k ) {
8937 const SIMDType b1(
set( B(k,j) ) );
8938 xmm1 += A.load(i ,k) * b1;
8942 C.store( i , j, (xmm1+xmm3) * factor );
8943 C.store( i+
SIMDSIZE, j, (xmm2+xmm4) * factor );
// Section working on a single SIMD vector of rows (loop header not visible). Columns are
// consumed four, three, then two at a time, followed by a single-column variant.
8949 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE,N) : N );
8950 size_t j( UPP ? i : 0UL );
8952 for( ; (j+4UL) <= jend; j+=4UL )
8954 const size_t kbegin( ( IsLower_v<MT5> )
8955 ?( ( IsUpper_v<MT4> )
8956 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8957 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8958 :( IsUpper_v<MT4> ? i : 0UL ) );
8959 const size_t kend( ( IsUpper_v<MT5> )
8960 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
// xmm1..xmm4 accumulate the even k step, xmm5..xmm8 the odd one; they are summed at the store.
8963 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8966 for( ; (k+2UL) <= kend; k+=2UL ) {
8967 const SIMDType a1( A.load(i,k ) );
8968 const SIMDType a2( A.load(i,k+1UL) );
8969 xmm1 += a1 *
set( B(k ,j ) );
8970 xmm2 += a1 *
set( B(k ,j+1UL) );
8971 xmm3 += a1 *
set( B(k ,j+2UL) );
8972 xmm4 += a1 *
set( B(k ,j+3UL) );
8973 xmm5 += a2 *
set( B(k+1UL,j ) );
8974 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8975 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8976 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
8979 for( ; k<kend; ++k ) {
8980 const SIMDType a1( A.load(i,k) );
8981 xmm1 += a1 *
set( B(k,j ) );
8982 xmm2 += a1 *
set( B(k,j+1UL) );
8983 xmm3 += a1 *
set( B(k,j+2UL) );
8984 xmm4 += a1 *
set( B(k,j+3UL) );
8987 C.store( i, j , (xmm1+xmm5) * factor );
8988 C.store( i, j+1UL, (xmm2+xmm6) * factor );
8989 C.store( i, j+2UL, (xmm3+xmm7) * factor );
8990 C.store( i, j+3UL, (xmm4+xmm8) * factor );
// Three columns at a time.
8993 for( ; (j+3UL) <= jend; j+=3UL )
8995 const size_t kbegin( ( IsLower_v<MT5> )
8996 ?( ( IsUpper_v<MT4> )
8997 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8998 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8999 :( IsUpper_v<MT4> ? i : 0UL ) );
9000 const size_t kend( ( IsUpper_v<MT5> )
9001 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
9004 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9007 for( ; (k+2UL) <= kend; k+=2UL ) {
9008 const SIMDType a1( A.load(i,k ) );
9009 const SIMDType a2( A.load(i,k+1UL) );
9010 xmm1 += a1 *
set( B(k ,j ) );
9011 xmm2 += a1 *
set( B(k ,j+1UL) );
9012 xmm3 += a1 *
set( B(k ,j+2UL) );
9013 xmm4 += a2 *
set( B(k+1UL,j ) );
9014 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
9015 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
9018 for( ; k<kend; ++k ) {
9019 const SIMDType a1( A.load(i,k) );
9020 xmm1 += a1 *
set( B(k,j ) );
9021 xmm2 += a1 *
set( B(k,j+1UL) );
9022 xmm3 += a1 *
set( B(k,j+2UL) );
9025 C.store( i, j , (xmm1+xmm4) * factor );
9026 C.store( i, j+1UL, (xmm2+xmm5) * factor );
9027 C.store( i, j+2UL, (xmm3+xmm6) * factor );
// Two columns at a time.
9030 for( ; (j+2UL) <= jend; j+=2UL )
9032 const size_t kbegin( ( IsLower_v<MT5> )
9033 ?( ( IsUpper_v<MT4> )
9034 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9035 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9036 :( IsUpper_v<MT4> ? i : 0UL ) );
9037 const size_t kend( ( IsUpper_v<MT5> )
9038 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9041 SIMDType xmm1, xmm2, xmm3, xmm4;
// NOTE(review): unlike the four- and three-column cases above, the single-step k loop comes
// BEFORE the two-step loop here. If both loops share one k (its declaration is not visible in
// this excerpt), k already equals kend when the two-step loop is reached, so that loop never
// iterates and xmm3/xmm4 are never updated. The stored result would still be the full sum, but
// the unrolling would be dead code - confirm and consider swapping the two loops.
9044 for( ; k<kend; ++k ) {
9045 const SIMDType a1( A.load(i,k) );
9046 xmm1 += a1 *
set( B(k,j ) );
9047 xmm2 += a1 *
set( B(k,j+1UL) );
9050 for( ; (k+2UL) <= kend; k+=2UL ) {
9051 const SIMDType a1( A.load(i,k ) );
9052 const SIMDType a2( A.load(i,k+1UL) );
9053 xmm1 += a1 *
set( B(k ,j ) );
9054 xmm2 += a1 *
set( B(k ,j+1UL) );
9055 xmm3 += a2 *
set( B(k+1UL,j ) );
9056 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
9059 C.store( i, j , (xmm1+xmm3) * factor );
9060 C.store( i, j+1UL, (xmm2+xmm4) * factor );
// Single-column variant for one SIMD vector of rows; the k loop runs up to K, unrolled by two.
9065 const size_t kbegin( ( IsLower_v<MT5> )
9066 ?( ( IsUpper_v<MT4> )
9067 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9068 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9069 :( IsUpper_v<MT4> ? i : 0UL ) );
9071 SIMDType xmm1, xmm2;
9074 for( ; (k+2UL) <= K; k+=2UL ) {
9075 xmm1 += A.load(i,k ) *
set( B(k ,j) );
9076 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
// Statement handling a leftover k step (its guard is not visible in this excerpt).
9080 xmm1 += A.load(i,k) *
set( B(k,j) );
9083 C.store( i, j, (xmm1+xmm2) * factor );
// Scalar remainder: rows not covered by the SIMD panels are computed element by element.
// Only active when 'remainder' is true.
9087 for( ; remainder && i<M; ++i )
// Results flagged both LOW and UPP start at column i; all others start at column 0.
9089 size_t j( LOW && UPP ? i : 0UL );
// Two columns per iteration.
9091 for( ; (j+2UL) <= N; j+=2UL )
9093 const size_t kbegin( ( IsLower_v<MT5> )
9094 ?( ( IsUpper_v<MT4> )
9095 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9096 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9097 :( IsUpper_v<MT4> ? i : 0UL ) );
9098 const size_t kend( ( IsUpper_v<MT5> )
9099 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9105 for(
size_t k=kbegin; k<kend; ++k ) {
9106 value1 += A(i,k) * B(k,j );
9107 value2 += A(i,k) * B(k,j+1UL);
9110 C(i,j ) = value1 * scalar;
9111 C(i,j+1UL) = value2 * scalar;
// Single-column variant of the scalar remainder (its guard is not visible in this excerpt).
9116 const size_t kbegin( ( IsLower_v<MT5> )
9117 ?( ( IsUpper_v<MT4> )
9118 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9119 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9120 :( IsUpper_v<MT4> ? i : 0UL ) );
9124 for(
size_t k=kbegin; k<K; ++k ) {
9125 value += A(i,k) * B(k,j);
9128 C(i,j) = value * scalar;
// Post-processing for structured results with more than four SIMD vectors of rows.
// SYM/HERM: elements C(i,j) for columns j >= SIMDSIZE*4 and rows i < iend are copied from the
// mirrored position C(j,i), conjugated for HERM. The definition of iend is not visible here.
9133 if( ( SYM || HERM ) && ( M >
SIMDSIZE*4UL ) ) {
9134 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
9136 for(
size_t i=0UL; i<iend; ++i ) {
9137 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// LOW-only results: visits rows i < iend of columns j >= SIMDSIZE*4 (loop body not visible).
9141 else if( LOW && !UPP && M >
SIMDSIZE*4UL ) {
9142 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
9144 for(
size_t i=0UL; i<iend; ++i ) {
// UPP-only results: visits columns j < jend of rows i >= SIMDSIZE*4 (loop body not visible).
9149 else if( !LOW && UPP && M >
SIMDSIZE*4UL ) {
9150 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
9152 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel, fallback overload.
// Selected (via DisableIf_t) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It forwards C, A, B and scalar unchanged to selectDefaultAssignKernel().
9174 template<
typename MT3
9178 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9179 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9181 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, vectorized overload.
// Selected (via EnableIf_t) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// Dispatches to one of the smmm/hmmm/lmmm/ummm/mmm routines.
// NOTE(review): the conditions choosing between the five calls are not visible in this excerpt;
// presumably they test SYM, HERM, LOW and UPP in that order - confirm against the full source.
9200 template<
typename MT3
9204 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9205 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9208 smmm( C, A, B, scalar );
9210 hmmm( C, A, B, scalar );
// The lmmm/ummm/mmm variants take an additional ST2(0) argument after the scalar.
9212 lmmm( C, A, B, scalar, ST2(0) );
9214 ummm( C, A, B, scalar, ST2(0) );
9216 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel, fallback overload.
// Selected (via DisableIf_t) when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false.
// It forwards C, A, B and scalar unchanged to selectLargeAssignKernel().
9234 template<
typename MT3
9238 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9239 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9241 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assignment kernel, BLAS-backed overload.
// Only compiled when BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION, and
// selected (via EnableIf_t) when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// A triangular left operand uses trmm from the left, a triangular right operand uses trmm
// from the right; the remaining call is gemm with ET(scalar) and ET(0).
// NOTE(review): the statements that prepare C before each trmm call are not visible in this
// excerpt.
9246 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 9260 template<
typename MT3
9264 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9265 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target before it is passed to BLAS.
9267 using ET = ElementType_t<MT3>;
9269 if( IsTriangular_v<MT4> ) {
// Left-side triangular multiply with A; the lower/upper flag follows IsLower_v<MT4>.
9271 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
9273 else if( IsTriangular_v<MT5> ) {
// Right-side triangular multiply with B; the lower/upper flag follows IsLower_v<MT5>.
9275 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General multiply with factors ET(scalar) and ET(0).
9278 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment through an evaluated temporary.
// NOTE(review): the function signature is not visible in this excerpt; judging from the body it
// is an assign() overload taking 'lhs' and 'rhs' - confirm against the full source.
9296 template<
typename MT
// Type of the temporary: chosen between ResultType and OppositeType depending on SO.
9302 using TmpType = If_t< SO, ResultType, OppositeType >;
9314 const ForwardFunctor fwd;
// The expression is evaluated via serial() into the temporary, which is then passed through
// the forward functor and assigned to the target.
9316 const TmpType tmp(
serial( rhs ) );
9317 assign( ~lhs, fwd( tmp ) );
// Addition assignment of a scaled matrix product to a dense matrix.
// lhs: target dense matrix; rhs: the scaled multiplication expression (DMatScalarMultExpr).
// NOTE(review): several original lines (assertions, the evaluation of the operands into A and
// B, the body of the early-exit branch) are not visible in this excerpt.
9333 template<
typename MT
9335 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch both operands of the wrapped matrix product.
9342 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9343 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns, or the inner dimension is zero.
9345 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection with the operands A, B and the stored scalar.
9359 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled addition assignment C += scalar * ( A * B ).
// The small kernel is chosen when any of the following holds:
//   - both operands are diagonal,
//   - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - C has fewer than TDMATDMATMULT_THRESHOLD elements.
// The other visible call is the BLAS kernel selection (presumably the else branch; the
// 'else' line itself is not visible in this excerpt).
9374 template<
typename MT3
9378 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9380 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
9381 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
9382 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
9383 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9384 selectSmallAddAssignKernel( C, A, B, scalar );
9386 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition-assignment kernel for the general case.
// Selected (via EnableIf_t) when neither A (MT4) nor B (MT5) is diagonal.
// The result is added to C from a temporary 'tmp'.
// NOTE(review): the line creating 'tmp' is not visible in this excerpt; presumably it holds the
// evaluated scaled product - confirm against the full source.
9404 template<
typename MT3
9408 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9409 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9412 addAssign( C, tmp );
// Default addition-assignment kernel: row-major target, general A, diagonal B.
// Selected (via EnableIf_t) when MT3 is row-major, MT4 is not diagonal and MT5 is diagonal.
// With a diagonal B only B(j,j) contributes to column j, so each element is updated as
// C(i,j) += A(i,j) * B(j,j) * scalar.
9430 template<
typename MT3
9434 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9435 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// The iteration space is tiled into block x block tiles.
9437 constexpr
size_t block( BLOCK_SIZE );
9439 const size_t M( A.rows() );
9440 const size_t N( B.columns() );
// Outer loops walk the tiles; iend/jend clamp the last tile to the matrix bounds.
9442 for(
size_t ii=0UL; ii<M; ii+=block ) {
9443 const size_t iend(
min( M, ii+block ) );
9444 for(
size_t jj=0UL; jj<N; jj+=block ) {
9445 const size_t jend(
min( N, jj+block ) );
9446 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the tile, narrowed by the triangular structure flags of A
// (the non-triangular alternatives of both expressions are not visible in this excerpt).
9448 const size_t jbegin( ( IsUpper_v<MT4> )
9449 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
9451 const size_t jpos( ( IsLower_v<MT4> )
9452 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
9455 for(
size_t j=jbegin; j<jpos; ++j ) {
9456 C(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition-assignment kernel: column-major target, general A, diagonal B.
// Selected (via EnableIf_t) when MT3 is column-major, MT4 is not diagonal and MT5 is diagonal.
// Each element is updated as C(i,j) += A(i,j) * B(j,j) * scalar, column by column.
9478 template<
typename MT3
9482 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9483 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9485 const size_t M( A.rows() );
9486 const size_t N( B.columns() );
9488 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure flags of A (the
// non-triangular alternatives of both expressions are not visible in this excerpt).
9490 const size_t ibegin( ( IsLower_v<MT4> )
9491 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
9493 const size_t iend( ( IsUpper_v<MT4> )
9494 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// The row loop is unrolled by two: ipos is ibegin plus the row count with its lowest bit
// cleared, i.e. the end of the part that can be processed in pairs.
9498 const size_t inum( iend - ibegin );
9499 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
9501 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9502 C(i ,j) += A(i ,j) * B(j,j) * scalar;
9503 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the row at ipos (presumably the odd leftover row; its guard is not visible here).
9506 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition-assignment kernel: row-major target, diagonal A, general B.
// Selected (via EnableIf_t) when MT3 is row-major, MT4 is diagonal and MT5 is not diagonal.
// With a diagonal A only A(i,i) contributes to row i, so each element is updated as
// C(i,j) += A(i,i) * B(i,j) * scalar, row by row.
9526 template<
typename MT3
9530 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9531 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9533 const size_t M( A.rows() );
9534 const size_t N( B.columns() );
9536 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the triangular structure flags of B (the
// non-triangular alternatives of both expressions are not visible in this excerpt).
9538 const size_t jbegin( ( IsUpper_v<MT5> )
9539 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
9541 const size_t jend( ( IsLower_v<MT5> )
9542 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// The column loop is unrolled by two: jpos is jbegin plus the column count with its lowest
// bit cleared, i.e. the end of the part that can be processed in pairs.
9546 const size_t jnum( jend - jbegin );
9547 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
9549 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9550 C(i,j ) += A(i,i) * B(i,j ) * scalar;
9551 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the column at jpos (presumably the odd leftover column; its guard is not
// visible here).
9554 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition-assignment kernel: column-major target, diagonal A, general B.
// Selected (via EnableIf_t) when MT3 is column-major, MT4 is diagonal and MT5 is not diagonal.
// Each element is updated as C(i,j) += A(i,i) * B(i,j) * scalar.
9574 template<
typename MT3
9578 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9579 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// The iteration space is tiled into block x block tiles.
9581 constexpr
size_t block( BLOCK_SIZE );
9583 const size_t M( A.rows() );
9584 const size_t N( B.columns() );
// Outer loops walk the tiles (columns first); jend/iend clamp the last tile to the bounds.
9586 for(
size_t jj=0UL; jj<N; jj+=block ) {
9587 const size_t jend(
min( N, jj+block ) );
9588 for(
size_t ii=0UL; ii<M; ii+=block ) {
9589 const size_t iend(
min( M, ii+block ) );
9590 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the tile, narrowed by the triangular structure flags of B
// (the non-triangular alternatives of both expressions are not visible in this excerpt).
9592 const size_t ibegin( ( IsLower_v<MT5> )
9593 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
9595 const size_t ipos( ( IsUpper_v<MT5> )
9596 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
9599 for(
size_t i=ibegin; i<ipos; ++i ) {
9600 C(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition-assignment kernel: both operands diagonal.
// Selected (via EnableIf_t) when MT4 and MT5 are both diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
9622 template<
typename MT3
9626 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9627 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9629 for(
size_t i=0UL; i<A.rows(); ++i ) {
9630 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition-assignment kernel, fallback overload.
// Selected (via DisableIf_t) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It forwards C, A, B and scalar unchanged to selectDefaultAddAssignKernel().
9649 template<
typename MT3
9653 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9654 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9656 selectDefaultAddAssignKernel( C, A, B, scalar );
9675 template<
typename MT3
9679 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9680 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9682 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
9684 const size_t M( A.rows() );
9685 const size_t N( B.columns() );
9686 const size_t K( A.columns() );
9690 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
9693 const SIMDType factor(
set( scalar ) );
9697 if( IsIntegral_v<ElementType> )
9700 for(
size_t i=0UL; i<M; ++i )
9702 const size_t kbegin( ( IsUpper_v<MT4> )
9703 ?( ( IsLower_v<MT5> )
9704 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9705 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9706 :( IsLower_v<MT5> ? j : 0UL ) );
9707 const size_t kend( ( IsLower_v<MT4> )
9708 ?( ( IsUpper_v<MT5> )
9709 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
9710 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
9711 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
9713 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9715 for(
size_t k=kbegin; k<kend; ++k ) {
9716 const SIMDType a1(
set( A(i,k) ) );
9717 xmm1 += a1 * B.load(k,j );
9718 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
9719 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
9720 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
9721 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
9722 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
9723 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
9724 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
9727 C.store( i, j , C.load(i,j ) + xmm1 * factor );
9743 for( ; (i+2UL) <= M; i+=2UL )
9745 const size_t kbegin( ( IsUpper_v<MT4> )
9746 ?( ( IsLower_v<MT5> )
9747 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9748 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9749 :( IsLower_v<MT5> ? j : 0UL ) );
9750 const size_t kend( ( IsLower_v<MT4> )
9751 ?( ( IsUpper_v<MT5> )
9752 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
9753 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
9754 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
9756 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
9758 for(
size_t k=kbegin; k<kend; ++k ) {
9759 const SIMDType a1(
set( A(i ,k) ) );
9760 const SIMDType a2(
set( A(i+1UL,k) ) );
9761 const SIMDType b1( B.load(k,j ) );
9762 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
9763 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
9764 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
9765 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
9778 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9783 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
9785 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm8 * factor );
9786 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm9 * factor );
9787 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) + xmm10 * factor );
9792 const size_t kbegin( ( IsUpper_v<MT4> )
9793 ?( ( IsLower_v<MT5> )
9794 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9795 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9796 :( IsLower_v<MT5> ? j : 0UL ) );
9797 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
9799 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
9801 for(
size_t k=kbegin; k<kend; ++k ) {
9802 const SIMDType a1(
set( A(i,k) ) );
9803 xmm1 += a1 * B.load(k,j );
9804 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
9805 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
9806 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
9807 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
9810 C.store( i, j , C.load(i,j ) + xmm1 * factor );
9822 for( ; (i+2UL) <= M; i+=2UL )
9824 const size_t kbegin( ( IsUpper_v<MT4> )
9825 ?( ( IsLower_v<MT5> )
9826 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9827 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9828 :( IsLower_v<MT5> ? j : 0UL ) );
9829 const size_t kend( ( IsLower_v<MT4> )
9830 ?( ( IsUpper_v<MT5> )
9831 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
9832 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
9833 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
9835 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9837 for(
size_t k=kbegin; k<kend; ++k ) {
9838 const SIMDType a1(
set( A(i ,k) ) );
9839 const SIMDType a2(
set( A(i+1UL,k) ) );
9840 const SIMDType b1( B.load(k,j ) );
9841 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
9842 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
9843 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
9854 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9858 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
9860 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm7 * factor );
9861 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm8 * factor );
9866 const size_t kbegin( ( IsUpper_v<MT4> )
9867 ?( ( IsLower_v<MT5> )
9868 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9869 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9870 :( IsLower_v<MT5> ? j : 0UL ) );
9871 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
9873 SIMDType xmm1, xmm2, xmm3, xmm4;
9875 for(
size_t k=kbegin; k<kend; ++k ) {
9876 const SIMDType a1(
set( A(i,k) ) );
9877 xmm1 += a1 * B.load(k,j );
9878 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
9879 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
9880 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
9883 C.store( i, j , C.load(i,j ) + xmm1 * factor );
9894 for( ; (i+2UL) <= M; i+=2UL )
9896 const size_t kbegin( ( IsUpper_v<MT4> )
9897 ?( ( IsLower_v<MT5> )
9898 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9899 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9900 :( IsLower_v<MT5> ? j : 0UL ) );
9901 const size_t kend( ( IsLower_v<MT4> )
9902 ?( ( IsUpper_v<MT5> )
9903 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
9904 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
9905 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
9907 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9909 for(
size_t k=kbegin; k<kend; ++k ) {
9910 const SIMDType a1(
set( A(i ,k) ) );
9911 const SIMDType a2(
set( A(i+1UL,k) ) );
9912 const SIMDType b1( B.load(k,j ) );
9913 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
9914 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
9923 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9926 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
9928 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm6 * factor );
9933 const size_t kbegin( ( IsUpper_v<MT4> )
9934 ?( ( IsLower_v<MT5> )
9935 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9936 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9937 :( IsLower_v<MT5> ? j : 0UL ) );
9938 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
9940 SIMDType xmm1, xmm2, xmm3;
9942 for(
size_t k=kbegin; k<kend; ++k ) {
9943 const SIMDType a1(
set( A(i,k) ) );
9944 xmm1 += a1 * B.load(k,j );
9945 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
9946 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
9949 C.store( i, j , C.load(i,j ) + xmm1 * factor );
9957 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
9958 size_t i( LOW ? j : 0UL );
9960 for( ; (i+4UL) <= iend; i+=4UL )
9962 const size_t kbegin( ( IsUpper_v<MT4> )
9963 ?( ( IsLower_v<MT5> )
9964 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9965 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9966 :( IsLower_v<MT5> ? j : 0UL ) );
9967 const size_t kend( ( IsLower_v<MT4> )
9968 ?( ( IsUpper_v<MT5> )
9969 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
9970 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
9971 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
9973 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9975 for(
size_t k=kbegin; k<kend; ++k ) {
9976 const SIMDType a1(
set( A(i ,k) ) );
9977 const SIMDType a2(
set( A(i+1UL,k) ) );
9978 const SIMDType a3(
set( A(i+2UL,k) ) );
9979 const SIMDType a4(
set( A(i+3UL,k) ) );
9980 const SIMDType b1( B.load(k,j ) );
9981 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
9992 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
9994 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
9996 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
9998 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
10002 for( ; (i+3UL) <= iend; i+=3UL )
10004 const size_t kbegin( ( IsUpper_v<MT4> )
10005 ?( ( IsLower_v<MT5> )
10006 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10007 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10008 :( IsLower_v<MT5> ? j : 0UL ) );
10009 const size_t kend( ( IsLower_v<MT4> )
10010 ?( ( IsUpper_v<MT5> )
10011 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
10012 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
10013 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
10015 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10017 for(
size_t k=kbegin; k<kend; ++k ) {
10018 const SIMDType a1(
set( A(i ,k) ) );
10019 const SIMDType a2(
set( A(i+1UL,k) ) );
10020 const SIMDType a3(
set( A(i+2UL,k) ) );
10021 const SIMDType b1( B.load(k,j ) );
10022 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
10031 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10033 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
10035 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
10039 for( ; (i+2UL) <= iend; i+=2UL )
10041 const size_t kbegin( ( IsUpper_v<MT4> )
10042 ?( ( IsLower_v<MT5> )
10043 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10044 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10045 :( IsLower_v<MT5> ? j : 0UL ) );
10046 const size_t kend( ( IsLower_v<MT4> )
10047 ?( ( IsUpper_v<MT5> )
10048 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
10049 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10050 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
10052 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10053 size_t k( kbegin );
10055 for( ; (k+2UL) <= kend; k+=2UL ) {
10056 const SIMDType a1(
set( A(i ,k ) ) );
10057 const SIMDType a2(
set( A(i+1UL,k ) ) );
10058 const SIMDType a3(
set( A(i ,k+1UL) ) );
10059 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
10060 const SIMDType b1( B.load(k ,j ) );
10061 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
10062 const SIMDType b3( B.load(k+1UL,j ) );
10063 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
10074 for( ; k<kend; ++k ) {
10075 const SIMDType a1(
set( A(i ,k) ) );
10076 const SIMDType a2(
set( A(i+1UL,k) ) );
10077 const SIMDType b1( B.load(k,j ) );
10078 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
10085 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
10087 C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
10088 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) + (xmm4+xmm8) * factor );
10093 const size_t kbegin( ( IsUpper_v<MT4> )
10094 ?( ( IsLower_v<MT5> )
10095 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10096 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10097 :( IsLower_v<MT5> ? j : 0UL ) );
10098 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
10100 SIMDType xmm1, xmm2, xmm3, xmm4;
10101 size_t k( kbegin );
10103 for( ; (k+2UL) <= kend; k+=2UL ) {
10104 const SIMDType a1(
set( A(i,k ) ) );
10105 const SIMDType a2(
set( A(i,k+1UL) ) );
10106 xmm1 += a1 * B.load(k ,j );
10107 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
10108 xmm3 += a2 * B.load(k+1UL,j );
10109 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
10112 for( ; k<kend; ++k ) {
10113 const SIMDType a1(
set( A(i,k) ) );
10114 xmm1 += a1 * B.load(k,j );
10115 xmm2 += a1 * B.load(k,j+
SIMDSIZE);
10118 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
10125 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
10126 size_t i( LOW ? j : 0UL );
10128 for( ; (i+4UL) <= iend; i+=4UL )
10130 const size_t kbegin( ( IsUpper_v<MT4> )
10131 ?( ( IsLower_v<MT5> )
10132 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10133 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10134 :( IsLower_v<MT5> ? j : 0UL ) );
10135 const size_t kend( ( IsLower_v<MT4> )
10136 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
10139 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10140 size_t k( kbegin );
10142 for( ; (k+2UL) <= kend; k+=2UL ) {
10143 const SIMDType b1( B.load(k ,j) );
10144 const SIMDType b2( B.load(k+1UL,j) );
10145 xmm1 +=
set( A(i ,k ) ) * b1;
10146 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10147 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10148 xmm4 +=
set( A(i+3UL,k ) ) * b1;
10149 xmm5 +=
set( A(i ,k+1UL) ) * b2;
10150 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
10151 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
10152 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
10155 for( ; k<kend; ++k ) {
10156 const SIMDType b1( B.load(k,j) );
10157 xmm1 +=
set( A(i ,k) ) * b1;
10158 xmm2 +=
set( A(i+1UL,k) ) * b1;
10159 xmm3 +=
set( A(i+2UL,k) ) * b1;
10160 xmm4 +=
set( A(i+3UL,k) ) * b1;
10163 C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
10164 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
10165 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
10166 C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
10169 for( ; (i+3UL) <= iend; i+=3UL )
10171 const size_t kbegin( ( IsUpper_v<MT4> )
10172 ?( ( IsLower_v<MT5> )
10173 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10174 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10175 :( IsLower_v<MT5> ? j : 0UL ) );
10176 const size_t kend( ( IsLower_v<MT4> )
10177 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
10180 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10181 size_t k( kbegin );
10183 for( ; (k+2UL) <= kend; k+=2UL ) {
10184 const SIMDType b1( B.load(k ,j) );
10185 const SIMDType b2( B.load(k+1UL,j) );
10186 xmm1 +=
set( A(i ,k ) ) * b1;
10187 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10188 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10189 xmm4 +=
set( A(i ,k+1UL) ) * b2;
10190 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
10191 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
10194 for( ; k<kend; ++k ) {
10195 const SIMDType b1( B.load(k,j) );
10196 xmm1 +=
set( A(i ,k) ) * b1;
10197 xmm2 +=
set( A(i+1UL,k) ) * b1;
10198 xmm3 +=
set( A(i+2UL,k) ) * b1;
10201 C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
10202 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
10203 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
10206 for( ; (i+2UL) <= iend; i+=2UL )
10208 const size_t kbegin( ( IsUpper_v<MT4> )
10209 ?( ( IsLower_v<MT5> )
10210 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10211 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10212 :( IsLower_v<MT5> ? j : 0UL ) );
10213 const size_t kend( ( IsLower_v<MT4> )
10214 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10217 SIMDType xmm1, xmm2, xmm3, xmm4;
10218 size_t k( kbegin );
10220 for( ; (k+2UL) <= kend; k+=2UL ) {
10221 const SIMDType b1( B.load(k ,j) );
10222 const SIMDType b2( B.load(k+1UL,j) );
10223 xmm1 +=
set( A(i ,k ) ) * b1;
10224 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10225 xmm3 +=
set( A(i ,k+1UL) ) * b2;
10226 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
10229 for( ; k<kend; ++k ) {
10230 const SIMDType b1( B.load(k,j) );
10231 xmm1 +=
set( A(i ,k) ) * b1;
10232 xmm2 +=
set( A(i+1UL,k) ) * b1;
10235 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
10236 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
10241 const size_t kbegin( ( IsUpper_v<MT4> )
10242 ?( ( IsLower_v<MT5> )
10243 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10244 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10245 :( IsLower_v<MT5> ? j : 0UL ) );
10247 SIMDType xmm1, xmm2;
10248 size_t k( kbegin );
10250 for( ; (k+2UL) <= K; k+=2UL ) {
10251 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
10252 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
10255 for( ; k<K; ++k ) {
10256 xmm1 +=
set( A(i,k) ) * B.load(k,j);
10259 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
10263 for( ; remainder && j<N; ++j )
10265 const size_t iend( UPP ? j+1UL : M );
10266 size_t i( LOW ? j : 0UL );
10268 for( ; (i+2UL) <= iend; i+=2UL )
10270 const size_t kbegin( ( IsUpper_v<MT4> )
10271 ?( ( IsLower_v<MT5> )
10272 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10273 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10274 :( IsLower_v<MT5> ? j : 0UL ) );
10275 const size_t kend( ( IsLower_v<MT4> )
10276 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10282 for(
size_t k=kbegin; k<kend; ++k ) {
10283 value1 += A(i ,k) * B(k,j);
10284 value2 += A(i+1UL,k) * B(k,j);
10287 C(i ,j) += value1 * scalar;
10288 C(i+1UL,j) += value2 * scalar;
10293 const size_t kbegin( ( IsUpper_v<MT4> )
10294 ?( ( IsLower_v<MT5> )
10295 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10296 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10297 :( IsLower_v<MT5> ? j : 0UL ) );
10301 for(
size_t k=kbegin; k<K; ++k ) {
10302 value += A(i,k) * B(k,j);
10305 C(i,j) += value * scalar;
// Vectorized "small matrix" kernel for the add-assignment C += scalar * ( A * B ). The trailing
// return type enables it only for a column-major target MT3 for which UseVectorizedDefaultKernel_v
// holds. A is read in SIMD vectors (A.load), single elements of B are broadcast (set), and the
// accumulated products are scaled by 'factor' and added onto the current content of C.
// NOTE(review): this excerpt omits a number of original lines (remaining template parameters,
// braces, the row-block loops over i, several accumulator updates and stores). The comments below
// describe only what is visible; confirm details against the complete file.
10326 template<
typename MT3
10330 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10331 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar clean-up pass is only needed if C or A is not padded.
10333 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
// Problem dimensions: C is M x N, the inner (reduction) dimension is K.
10335 const size_t M( A.rows() );
10336 const size_t N( B.columns() );
10337 const size_t K( A.columns() );
// Row limit for the SIMD passes: M with the low bits masked off when a remainder pass exists
// (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two -- TODO confirm).
10341 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Broadcast of the scalar, applied to every accumulated SIMD vector before it is added to C.
10344 const SIMDType factor(
set( scalar ) );
// Tile of 8 SIMD vectors (rows) x 1 column; guarded by an integral element type check.
10348 if( IsIntegral_v<ElementType> )
10351 for(
size_t j=0UL; j<N; ++j )
// k range for this tile, narrowed at compile time via the lower/upper traits of A (MT4) and B (MT5).
10353 const size_t kbegin( ( IsLower_v<MT5> )
10354 ?( ( IsUpper_v<MT4> )
10355 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10356 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10357 :( IsUpper_v<MT4> ? i : 0UL ) );
10358 const size_t kend( ( IsUpper_v<MT5> )
10359 ?( ( IsLower_v<MT4> )
10360 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
10361 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
10362 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
10364 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// One broadcast element of B per k, multiplied with eight consecutive SIMD vectors of column k of A.
10366 for(
size_t k=kbegin; k<kend; ++k ) {
10367 const SIMDType b1(
set( B(k,j) ) );
10368 xmm1 += A.load(i ,k) * b1;
10369 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10370 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10371 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
10372 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
10373 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
10374 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
10375 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Add the scaled accumulator onto C (the stores for xmm2..xmm8 are not visible in this excerpt).
10378 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 5 SIMD vectors (rows) x 2 columns.
10394 for( ; (j+2UL) <= N; j+=2UL )
10396 const size_t kbegin( ( IsLower_v<MT5> )
10397 ?( ( IsUpper_v<MT4> )
10398 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10399 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10400 :( IsUpper_v<MT4> ? i : 0UL ) );
10401 const size_t kend( ( IsUpper_v<MT5> )
10402 ?( ( IsLower_v<MT4> )
10403 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10404 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10405 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
10407 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10409 for(
size_t k=kbegin; k<kend; ++k ) {
10410 const SIMDType a1( A.load(i ,k) );
10411 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
10412 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
10413 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
10414 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
10415 const SIMDType b1(
set( B(k,j ) ) );
10416 const SIMDType b2(
set( B(k,j+1UL) ) );
// Visible stores: xmm1 belongs to column j, xmm6..xmm10 to column j+1.
10429 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10434 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
10436 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10437 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10438 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Single-column variant of the 5-vector tile (its guarding condition is not visible here).
10443 const size_t kbegin( ( IsLower_v<MT5> )
10444 ?( ( IsUpper_v<MT4> )
10445 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10446 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10447 :( IsUpper_v<MT4> ? i : 0UL ) );
10448 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
10450 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10452 for(
size_t k=kbegin; k<kend; ++k ) {
10453 const SIMDType b1(
set( B(k,j) ) );
10454 xmm1 += A.load(i ,k) * b1;
10455 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10456 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10457 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
10458 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
10461 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 4 SIMD vectors (rows) x 2 columns.
10473 for( ; (j+2UL) <= N; j+=2UL )
10475 const size_t kbegin( ( IsLower_v<MT5> )
10476 ?( ( IsUpper_v<MT4> )
10477 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10478 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10479 :( IsUpper_v<MT4> ? i : 0UL ) );
10480 const size_t kend( ( IsUpper_v<MT5> )
10481 ?( ( IsLower_v<MT4> )
10482 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10483 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10484 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
10486 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10488 for(
size_t k=kbegin; k<kend; ++k ) {
10489 const SIMDType a1( A.load(i ,k) );
10490 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
10491 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
10492 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
10493 const SIMDType b1(
set( B(k,j ) ) );
10494 const SIMDType b2(
set( B(k,j+1UL) ) );
// Visible stores: xmm1 belongs to column j, xmm5..xmm8 to column j+1.
10505 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10509 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
10511 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10512 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Single-column variant of the 4-vector tile (its guarding condition is not visible here).
10517 const size_t kbegin( ( IsLower_v<MT5> )
10518 ?( ( IsUpper_v<MT4> )
10519 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10520 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10521 :( IsUpper_v<MT4> ? i : 0UL ) );
10522 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
10524 SIMDType xmm1, xmm2, xmm3, xmm4;
10526 for(
size_t k=kbegin; k<kend; ++k ) {
10527 const SIMDType b1(
set( B(k,j) ) );
10528 xmm1 += A.load(i ,k) * b1;
10529 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10530 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10531 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
10534 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 3 SIMD vectors (rows) x 2 columns.
10545 for( ; (j+2UL) <= N; j+=2UL )
10547 const size_t kbegin( ( IsLower_v<MT5> )
10548 ?( ( IsUpper_v<MT4> )
10549 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10550 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10551 :( IsUpper_v<MT4> ? i : 0UL ) );
10552 const size_t kend( ( IsUpper_v<MT5> )
10553 ?( ( IsLower_v<MT4> )
10554 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10555 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10556 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
10558 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10560 for(
size_t k=kbegin; k<kend; ++k ) {
10561 const SIMDType a1( A.load(i ,k) );
10562 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
10563 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
10564 const SIMDType b1(
set( B(k,j ) ) );
10565 const SIMDType b2(
set( B(k,j+1UL) ) );
// Visible stores: xmm1 belongs to column j, xmm4..xmm6 to column j+1.
10574 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10577 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
10579 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Single-column variant of the 3-vector tile (its guarding condition is not visible here).
10584 const size_t kbegin( ( IsLower_v<MT5> )
10585 ?( ( IsUpper_v<MT4> )
10586 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10587 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10588 :( IsUpper_v<MT4> ? i : 0UL ) );
10589 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
10591 SIMDType xmm1, xmm2, xmm3;
10593 for(
size_t k=kbegin; k<kend; ++k ) {
10594 const SIMDType b1(
set( B(k,j) ) );
10595 xmm1 += A.load(i ,k) * b1;
10596 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10597 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10600 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Row blocks of 2 SIMD vectors: the column range is capped at i+SIMDSIZE*2 when LOW is set and
// starts at column i when UPP is set, so only part of C is updated for those result structures.
10608 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
10609 size_t j( UPP ? i : 0UL );
// Tile of 2 SIMD vectors (rows) x 4 columns.
10611 for( ; (j+4UL) <= jend; j+=4UL )
10613 const size_t kbegin( ( IsLower_v<MT5> )
10614 ?( ( IsUpper_v<MT4> )
10615 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10616 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10617 :( IsUpper_v<MT4> ? i : 0UL ) );
10618 const size_t kend( ( IsUpper_v<MT5> )
10619 ?( ( IsLower_v<MT4> )
10620 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
10621 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
10622 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
10624 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10626 for(
size_t k=kbegin; k<kend; ++k ) {
10627 const SIMDType a1( A.load(i ,k) );
10628 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
10629 const SIMDType b1(
set( B(k,j ) ) );
10630 const SIMDType b2(
set( B(k,j+1UL) ) );
10631 const SIMDType b3(
set( B(k,j+2UL) ) );
10632 const SIMDType b4(
set( B(k,j+3UL) ) );
// Odd-numbered accumulators hold the first SIMD vector of columns j..j+3.
10643 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10645 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
10647 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
10649 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
// Tile of 2 SIMD vectors (rows) x 3 columns.
10653 for( ; (j+3UL) <= jend; j+=3UL )
10655 const size_t kbegin( ( IsLower_v<MT5> )
10656 ?( ( IsUpper_v<MT4> )
10657 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10658 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10659 :( IsUpper_v<MT4> ? i : 0UL ) );
10660 const size_t kend( ( IsUpper_v<MT5> )
10661 ?( ( IsLower_v<MT4> )
10662 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
10663 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
10664 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// NOTE(review): eight accumulators are declared although only three columns are processed here;
// whether xmm7/xmm8 are used cannot be determined from this excerpt.
10666 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10668 for(
size_t k=kbegin; k<kend; ++k ) {
10669 const SIMDType a1( A.load(i ,k) );
10670 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
10671 const SIMDType b1(
set( B(k,j ) ) );
10672 const SIMDType b2(
set( B(k,j+1UL) ) );
10673 const SIMDType b3(
set( B(k,j+2UL) ) );
10682 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10684 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
10686 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
// Tile of 2 SIMD vectors (rows) x 2 columns; the k loop is additionally unrolled by two.
10690 for( ; (j+2UL) <= jend; j+=2UL )
10692 const size_t kbegin( ( IsLower_v<MT5> )
10693 ?( ( IsUpper_v<MT4> )
10694 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10695 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10696 :( IsUpper_v<MT4> ? i : 0UL ) );
10697 const size_t kend( ( IsUpper_v<MT5> )
10698 ?( ( IsLower_v<MT4> )
10699 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10700 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10701 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
10703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10704 size_t k( kbegin );
// Two k steps per iteration: operands for k and for k+1 are loaded separately.
10706 for( ; (k+2UL) <= kend; k+=2UL ) {
10707 const SIMDType a1( A.load(i ,k ) );
10708 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
10709 const SIMDType a3( A.load(i ,k+1UL) );
10710 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
10711 const SIMDType b1(
set( B(k ,j ) ) );
10712 const SIMDType b2(
set( B(k ,j+1UL) ) );
10713 const SIMDType b3(
set( B(k+1UL,j ) ) );
10714 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Clean-up for a possibly remaining single k step.
10725 for( ; k<kend; ++k ) {
10726 const SIMDType a1( A.load(i ,k) );
10727 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
10728 const SIMDType b1(
set( B(k,j ) ) );
10729 const SIMDType b2(
set( B(k,j+1UL) ) );
// The two partial sums per target vector (xmmN and xmmN+4) are combined before scaling.
10736 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
10738 C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
10739 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// Single-column variant of the 2-vector tile (its guarding condition is not visible here).
10744 const size_t kbegin( ( IsLower_v<MT5> )
10745 ?( ( IsUpper_v<MT4> )
10746 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10747 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10748 :( IsUpper_v<MT4> ? i : 0UL ) );
10749 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
10751 SIMDType xmm1, xmm2, xmm3, xmm4;
10752 size_t k( kbegin );
10754 for( ; (k+2UL) <= kend; k+=2UL ) {
10755 const SIMDType b1(
set( B(k ,j) ) );
10756 const SIMDType b2(
set( B(k+1UL,j) ) );
10757 xmm1 += A.load(i ,k ) * b1;
10758 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
10759 xmm3 += A.load(i ,k+1UL) * b2;
10760 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
10763 for( ; k<kend; ++k ) {
10764 const SIMDType b1(
set( B(k,j) ) );
10765 xmm1 += A.load(i ,k) * b1;
10766 xmm2 += A.load(i+
SIMDSIZE,k) * b1;
10769 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
// Row blocks of a single SIMD vector: the column range is capped at i+SIMDSIZE only if both LOW
// and UPP are set, and starts at column i when UPP is set.
10776 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
10777 size_t j( UPP ? i : 0UL );
// Tile of 1 SIMD vector x 4 columns, k unrolled by two.
10779 for( ; (j+4UL) <= jend; j+=4UL )
10781 const size_t kbegin( ( IsLower_v<MT5> )
10782 ?( ( IsUpper_v<MT4> )
10783 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10784 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10785 :( IsUpper_v<MT4> ? i : 0UL ) );
// (the alternative branch of this conditional is not visible in this excerpt)
10786 const size_t kend( ( IsUpper_v<MT5> )
10787 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
10790 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10791 size_t k( kbegin );
// xmm1..xmm4 accumulate step k, xmm5..xmm8 accumulate step k+1 for columns j..j+3.
10793 for( ; (k+2UL) <= kend; k+=2UL ) {
10794 const SIMDType a1( A.load(i,k ) );
10795 const SIMDType a2( A.load(i,k+1UL) );
10796 xmm1 += a1 *
set( B(k ,j ) );
10797 xmm2 += a1 *
set( B(k ,j+1UL) );
10798 xmm3 += a1 *
set( B(k ,j+2UL) );
10799 xmm4 += a1 *
set( B(k ,j+3UL) );
10800 xmm5 += a2 *
set( B(k+1UL,j ) );
10801 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
10802 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
10803 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
10806 for( ; k<kend; ++k ) {
10807 const SIMDType a1( A.load(i,k) );
10808 xmm1 += a1 *
set( B(k,j ) );
10809 xmm2 += a1 *
set( B(k,j+1UL) );
10810 xmm3 += a1 *
set( B(k,j+2UL) );
10811 xmm4 += a1 *
set( B(k,j+3UL) );
// Combine the partial sums of both unrolled k steps, scale, and add onto C.
10814 C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
10815 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
10816 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
10817 C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
// Tile of 1 SIMD vector x 3 columns, k unrolled by two.
10820 for( ; (j+3UL) <= jend; j+=3UL )
10822 const size_t kbegin( ( IsLower_v<MT5> )
10823 ?( ( IsUpper_v<MT4> )
10824 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10825 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10826 :( IsUpper_v<MT4> ? i : 0UL ) );
10827 const size_t kend( ( IsUpper_v<MT5> )
10828 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
10831 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10832 size_t k( kbegin );
10834 for( ; (k+2UL) <= kend; k+=2UL ) {
10835 const SIMDType a1( A.load(i,k ) );
10836 const SIMDType a2( A.load(i,k+1UL) );
10837 xmm1 += a1 *
set( B(k ,j ) );
10838 xmm2 += a1 *
set( B(k ,j+1UL) );
10839 xmm3 += a1 *
set( B(k ,j+2UL) );
10840 xmm4 += a2 *
set( B(k+1UL,j ) );
10841 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
10842 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
10845 for( ; k<kend; ++k ) {
10846 const SIMDType a1( A.load(i,k) );
10847 xmm1 += a1 *
set( B(k,j ) );
10848 xmm2 += a1 *
set( B(k,j+1UL) );
10849 xmm3 += a1 *
set( B(k,j+2UL) );
10852 C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
10853 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
10854 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
// Tile of 1 SIMD vector x 2 columns, k unrolled by two.
10857 for( ; (j+2UL) <= jend; j+=2UL )
10859 const size_t kbegin( ( IsLower_v<MT5> )
10860 ?( ( IsUpper_v<MT4> )
10861 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10862 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10863 :( IsUpper_v<MT4> ? i : 0UL ) );
10864 const size_t kend( ( IsUpper_v<MT5> )
10865 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10868 SIMDType xmm1, xmm2, xmm3, xmm4;
10869 size_t k( kbegin );
10871 for( ; (k+2UL) <= kend; k+=2UL ) {
10872 const SIMDType a1( A.load(i,k ) );
10873 const SIMDType a2( A.load(i,k+1UL) );
10874 xmm1 += a1 *
set( B(k ,j ) );
10875 xmm2 += a1 *
set( B(k ,j+1UL) );
10876 xmm3 += a2 *
set( B(k+1UL,j ) );
10877 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
10880 for( ; k<kend; ++k ) {
10881 const SIMDType a1( A.load(i,k) );
10882 xmm1 += a1 *
set( B(k,j ) );
10883 xmm2 += a1 *
set( B(k,j+1UL) );
10886 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
10887 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
// Single remaining column for the 1-vector row block; here k runs up to K (no kend is computed).
10892 const size_t kbegin( ( IsLower_v<MT5> )
10893 ?( ( IsUpper_v<MT4> )
10894 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10895 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10896 :( IsUpper_v<MT4> ? i : 0UL ) );
10898 SIMDType xmm1, xmm2;
10899 size_t k( kbegin );
10901 for( ; (k+2UL) <= K; k+=2UL ) {
10902 xmm1 += A.load(i,k ) *
set( B(k ,j) );
10903 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
10906 for( ; k<K; ++k ) {
10907 xmm1 += A.load(i,k) *
set( B(k,j) );
10910 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// Scalar clean-up for the rows the SIMD passes did not cover; compiled in only when 'remainder'
// is true. Uses plain element access and multiplies by 'scalar' instead of 'factor'.
10914 for( ; remainder && i<M; ++i )
10916 const size_t jend( LOW ? i+1UL : N );
10917 size_t j( UPP ? i : 0UL );
// Two columns per iteration.
10919 for( ; (j+2UL) <= jend; j+=2UL )
10921 const size_t kbegin( ( IsLower_v<MT5> )
10922 ?( ( IsUpper_v<MT4> )
10923 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10924 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10925 :( IsUpper_v<MT4> ? i : 0UL ) );
10926 const size_t kend( ( IsUpper_v<MT5> )
10927 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10933 for(
size_t k=kbegin; k<kend; ++k ) {
10934 value1 += A(i,k) * B(k,j );
10935 value2 += A(i,k) * B(k,j+1UL);
10938 C(i,j ) += value1 * scalar;
10939 C(i,j+1UL) += value2 * scalar;
// Last single column of the scalar clean-up (its guarding condition is not visible here).
10944 const size_t kbegin( ( IsLower_v<MT5> )
10945 ?( ( IsUpper_v<MT4> )
10946 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10947 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10948 :( IsUpper_v<MT4> ? i : 0UL ) );
10952 for(
size_t k=kbegin; k<K; ++k ) {
10953 value += A(i,k) * B(k,j);
10956 C(i,j) += value * scalar;
// Fallback "large matrix" add-assignment kernel: selected when UseVectorizedDefaultKernel_v is
// false for the given operand types (DisableIf_t) and simply forwards all four arguments to
// selectDefaultAddAssignKernel().
// NOTE(review): the remaining template parameters are not visible in this excerpt.
10976 template<
typename MT3
10980 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10981 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10983 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized "large matrix" add-assignment kernel: selected when UseVectorizedDefaultKernel_v is
// true for the given operand types. It delegates to one of lmmm(), ummm() or mmm().
// NOTE(review): the conditions choosing between the three calls are not visible in this excerpt
// (presumably the LOW/UPP structure flags -- confirm against the complete file).
11002 template<
typename MT3
11006 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11007 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// 'scalar' is passed as the fourth argument and ST2(1) as the fifth; presumably these are the
// factors for the product and for the existing content of C, respectively -- TODO confirm
// against the declarations of lmmm/ummm/mmm.
11010 lmmm( C, A, B, scalar, ST2(1) );
11012 ummm( C, A, B, scalar, ST2(1) );
11014 mmm( C, A, B, scalar, ST2(1) );
// Fallback for the BLAS-based add-assignment: selected when UseBlasKernel_v is false for the
// given operand types (DisableIf_t) and forwards all arguments to selectLargeAddAssignKernel().
// NOTE(review): the remaining template parameters are not visible in this excerpt.
11032 template<
typename MT3
11036 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11037 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11039 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based add-assignment kernel: compiled only under the preprocessor condition below and
// selected when UseBlasKernel_v is true for the given operand types.
// NOTE(review): the matching #endif, the braces and the final 'else' are not visible in this excerpt.
11044 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 11058 template<
typename MT3
11062 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11063 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// Element type of the target; the scalar is converted to it before being handed to trmm/gemm.
11065 using ET = ElementType_t<MT3>;
// Triangular left operand: copy B into a temporary, let trmm() combine it with A from the left
// (lower or upper according to IsLower_v<MT4>), then add the temporary onto C.
11067 if( IsTriangular_v<MT4> ) {
11068 ResultType_t<MT3> tmp(
serial( B ) );
11069 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
11070 addAssign( C, tmp );
// Triangular right operand: same scheme with the roles swapped (temporary copy of A, B applied
// from the right).
11072 else if( IsTriangular_v<MT5> ) {
11073 ResultType_t<MT3> tmp(
serial( A ) );
11074 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
11075 addAssign( C, tmp );
// General case: gemm() called directly on C with the factors ET(scalar) and ET(1) (presumably
// alpha and beta, i.e. C = scalar*A*B + C -- TODO confirm against the gemm wrapper).
11078 gemm( C, A, B,
ET(scalar),
ET(1) );
// Subtraction assignment of a scaled matrix product (rhs, a DMatScalarMultExpr) from the dense
// matrix lhs. The operands of the wrapped product are fetched from rhs.matrix_ and the work is
// handed to selectSubAssignKernel() together with rhs.scalar_.
// NOTE(review): this excerpt omits lines, among them the remaining template parameters, the body
// of the empty-size branch and the definitions of A and B that are passed to the kernel.
11100 template<
typename MT
11102 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Left and right operand of the underlying matrix product.
11109 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
11110 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special-cases an empty target or an empty inner dimension (the branch body is not visible;
// presumably an early return -- confirm).
11112 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
11126 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Dispatcher for the subtraction assignment C -= scalar * A * B: chooses between the "small"
// kernel and the BLAS kernel.
// NOTE(review): the remaining template parameters and the 'else' in front of the second call
// are not visible in this excerpt.
11141 template<
typename MT3
11145 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is taken if any of the following holds:
//  - both operands are diagonal,
//  - outside of debug mode: a row-major target with at most SIMDSIZE*10 columns in B, or a
//    column-major target with at most SIMDSIZE*10 rows in A,
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
11147 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
11148 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
11149 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
11150 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11151 selectSmallSubAssignKernel( C, A, B, scalar );
// Otherwise (presumably the else branch) the BLAS selection is used.
11153 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel for the case that neither A (MT4) nor B (MT5) is
// diagonal: a temporary 'tmp' is subtracted from C.
// NOTE(review): the line creating 'tmp' is not visible in this excerpt; presumably it holds the
// evaluated scaled product A * B * scalar -- confirm against the complete file.
11171 template<
typename MT3
11175 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11176 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11179 subAssign( C, tmp );
// Default subtraction-assignment kernel for a row-major target, a non-diagonal A and a diagonal B.
// With B diagonal each element update reduces to C(i,j) -= A(i,j) * B(j,j) * scalar. The index
// space is traversed in square blocks of edge length BLOCK_SIZE.
// NOTE(review): the alternative branches of the jbegin/jpos conditionals and all braces are not
// visible in this excerpt.
11197 template<
typename MT3
11201 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11202 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11204 constexpr
size_t block( BLOCK_SIZE );
11206 const size_t M( A.rows() );
11207 const size_t N( B.columns() );
// Blocked traversal: ii/jj are the block origins, iend/jend the clamped block ends.
11209 for(
size_t ii=0UL; ii<M; ii+=block ) {
11210 const size_t iend(
min( M, ii+block ) );
11211 for(
size_t jj=0UL; jj<N; jj+=block ) {
11212 const size_t jend(
min( N, jj+block ) );
11213 for(
size_t i=ii; i<iend; ++i )
// Column range inside the block, narrowed for an upper (start) or lower (end) triangular A;
// the strict variants exclude the diagonal element.
11215 const size_t jbegin( ( IsUpper_v<MT4> )
11216 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
11218 const size_t jpos( ( IsLower_v<MT4> )
11219 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
11222 for(
size_t j=jbegin; j<jpos; ++j ) {
11223 C(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a column-major target, a non-diagonal A and a
// diagonal B. Each column j of C is updated with C(i,j) -= A(i,j) * B(j,j) * scalar, two rows
// per loop iteration.
// NOTE(review): the alternative branches of the ibegin/iend conditionals and all braces are not
// visible in this excerpt.
11245 template<
typename MT3
11249 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11250 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11252 const size_t M( A.rows() );
11253 const size_t N( B.columns() );
11255 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed for a lower (start) or upper (end) triangular A; the strict
// variants exclude the diagonal element.
11257 const size_t ibegin( ( IsLower_v<MT4> )
11258 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
11260 const size_t iend( ( IsUpper_v<MT4> )
11261 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// inum is the number of rows to process; ipos is the end of the part handled in pairs
// (the lowest bit of inum is masked off).
11265 const size_t inum( iend - ibegin );
11266 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
11268 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
11269 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
11270 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Odd row count: handle the one remaining row.
11272 if( ipos < iend ) {
11273 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a row-major target, a diagonal A and a non-diagonal
// B. Each row i of C is updated with C(i,j) -= A(i,i) * B(i,j) * scalar, two columns per loop
// iteration.
// NOTE(review): the alternative branches of the jbegin/jend conditionals and all braces are not
// visible in this excerpt.
11293 template<
typename MT3
11297 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11298 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11300 const size_t M( A.rows() );
11301 const size_t N( B.columns() );
11303 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed for an upper (start) or lower (end) triangular B; the strict
// variants exclude the diagonal element.
11305 const size_t jbegin( ( IsUpper_v<MT5> )
11306 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
11308 const size_t jend( ( IsLower_v<MT5> )
11309 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jnum is the number of columns to process; jpos is the end of the part handled in pairs
// (the lowest bit of jnum is masked off).
11313 const size_t jnum( jend - jbegin );
11314 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
11316 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
11317 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
11318 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Odd column count: handle the one remaining column.
11320 if( jpos < jend ) {
11321 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction-assignment kernel for a column-major target, a diagonal A and a
// non-diagonal B. With A diagonal each element update reduces to
// C(i,j) -= A(i,i) * B(i,j) * scalar. The index space is traversed in square blocks of edge
// length BLOCK_SIZE, columns in the outer loop.
// NOTE(review): the alternative branches of the ibegin/ipos conditionals and all braces are not
// visible in this excerpt.
11341 template<
typename MT3
11345 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11346 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11348 constexpr
size_t block( BLOCK_SIZE );
11350 const size_t M( A.rows() );
11351 const size_t N( B.columns() );
// Blocked traversal: jj/ii are the block origins, jend/iend the clamped block ends.
11353 for(
size_t jj=0UL; jj<N; jj+=block ) {
11354 const size_t jend(
min( N, jj+block ) );
11355 for(
size_t ii=0UL; ii<M; ii+=block ) {
11356 const size_t iend(
min( M, ii+block ) );
11357 for(
size_t j=jj; j<jend; ++j )
// Row range inside the block, narrowed for a lower (start) or upper (end) triangular B; the
// strict variants exclude the diagonal element.
11359 const size_t ibegin( ( IsLower_v<MT5> )
11360 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
11362 const size_t ipos( ( IsUpper_v<MT5> )
11363 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
11366 for(
size_t i=ibegin; i<ipos; ++i ) {
11367 C(i,j) -= A(i,i) * B(i,j) * scalar;
// Default kernel for the scaled subtraction assignment C -= A * B * scalar, selected when both
// A and B are diagonal (see the EnableIf_t condition). Only the diagonal of C is touched:
// C(i,i) -= A(i,i) * B(i,i) * scalar for every row i of A.
//
// C      : target matrix that is updated in place.
// A, B   : left and right operand of the multiplication.
// scalar : scaling factor applied to every product term.
11389 template<
typename MT3
11393 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11394 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11396 for(
size_t i=0UL; i<A.rows(); ++i ) {
11397 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix kernel for the scaled subtraction assignment, selected whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). It performs no work of
// its own and forwards all arguments unchanged to selectDefaultSubAssignKernel().
11416 template<
typename MT3
11420 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11421 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11423 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for the scaled subtraction assignment C -= A * B * scalar,
// selected for a row-major target C when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
//
// C      : target matrix that is updated in place via C.load()/C.store().
// A, B   : left and right operand; A is read element-wise, B is read via B.load().
// scalar : scaling factor; it is applied once per result vector when storing.
//
// The kernel works on panels of columns of C that are 8, 5, 4, 3, 2 and 1 SIMD vectors wide
// (as seen from the column offsets j+SIMDSIZE*n used in the B.load()/C.store() calls) and
// finishes with a scalar loop over the columns in [jpos,N).
// NOTE(review): the xmm accumulators are declared without an initializer and then used with
// '+='; this assumes SIMDType default-constructs to zero - TODO confirm.
11442 template<
typename MT3
11446 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11447 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both C and B are padded.
11449 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
11451 const size_t M( A.rows() );
11452 const size_t N( B.columns() );
11453 const size_t K( A.columns() );
// With a remainder, jpos is N masked with -SIMDSIZE, i.e. N rounded down to a multiple of
// SIMDSIZE (assumes SIMDSIZE is a power of two - TODO confirm); otherwise jpos equals N.
11457 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// SIMD vector created from the scalar (presumably a broadcast; see set()).
11460 const SIMDType factor(
set( scalar ) );
// Panel of eight SIMD vectors (columns j .. j+SIMDSIZE*7UL), one row per iteration. It follows
// a check for an integral element type.
11464 if( IsIntegral_v<ElementType> )
11467 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend narrow the k range according to the upper/lower structure of A (MT4) and
// B (MT5); kend never exceeds K.
11469 const size_t kbegin( ( IsUpper_v<MT4> )
11470 ?( ( IsLower_v<MT5> )
11471 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11472 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11473 :( IsLower_v<MT5> ? j : 0UL ) );
11474 const size_t kend( ( IsLower_v<MT4> )
11475 ?( ( IsUpper_v<MT5> )
11476 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
11477 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
11478 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
11480 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate A(i,k) times eight consecutive vectors of row k of B.
11482 for(
size_t k=kbegin; k<kend; ++k ) {
11483 const SIMDType a1(
set( A(i,k) ) );
11484 xmm1 += a1 * B.load(k,j );
11485 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11486 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11487 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
11488 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
11489 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
11490 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
11491 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Subtract the scaled accumulator from the current content of C.
11494 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Panel of five SIMD vectors, two rows (i and i+1) per iteration: xmm1..xmm5 are stored to
// row i, xmm6..xmm10 to row i+1.
11510 for( ; (i+2UL) <= M; i+=2UL )
11512 const size_t kbegin( ( IsUpper_v<MT4> )
11513 ?( ( IsLower_v<MT5> )
11514 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11515 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11516 :( IsLower_v<MT5> ? j : 0UL ) );
11517 const size_t kend( ( IsLower_v<MT4> )
11518 ?( ( IsUpper_v<MT5> )
11519 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
11520 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11521 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
11523 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11525 for(
size_t k=kbegin; k<kend; ++k ) {
11526 const SIMDType a1(
set( A(i ,k) ) );
11527 const SIMDType a2(
set( A(i+1UL,k) ) );
11528 const SIMDType b1( B.load(k,j ) );
11529 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
11530 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
11531 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
11532 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
11545 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11550 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
11552 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm8 * factor );
11553 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm9 * factor );
11554 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) - xmm10 * factor );
// Single row i of the five-vector panel (xmm1..xmm5).
11559 const size_t kbegin( ( IsUpper_v<MT4> )
11560 ?( ( IsLower_v<MT5> )
11561 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11562 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11563 :( IsLower_v<MT5> ? j : 0UL ) );
11564 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
11566 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11568 for(
size_t k=kbegin; k<kend; ++k ) {
11569 const SIMDType a1(
set( A(i,k) ) );
11570 xmm1 += a1 * B.load(k,j );
11571 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11572 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11573 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
11574 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
11577 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Panel of four SIMD vectors, two rows per iteration: xmm1..xmm4 are stored to row i,
// xmm5..xmm8 to row i+1.
11589 for( ; (i+2UL) <= M; i+=2UL )
11591 const size_t kbegin( ( IsUpper_v<MT4> )
11592 ?( ( IsLower_v<MT5> )
11593 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11594 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11595 :( IsLower_v<MT5> ? j : 0UL ) );
11596 const size_t kend( ( IsLower_v<MT4> )
11597 ?( ( IsUpper_v<MT5> )
11598 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
11599 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11600 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
11602 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11604 for(
size_t k=kbegin; k<kend; ++k ) {
11605 const SIMDType a1(
set( A(i ,k) ) );
11606 const SIMDType a2(
set( A(i+1UL,k) ) );
11607 const SIMDType b1( B.load(k,j ) );
11608 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
11609 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
11610 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
11621 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11625 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
11627 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm7 * factor );
11628 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm8 * factor );
// Single row i of the four-vector panel (xmm1..xmm4).
11633 const size_t kbegin( ( IsUpper_v<MT4> )
11634 ?( ( IsLower_v<MT5> )
11635 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11636 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11637 :( IsLower_v<MT5> ? j : 0UL ) );
11638 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
11640 SIMDType xmm1, xmm2, xmm3, xmm4;
11642 for(
size_t k=kbegin; k<kend; ++k ) {
11643 const SIMDType a1(
set( A(i,k) ) );
11644 xmm1 += a1 * B.load(k,j );
11645 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11646 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11647 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
11650 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Panel of three SIMD vectors, two rows per iteration: xmm1..xmm3 are stored to row i,
// xmm4..xmm6 to row i+1.
11661 for( ; (i+2UL) <= M; i+=2UL )
11663 const size_t kbegin( ( IsUpper_v<MT4> )
11664 ?( ( IsLower_v<MT5> )
11665 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11666 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11667 :( IsLower_v<MT5> ? j : 0UL ) );
11668 const size_t kend( ( IsLower_v<MT4> )
11669 ?( ( IsUpper_v<MT5> )
11670 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
11671 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11672 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
11674 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11676 for(
size_t k=kbegin; k<kend; ++k ) {
11677 const SIMDType a1(
set( A(i ,k) ) );
11678 const SIMDType a2(
set( A(i+1UL,k) ) );
11679 const SIMDType b1( B.load(k,j ) );
11680 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
11681 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
11690 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11693 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
11695 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm6 * factor );
// Single row i of the three-vector panel (xmm1..xmm3).
11700 const size_t kbegin( ( IsUpper_v<MT4> )
11701 ?( ( IsLower_v<MT5> )
11702 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11703 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11704 :( IsLower_v<MT5> ? j : 0UL ) );
11705 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
11707 SIMDType xmm1, xmm2, xmm3;
11709 for(
size_t k=kbegin; k<kend; ++k ) {
11710 const SIMDType a1(
set( A(i,k) ) );
11711 xmm1 += a1 * B.load(k,j );
11712 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11713 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11716 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Panel of two SIMD vectors. The row range is restricted by the UPP/LOW flags of the
// expression: with UPP the rows end at min(j+SIMDSIZE*2UL,M), with LOW they start at j.
11724 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
11725 size_t i( LOW ? j : 0UL );
// Four rows per iteration; xmm1/xmm3/xmm5/xmm7 are stored to column j of rows i..i+3.
11727 for( ; (i+4UL) <= iend; i+=4UL )
11729 const size_t kbegin( ( IsUpper_v<MT4> )
11730 ?( ( IsLower_v<MT5> )
11731 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11732 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11733 :( IsLower_v<MT5> ? j : 0UL ) );
11734 const size_t kend( ( IsLower_v<MT4> )
11735 ?( ( IsUpper_v<MT5> )
11736 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
11737 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
11738 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
11740 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11742 for(
size_t k=kbegin; k<kend; ++k ) {
11743 const SIMDType a1(
set( A(i ,k) ) );
11744 const SIMDType a2(
set( A(i+1UL,k) ) );
11745 const SIMDType a3(
set( A(i+2UL,k) ) );
11746 const SIMDType a4(
set( A(i+3UL,k) ) );
11747 const SIMDType b1( B.load(k,j ) );
11748 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
11759 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11761 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
11763 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
11765 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
// Three rows per iteration; xmm1/xmm3/xmm5 are stored to column j of rows i..i+2.
11769 for( ; (i+3UL) <= iend; i+=3UL )
11771 const size_t kbegin( ( IsUpper_v<MT4> )
11772 ?( ( IsLower_v<MT5> )
11773 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11774 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11775 :( IsLower_v<MT5> ? j : 0UL ) );
11776 const size_t kend( ( IsLower_v<MT4> )
11777 ?( ( IsUpper_v<MT5> )
11778 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
11779 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
11780 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
11782 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11784 for(
size_t k=kbegin; k<kend; ++k ) {
11785 const SIMDType a1(
set( A(i ,k) ) );
11786 const SIMDType a2(
set( A(i+1UL,k) ) );
11787 const SIMDType a3(
set( A(i+2UL,k) ) );
11788 const SIMDType b1( B.load(k,j ) );
11789 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
11798 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11800 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
11802 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
// Two rows per iteration with the k loop unrolled by two. Two sets of accumulators are kept
// and combined pairwise when storing, e.g. (xmm1+xmm5) for C(i,j).
11806 for( ; (i+2UL) <= iend; i+=2UL )
11808 const size_t kbegin( ( IsUpper_v<MT4> )
11809 ?( ( IsLower_v<MT5> )
11810 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11811 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11812 :( IsLower_v<MT5> ? j : 0UL ) );
11813 const size_t kend( ( IsLower_v<MT4> )
11814 ?( ( IsUpper_v<MT5> )
11815 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
11816 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11817 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
11819 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11820 size_t k( kbegin );
11822 for( ; (k+2UL) <= kend; k+=2UL ) {
11823 const SIMDType a1(
set( A(i ,k ) ) );
11824 const SIMDType a2(
set( A(i+1UL,k ) ) );
11825 const SIMDType a3(
set( A(i ,k+1UL) ) );
11826 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
11827 const SIMDType b1( B.load(k ,j ) );
11828 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
11829 const SIMDType b3( B.load(k+1UL,j ) );
11830 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Remaining k iteration when the k range has odd length.
11841 for( ; k<kend; ++k ) {
11842 const SIMDType a1(
set( A(i ,k) ) );
11843 const SIMDType a2(
set( A(i+1UL,k) ) );
11844 const SIMDType b1( B.load(k,j ) );
11845 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
11852 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
11854 C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
11855 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) - (xmm4+xmm8) * factor );
// Single row i of the two-vector panel, k loop unrolled by two.
11860 const size_t kbegin( ( IsUpper_v<MT4> )
11861 ?( ( IsLower_v<MT5> )
11862 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11863 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11864 :( IsLower_v<MT5> ? j : 0UL ) );
11865 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
11867 SIMDType xmm1, xmm2, xmm3, xmm4;
11868 size_t k( kbegin );
11870 for( ; (k+2UL) <= kend; k+=2UL ) {
11871 const SIMDType a1(
set( A(i,k ) ) );
11872 const SIMDType a2(
set( A(i,k+1UL) ) );
11873 xmm1 += a1 * B.load(k ,j );
11874 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
11875 xmm3 += a2 * B.load(k+1UL,j );
11876 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
11879 for( ; k<kend; ++k ) {
11880 const SIMDType a1(
set( A(i,k) ) );
11881 xmm1 += a1 * B.load(k,j );
11882 xmm2 += a1 * B.load(k,j+
SIMDSIZE);
11885 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
// Panel of a single SIMD vector. The rows end at min(j+SIMDSIZE,M) only if both LOW and UPP
// are set; with LOW they start at j.
11892 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
11893 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k loop unrolled by two (xmm1..xmm4 for k, xmm5..xmm8 for k+1).
11895 for( ; (i+4UL) <= iend; i+=4UL )
11897 const size_t kbegin( ( IsUpper_v<MT4> )
11898 ?( ( IsLower_v<MT5> )
11899 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11900 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11901 :( IsLower_v<MT5> ? j : 0UL ) );
11902 const size_t kend( ( IsLower_v<MT4> )
11903 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
11906 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11907 size_t k( kbegin );
11909 for( ; (k+2UL) <= kend; k+=2UL ) {
11910 const SIMDType b1( B.load(k ,j) );
11911 const SIMDType b2( B.load(k+1UL,j) );
11912 xmm1 +=
set( A(i ,k ) ) * b1;
11913 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11914 xmm3 +=
set( A(i+2UL,k ) ) * b1;
11915 xmm4 +=
set( A(i+3UL,k ) ) * b1;
11916 xmm5 +=
set( A(i ,k+1UL) ) * b2;
11917 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
11918 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
11919 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
11922 for( ; k<kend; ++k ) {
11923 const SIMDType b1( B.load(k,j) );
11924 xmm1 +=
set( A(i ,k) ) * b1;
11925 xmm2 +=
set( A(i+1UL,k) ) * b1;
11926 xmm3 +=
set( A(i+2UL,k) ) * b1;
11927 xmm4 +=
set( A(i+3UL,k) ) * b1;
11930 C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
11931 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
11932 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
11933 C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
// Three rows per iteration, k loop unrolled by two (xmm1..xmm3 for k, xmm4..xmm6 for k+1).
11936 for( ; (i+3UL) <= iend; i+=3UL )
11938 const size_t kbegin( ( IsUpper_v<MT4> )
11939 ?( ( IsLower_v<MT5> )
11940 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11941 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11942 :( IsLower_v<MT5> ? j : 0UL ) );
11943 const size_t kend( ( IsLower_v<MT4> )
11944 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
11947 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11948 size_t k( kbegin );
11950 for( ; (k+2UL) <= kend; k+=2UL ) {
11951 const SIMDType b1( B.load(k ,j) );
11952 const SIMDType b2( B.load(k+1UL,j) );
11953 xmm1 +=
set( A(i ,k ) ) * b1;
11954 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11955 xmm3 +=
set( A(i+2UL,k ) ) * b1;
11956 xmm4 +=
set( A(i ,k+1UL) ) * b2;
11957 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
11958 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
11961 for( ; k<kend; ++k ) {
11962 const SIMDType b1( B.load(k,j) );
11963 xmm1 +=
set( A(i ,k) ) * b1;
11964 xmm2 +=
set( A(i+1UL,k) ) * b1;
11965 xmm3 +=
set( A(i+2UL,k) ) * b1;
11968 C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
11969 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
11970 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
// Two rows per iteration, k loop unrolled by two (xmm1/xmm2 for k, xmm3/xmm4 for k+1).
11973 for( ; (i+2UL) <= iend; i+=2UL )
11975 const size_t kbegin( ( IsUpper_v<MT4> )
11976 ?( ( IsLower_v<MT5> )
11977 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11978 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11979 :( IsLower_v<MT5> ? j : 0UL ) );
11980 const size_t kend( ( IsLower_v<MT4> )
11981 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
11984 SIMDType xmm1, xmm2, xmm3, xmm4;
11985 size_t k( kbegin );
11987 for( ; (k+2UL) <= kend; k+=2UL ) {
11988 const SIMDType b1( B.load(k ,j) );
11989 const SIMDType b2( B.load(k+1UL,j) );
11990 xmm1 +=
set( A(i ,k ) ) * b1;
11991 xmm2 +=
set( A(i+1UL,k ) ) * b1;
11992 xmm3 +=
set( A(i ,k+1UL) ) * b2;
11993 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
11996 for( ; k<kend; ++k ) {
11997 const SIMDType b1( B.load(k,j) );
11998 xmm1 +=
set( A(i ,k) ) * b1;
11999 xmm2 +=
set( A(i+1UL,k) ) * b1;
12002 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
12003 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
// Single row i of the one-vector panel; here the k loop runs up to K.
12008 const size_t kbegin( ( IsUpper_v<MT4> )
12009 ?( ( IsLower_v<MT5> )
12010 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12011 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12012 :( IsLower_v<MT5> ? j : 0UL ) );
12014 SIMDType xmm1, xmm2;
12015 size_t k( kbegin );
12017 for( ; (k+2UL) <= K; k+=2UL ) {
12018 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
12019 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
12022 for( ; k<K; ++k ) {
12023 xmm1 +=
set( A(i,k) ) * B.load(k,j);
12026 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
// Scalar remainder: the columns up to N that were not handled by the SIMD panels are
// processed element-wise (only if 'remainder' is set).
12030 for( ; remainder && j<N; ++j )
12032 const size_t iend( UPP ? j+1UL : M );
12033 size_t i( LOW ? j : 0UL );
// Two rows per iteration.
12035 for( ; (i+2UL) <= iend; i+=2UL )
12037 const size_t kbegin( ( IsUpper_v<MT4> )
12038 ?( ( IsLower_v<MT5> )
12039 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12040 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12041 :( IsLower_v<MT5> ? j : 0UL ) );
12042 const size_t kend( ( IsLower_v<MT4> )
12043 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
12049 for(
size_t k=kbegin; k<kend; ++k ) {
12050 value1 += A(i ,k) * B(k,j);
12051 value2 += A(i+1UL,k) * B(k,j);
12054 C(i ,j) -= value1 * scalar;
12055 C(i+1UL,j) -= value2 * scalar;
// Single row i of the scalar remainder.
12060 const size_t kbegin( ( IsUpper_v<MT4> )
12061 ?( ( IsLower_v<MT5> )
12062 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12063 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12064 :( IsLower_v<MT5> ? j : 0UL ) );
12068 for(
size_t k=kbegin; k<K; ++k ) {
12069 value += A(i,k) * B(k,j);
12072 C(i,j) -= value * scalar;
// Vectorized small-matrix kernel for the scaled subtraction assignment C -= A * B * scalar,
// selected for a column-major target C when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>
// holds.
//
// C      : target matrix that is updated in place via C.load()/C.store().
// A, B   : left and right operand; A is read via A.load(), B is read element-wise.
// scalar : scaling factor; it is applied once per result vector when storing.
//
// The kernel works on panels of rows of C that are 8, 5, 4, 3, 2 and 1 SIMD vectors tall
// (as seen from the row offsets i+SIMDSIZE*n used in the A.load()/C.store() calls) and
// finishes with a scalar loop over the rows in [ipos,M).
// NOTE(review): the xmm accumulators are declared without an initializer and then used with
// '+='; this assumes SIMDType default-constructs to zero - TODO confirm.
12093 template<
typename MT3
12097 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12098 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both C and A are padded.
12100 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
12102 const size_t M( A.rows() );
12103 const size_t N( B.columns() );
12104 const size_t K( A.columns() );
// With a remainder, ipos is M masked with -SIMDSIZE, i.e. M rounded down to a multiple of
// SIMDSIZE (assumes SIMDSIZE is a power of two - TODO confirm); otherwise ipos equals M.
12108 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// SIMD vector created from the scalar (presumably a broadcast; see set()).
12111 const SIMDType factor(
set( scalar ) );
// Panel of eight SIMD vectors (rows i .. i+SIMDSIZE*7UL), one column per iteration. It follows
// a check for an integral element type.
12115 if( IsIntegral_v<ElementType> )
12118 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend narrow the k range according to the upper/lower structure of A (MT4) and
// B (MT5); kend never exceeds K when A is lower.
12120 const size_t kbegin( ( IsLower_v<MT5> )
12121 ?( ( IsUpper_v<MT4> )
12122 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12123 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12124 :( IsUpper_v<MT4> ? i : 0UL ) );
12125 const size_t kend( ( IsUpper_v<MT5> )
12126 ?( ( IsLower_v<MT4> )
12127 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
12128 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
12129 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
12131 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate eight consecutive vectors of column k of A times B(k,j).
12133 for(
size_t k=kbegin; k<kend; ++k ) {
12134 const SIMDType b1(
set( B(k,j) ) );
12135 xmm1 += A.load(i ,k) * b1;
12136 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12137 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12138 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
12139 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
12140 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
12141 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
12142 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Subtract the scaled accumulator from the current content of C.
12145 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Panel of five SIMD vectors, two columns (j and j+1) per iteration: xmm1..xmm5 are stored to
// column j, xmm6..xmm10 to column j+1.
12161 for( ; (j+2UL) <= N; j+=2UL )
12163 const size_t kbegin( ( IsLower_v<MT5> )
12164 ?( ( IsUpper_v<MT4> )
12165 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12166 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12167 :( IsUpper_v<MT4> ? i : 0UL ) );
12168 const size_t kend( ( IsUpper_v<MT5> )
12169 ?( ( IsLower_v<MT4> )
12170 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12171 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12172 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
12174 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12176 for(
size_t k=kbegin; k<kend; ++k ) {
12177 const SIMDType a1( A.load(i ,k) );
12178 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
12179 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
12180 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
12181 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
12182 const SIMDType b1(
set( B(k,j ) ) );
12183 const SIMDType b2(
set( B(k,j+1UL) ) );
12196 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12201 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
12203 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12204 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12205 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
// Single column j of the five-vector panel (xmm1..xmm5).
12210 const size_t kbegin( ( IsLower_v<MT5> )
12211 ?( ( IsUpper_v<MT4> )
12212 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12213 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12214 :( IsUpper_v<MT4> ? i : 0UL ) );
12215 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
12217 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12219 for(
size_t k=kbegin; k<kend; ++k ) {
12220 const SIMDType b1(
set( B(k,j) ) );
12221 xmm1 += A.load(i ,k) * b1;
12222 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12223 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12224 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
12225 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
12228 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Panel of four SIMD vectors, two columns per iteration: xmm1..xmm4 are stored to column j,
// xmm5..xmm8 to column j+1.
12240 for( ; (j+2UL) <= N; j+=2UL )
12242 const size_t kbegin( ( IsLower_v<MT5> )
12243 ?( ( IsUpper_v<MT4> )
12244 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12245 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12246 :( IsUpper_v<MT4> ? i : 0UL ) );
12247 const size_t kend( ( IsUpper_v<MT5> )
12248 ?( ( IsLower_v<MT4> )
12249 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12250 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12251 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
12253 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12255 for(
size_t k=kbegin; k<kend; ++k ) {
12256 const SIMDType a1( A.load(i ,k) );
12257 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
12258 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
12259 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
12260 const SIMDType b1(
set( B(k,j ) ) );
12261 const SIMDType b2(
set( B(k,j+1UL) ) );
12272 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12276 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
12278 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12279 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// Single column j of the four-vector panel (xmm1..xmm4).
12284 const size_t kbegin( ( IsLower_v<MT5> )
12285 ?( ( IsUpper_v<MT4> )
12286 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12287 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12288 :( IsUpper_v<MT4> ? i : 0UL ) );
12289 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
12291 SIMDType xmm1, xmm2, xmm3, xmm4;
12293 for(
size_t k=kbegin; k<kend; ++k ) {
12294 const SIMDType b1(
set( B(k,j) ) );
12295 xmm1 += A.load(i ,k) * b1;
12296 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12297 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12298 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
12301 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Panel of three SIMD vectors, two columns per iteration: xmm1..xmm3 are stored to column j,
// xmm4..xmm6 to column j+1.
12312 for( ; (j+2UL) <= N; j+=2UL )
12314 const size_t kbegin( ( IsLower_v<MT5> )
12315 ?( ( IsUpper_v<MT4> )
12316 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12317 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12318 :( IsUpper_v<MT4> ? i : 0UL ) );
12319 const size_t kend( ( IsUpper_v<MT5> )
12320 ?( ( IsLower_v<MT4> )
12321 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12322 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12323 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
12325 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12327 for(
size_t k=kbegin; k<kend; ++k ) {
12328 const SIMDType a1( A.load(i ,k) );
12329 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
12330 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
12331 const SIMDType b1(
set( B(k,j ) ) );
12332 const SIMDType b2(
set( B(k,j+1UL) ) );
12341 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12344 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
12346 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
// Single column j of the three-vector panel (xmm1..xmm3).
12351 const size_t kbegin( ( IsLower_v<MT5> )
12352 ?( ( IsUpper_v<MT4> )
12353 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12354 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12355 :( IsUpper_v<MT4> ? i : 0UL ) );
12356 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
12358 SIMDType xmm1, xmm2, xmm3;
12360 for(
size_t k=kbegin; k<kend; ++k ) {
12361 const SIMDType b1(
set( B(k,j) ) );
12362 xmm1 += A.load(i ,k) * b1;
12363 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12364 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12367 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Panel of two SIMD vectors. The column range is restricted by the LOW/UPP flags of the
// expression: with LOW the columns end at min(i+SIMDSIZE*2UL,N), with UPP they start at i.
12375 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
12376 size_t j( UPP ? i : 0UL );
// Four columns per iteration; xmm1/xmm3/xmm5/xmm7 are stored to row i of columns j..j+3.
12378 for( ; (j+4UL) <= jend; j+=4UL )
12380 const size_t kbegin( ( IsLower_v<MT5> )
12381 ?( ( IsUpper_v<MT4> )
12382 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12383 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12384 :( IsUpper_v<MT4> ? i : 0UL ) );
12385 const size_t kend( ( IsUpper_v<MT5> )
12386 ?( ( IsLower_v<MT4> )
12387 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
12388 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
12389 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12391 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12393 for(
size_t k=kbegin; k<kend; ++k ) {
12394 const SIMDType a1( A.load(i ,k) );
12395 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
12396 const SIMDType b1(
set( B(k,j ) ) );
12397 const SIMDType b2(
set( B(k,j+1UL) ) );
12398 const SIMDType b3(
set( B(k,j+2UL) ) );
12399 const SIMDType b4(
set( B(k,j+3UL) ) );
12410 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12412 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12414 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
12416 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
// Three columns per iteration; xmm1/xmm3/xmm5 are stored to row i of columns j..j+2.
12420 for( ; (j+3UL) <= jend; j+=3UL )
12422 const size_t kbegin( ( IsLower_v<MT5> )
12423 ?( ( IsUpper_v<MT4> )
12424 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12425 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12426 :( IsUpper_v<MT4> ? i : 0UL ) );
12427 const size_t kend( ( IsUpper_v<MT5> )
12428 ?( ( IsLower_v<MT4> )
12429 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
12430 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
12431 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12433 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12435 for(
size_t k=kbegin; k<kend; ++k ) {
12436 const SIMDType a1( A.load(i ,k) );
12437 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
12438 const SIMDType b1(
set( B(k,j ) ) );
12439 const SIMDType b2(
set( B(k,j+1UL) ) );
12440 const SIMDType b3(
set( B(k,j+2UL) ) );
12449 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12451 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12453 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
// Two columns per iteration with the k loop unrolled by two. Two sets of accumulators are
// kept and combined pairwise when storing, e.g. (xmm1+xmm5) for C(i,j).
12457 for( ; (j+2UL) <= jend; j+=2UL )
12459 const size_t kbegin( ( IsLower_v<MT5> )
12460 ?( ( IsUpper_v<MT4> )
12461 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12462 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12463 :( IsUpper_v<MT4> ? i : 0UL ) );
12464 const size_t kend( ( IsUpper_v<MT5> )
12465 ?( ( IsLower_v<MT4> )
12466 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12467 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12468 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12470 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12471 size_t k( kbegin );
12473 for( ; (k+2UL) <= kend; k+=2UL ) {
12474 const SIMDType a1( A.load(i ,k ) );
12475 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
12476 const SIMDType a3( A.load(i ,k+1UL) );
12477 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
12478 const SIMDType b1(
set( B(k ,j ) ) );
12479 const SIMDType b2(
set( B(k ,j+1UL) ) );
12480 const SIMDType b3(
set( B(k+1UL,j ) ) );
12481 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Remaining k iteration when the k range has odd length.
12492 for( ; k<kend; ++k ) {
12493 const SIMDType a1( A.load(i ,k) );
12494 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
12495 const SIMDType b1(
set( B(k,j ) ) );
12496 const SIMDType b2(
set( B(k,j+1UL) ) );
12503 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
12505 C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
12506 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
// Single column j of the two-vector panel, k loop unrolled by two.
12511 const size_t kbegin( ( IsLower_v<MT5> )
12512 ?( ( IsUpper_v<MT4> )
12513 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12514 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12515 :( IsUpper_v<MT4> ? i : 0UL ) );
12516 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
12518 SIMDType xmm1, xmm2, xmm3, xmm4;
12519 size_t k( kbegin );
12521 for( ; (k+2UL) <= kend; k+=2UL ) {
12522 const SIMDType b1(
set( B(k ,j) ) );
12523 const SIMDType b2(
set( B(k+1UL,j) ) );
12524 xmm1 += A.load(i ,k ) * b1;
12525 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
12526 xmm3 += A.load(i ,k+1UL) * b2;
12527 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
12530 for( ; k<kend; ++k ) {
12531 const SIMDType b1(
set( B(k,j) ) );
12532 xmm1 += A.load(i ,k) * b1;
12533 xmm2 += A.load(i+
SIMDSIZE,k) * b1;
12536 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
// Panel of a single SIMD vector. The columns end at min(i+SIMDSIZE,N) only if both LOW and
// UPP are set; with UPP they start at i.
12543 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
12544 size_t j( UPP ? i : 0UL );
// Four columns per iteration, k loop unrolled by two (xmm1..xmm4 for k, xmm5..xmm8 for k+1).
12546 for( ; (j+4UL) <= jend; j+=4UL )
12548 const size_t kbegin( ( IsLower_v<MT5> )
12549 ?( ( IsUpper_v<MT4> )
12550 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12551 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12552 :( IsUpper_v<MT4> ? i : 0UL ) );
12553 const size_t kend( ( IsUpper_v<MT5> )
12554 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
12557 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12558 size_t k( kbegin );
12560 for( ; (k+2UL) <= kend; k+=2UL ) {
12561 const SIMDType a1( A.load(i,k ) );
12562 const SIMDType a2( A.load(i,k+1UL) );
12563 xmm1 += a1 *
set( B(k ,j ) );
12564 xmm2 += a1 *
set( B(k ,j+1UL) );
12565 xmm3 += a1 *
set( B(k ,j+2UL) );
12566 xmm4 += a1 *
set( B(k ,j+3UL) );
12567 xmm5 += a2 *
set( B(k+1UL,j ) );
12568 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
12569 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
12570 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
12573 for( ; k<kend; ++k ) {
12574 const SIMDType a1( A.load(i,k) );
12575 xmm1 += a1 *
set( B(k,j ) );
12576 xmm2 += a1 *
set( B(k,j+1UL) );
12577 xmm3 += a1 *
set( B(k,j+2UL) );
12578 xmm4 += a1 *
set( B(k,j+3UL) );
12581 C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
12582 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
12583 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
12584 C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
// Three columns per iteration, k loop unrolled by two (xmm1..xmm3 for k, xmm4..xmm6 for k+1).
12587 for( ; (j+3UL) <= jend; j+=3UL )
12589 const size_t kbegin( ( IsLower_v<MT5> )
12590 ?( ( IsUpper_v<MT4> )
12591 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12592 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12593 :( IsUpper_v<MT4> ? i : 0UL ) );
12594 const size_t kend( ( IsUpper_v<MT5> )
12595 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
12598 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12599 size_t k( kbegin );
12601 for( ; (k+2UL) <= kend; k+=2UL ) {
12602 const SIMDType a1( A.load(i,k ) );
12603 const SIMDType a2( A.load(i,k+1UL) );
12604 xmm1 += a1 *
set( B(k ,j ) );
12605 xmm2 += a1 *
set( B(k ,j+1UL) );
12606 xmm3 += a1 *
set( B(k ,j+2UL) );
12607 xmm4 += a2 *
set( B(k+1UL,j ) );
12608 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
12609 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
12612 for( ; k<kend; ++k ) {
12613 const SIMDType a1( A.load(i,k) );
12614 xmm1 += a1 *
set( B(k,j ) );
12615 xmm2 += a1 *
set( B(k,j+1UL) );
12616 xmm3 += a1 *
set( B(k,j+2UL) );
12619 C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
12620 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
12621 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
// Two columns per iteration, k loop unrolled by two (xmm1/xmm2 for k, xmm3/xmm4 for k+1).
12624 for( ; (j+2UL) <= jend; j+=2UL )
12626 const size_t kbegin( ( IsLower_v<MT5> )
12627 ?( ( IsUpper_v<MT4> )
12628 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12629 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12630 :( IsUpper_v<MT4> ? i : 0UL ) );
12631 const size_t kend( ( IsUpper_v<MT5> )
12632 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12635 SIMDType xmm1, xmm2, xmm3, xmm4;
12636 size_t k( kbegin );
12638 for( ; (k+2UL) <= kend; k+=2UL ) {
12639 const SIMDType a1( A.load(i,k ) );
12640 const SIMDType a2( A.load(i,k+1UL) );
12641 xmm1 += a1 *
set( B(k ,j ) );
12642 xmm2 += a1 *
set( B(k ,j+1UL) );
12643 xmm3 += a2 *
set( B(k+1UL,j ) );
12644 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
12647 for( ; k<kend; ++k ) {
12648 const SIMDType a1( A.load(i,k) );
12649 xmm1 += a1 *
set( B(k,j ) );
12650 xmm2 += a1 *
set( B(k,j+1UL) );
12653 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
12654 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
// Single column j of the one-vector panel; here the k loop runs up to K.
12659 const size_t kbegin( ( IsLower_v<MT5> )
12660 ?( ( IsUpper_v<MT4> )
12661 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12662 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12663 :( IsUpper_v<MT4> ? i : 0UL ) );
12665 SIMDType xmm1, xmm2;
12666 size_t k( kbegin );
12668 for( ; (k+2UL) <= K; k+=2UL ) {
12669 xmm1 += A.load(i,k ) *
set( B(k ,j) );
12670 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
12673 for( ; k<K; ++k ) {
12674 xmm1 += A.load(i,k) *
set( B(k,j) );
12677 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
// Scalar remainder: the rows up to M that were not handled by the SIMD panels are processed
// element-wise (only if 'remainder' is set).
12681 for( ; remainder && i<M; ++i )
12683 const size_t jend( LOW ? i+1UL : N );
12684 size_t j( UPP ? i : 0UL );
// Two columns per iteration.
12686 for( ; (j+2UL) <= jend; j+=2UL )
12688 const size_t kbegin( ( IsLower_v<MT5> )
12689 ?( ( IsUpper_v<MT4> )
12690 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12691 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12692 :( IsUpper_v<MT4> ? i : 0UL ) );
12693 const size_t kend( ( IsUpper_v<MT5> )
12694 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12700 for(
size_t k=kbegin; k<kend; ++k ) {
12701 value1 += A(i,k) * B(k,j );
12702 value2 += A(i,k) * B(k,j+1UL);
12705 C(i,j ) -= value1 * scalar;
12706 C(i,j+1UL) -= value2 * scalar;
// Single column j of the scalar remainder.
12711 const size_t kbegin( ( IsLower_v<MT5> )
12712 ?( ( IsUpper_v<MT4> )
12713 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12714 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12715 :( IsUpper_v<MT4> ? i : 0UL ) );
12719 for(
size_t k=kbegin; k<K; ++k ) {
12720 value += A(i,k) * B(k,j);
12723 C(i,j) -= value * scalar;
// Large-matrix kernel for the scaled subtraction assignment, selected whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t). It performs no work of
// its own and forwards all arguments unchanged to selectDefaultSubAssignKernel().
12743 template<
typename MT3
12747 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12748 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12750 selectDefaultSubAssignKernel( C, A, B, scalar );
12769 template<
typename MT3
12773 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12774 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12777 lmmm( C, A, B, -scalar, ST2(1) );
12779 ummm( C, A, B, -scalar, ST2(1) );
12781 mmm( C, A, B, -scalar, ST2(1) );
12799 template<
typename MT3
12803 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12804 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12806 selectLargeSubAssignKernel( C, A, B, scalar );
12811 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 12825 template<
typename MT3
12829 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12830 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12832 using ET = ElementType_t<MT3>;
12834 if( IsTriangular_v<MT4> ) {
12835 ResultType_t<MT3> tmp(
serial( B ) );
12836 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
12837 subAssign( C, tmp );
12839 else if( IsTriangular_v<MT5> ) {
12840 ResultType_t<MT3> tmp(
serial( A ) );
12841 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
12842 subAssign( C, tmp );
12845 gemm( C, A, B,
ET(-scalar),
ET(1) );
12867 template<
typename MT
12869 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
12881 schurAssign( ~lhs, tmp );
12912 template<
typename MT
12915 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
12922 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
12923 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
12925 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
12928 else if( left.columns() == 0UL ) {
12943 smpAssign( ~lhs, A * B * rhs.scalar_ );
12962 template<
typename MT
12965 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
12969 using TmpType = If_t< SO, ResultType, OppositeType >;
12981 const ForwardFunctor fwd;
12983 const TmpType tmp( rhs );
13003 template<
typename MT
13006 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13013 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13014 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13016 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
13053 template<
typename MT
13056 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13063 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13064 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13066 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
13100 template<
typename MT
13183 template<
typename MT1
13185 inline decltype(
auto)
13233 template<
typename MT1
13239 inline decltype(
auto)
declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13247 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
13248 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13279 template<
typename MT1
13285 inline decltype(
auto)
declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13293 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
13294 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13325 template<
typename MT1
13331 inline decltype(
auto)
decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13339 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
13340 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13371 template<
typename MT1
13377 inline decltype(
auto)
declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13385 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
13386 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13417 template<
typename MT1
13423 inline decltype(
auto)
decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13431 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
13432 return ReturnType( dm.leftOperand(), dm.rightOperand() );
13448 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13449 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13450 :
public Size<MT1,0UL>
13453 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13454 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13455 :
public Size<MT2,1UL>
13471 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13472 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13473 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:167
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:273
Header file for basic type definitions.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:162
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:478
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:266
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template. The If_t alias declaration provides a convenien...
Definition: If.h:109
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:479
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:271
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:270
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:412
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:422
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:174
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:173
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:172
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:402
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:268
Header file for the IsComplexDouble type trait.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:157
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:157
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
constexpr bool IsSIMDCombinable_v
Auxiliary variable template for the IsSIMDCombinable type trait. The IsSIMDCombinable_v variable templ...
Definition: IsSIMDCombinable.h:137
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDMatMultExpr.h:290
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:434
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
constexpr bool HasSIMDMult_v
Auxiliary variable template for the HasSIMDMult type trait. The HasSIMDMult_v variable template provid...
Definition: HasSIMDMult.h:189
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
Header file for the decllow trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:272
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:392
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Header file for the exception macros of the math module.
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:466
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:446
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:376
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:285
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDMatMultExpr.h:297
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:312
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:171
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:456
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:327
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
constexpr bool HasSIMDAdd_v
Auxiliary variable template for the HasSIMDAdd type trait. The HasSIMDAdd_v variable template provides...
Definition: HasSIMDAdd.h:188
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:269
Header file for BLAS general matrix/matrix multiplication functions (gemm).
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:279
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDMatMultExpr.h:303
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:282
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:276
Header file for the IsExpression type trait class.
Header file for the function trace functionality.