35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_ 139 template<
typename MT1
145 class TDMatDMatMultExpr
146 :
public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
147 ,
private Computation
// Compile-time flag: true when the operand type MT1 is itself a computation
// or requires an intermediate evaluation.
161 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Compile-time flag: the same test applied to the operand type MT2.
166 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure flags derived from the class template flags SF, HF, LF and UF
// (presumably symmetric / Hermitian / lower / upper -- confirm against the class
// template documentation, which is not visible in this excerpt).
// SYM : SF is set and none of HF, LF, UF is set.
170 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set.
171 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW : LF is set, or UF is combined with SF or HF.
172 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP : UF is set, or LF is combined with SF or HF.
173 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
182 template<
typename T1,
typename T2,
typename T3 >
// Compile-time predicate over three matrix types deciding whether a BLAS kernel is usable.
// Visible requirements: all three types are contiguous; T1 offers mutable data access
// while T2 and T3 offer const data access; neither T2 nor T3 is diagonal; all three are
// SIMD enabled; all three element types are BLAS compatible.
// NOTE(review): the first terms of the conjunction (original lines 194-195) and its tail
// (after original line 203) are not visible in this excerpt.
192 template<
typename T1,
typename T2,
typename T3 >
193 static constexpr
bool UseBlasKernel_v =
196 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
197 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
198 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
199 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
200 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
201 IsBLASCompatible_v< ElementType_t<T1> > &&
202 IsBLASCompatible_v< ElementType_t<T2> > &&
203 IsBLASCompatible_v< ElementType_t<T3> > &&
// Compile-time predicate over three matrix types deciding whether the vectorized default
// kernel is usable. Visible requirements: useOptimizedKernels is set; neither T2 nor T3 is
// diagonal; all three types are SIMD enabled; the element types are SIMD combinable.
// NOTE(review): the remainder of the expression (after original line 219) is not visible
// in this excerpt.
214 template<
typename T1,
typename T2,
typename T3 >
215 static constexpr
bool UseVectorizedDefaultKernel_v =
216 ( useOptimizedKernels &&
217 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219 IsSIMDCombinable_v< ElementType_t<T1>
290 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
291 MT1::simdEnabled && MT2::simdEnabled &&
292 HasSIMDAdd_v<ET1,ET2> &&
293 HasSIMDMult_v<ET1,ET2> );
330 if( IsDiagonal_v<MT1> ) {
333 else if( IsDiagonal_v<MT2> ) {
336 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
337 const size_t begin( ( IsUpper_v<MT1> )
338 ?( ( IsLower_v<MT2> )
339 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
340 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
341 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
342 :( ( IsLower_v<MT2> )
343 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
345 const size_t end( ( IsLower_v<MT1> )
346 ?( ( IsUpper_v<MT2> )
347 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
348 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
349 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
350 :( ( IsUpper_v<MT2> )
351 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
352 :(
lhs_.columns() ) ) );
376 if( i >=
lhs_.rows() ) {
379 if( j >=
rhs_.columns() ) {
// Row-count accessor of the expression (const, noexcept).
// NOTE(review): the body of rows() is not visible in this excerpt.
391 inline size_t rows() const noexcept {
// NOTE(review): the embedded original line numbers jump from 391 to 402, so the statement
// below presumably belongs to a separate column-count accessor whose signature is not
// visible here; it should not be read as the return value of rows(). Confirm against the
// full file.
402 return rhs_.columns();
// Reports whether the expression can alias with the object at the given address.
// \param alias  Address to check against.
// \return true if either operand (lhs_ or rhs_) reports being aliased with `alias`.
// Note that this delegates to isAliased() on both operands, not to canAlias().
432 template<
typename T >
433 inline bool canAlias(
const T* alias )
const noexcept {
434 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Reports whether the expression is aliased with the object at the given address.
// \param alias  Address to check against.
// \return true if either operand (lhs_ or rhs_) reports being aliased with `alias`.
444 template<
typename T >
445 inline bool isAliased(
const T* alias )
const noexcept {
446 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
456 return lhs_.isAligned() &&
rhs_.isAligned();
467 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
469 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
470 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
471 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
494 template<
typename MT
503 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
506 else if( rhs.lhs_.columns() == 0UL ) {
521 TDMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatcher for the assignment C = A * B.
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side operand.
// The small-matrix kernel is selected when any of the following holds:
//  - both operands are diagonal;
//  - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns;
//  - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows;
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// NOTE(review): the remaining template parameters and the line between the two calls
// (original line 547, presumably `else`) are not visible in this excerpt.
537 template<
typename MT3
540 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
542 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
543 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
544 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
545 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
546 selectSmallAssignKernel( C, A, B );
548 selectBlasAssignKernel( C, A, B );
// Default (scalar) assignment kernel C = A * B, enabled for a row-major target C when
// neither A nor B is diagonal.
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side operand.
// NOTE(review): several lines of this function (braces, some ternary branches and the
// reset statements of the zeroing loops) are not visible in this excerpt.
567 template<
typename MT3
570 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
571 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
573 const size_t M( A.rows() );
574 const size_t N( B.columns() );
575 const size_t K( A.columns() );
// Row-wise traversal of the target matrix.
579 for(
size_t i=0UL; i<M; ++i )
// Range [kbegin,kend) of the inner dimension for row i, narrowed by the
// upper/lower structure of A.
581 const size_t kbegin( ( IsUpper_v<MT4> )
582 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
584 const size_t kend( ( IsLower_v<MT4> )
585 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty inner range: the whole row is visited
// (loop body not visible here).
589 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
590 for(
size_t j=0UL; j<N; ++j ) {
// Column range [jbegin,jend) for the first inner index k == kbegin, limited by the
// structure of B and by the UPP / LOW flags.
597 const size_t jbegin( ( IsUpper_v<MT5> )
598 ?( ( IsStrictlyUpper_v<MT5> )
599 ?(
UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
600 :(
UPP ?
max(i,kbegin) : kbegin ) )
601 :(
UPP ? i : 0UL ) );
602 const size_t jend( ( IsLower_v<MT5> )
603 ?( ( IsStrictlyLower_v<MT5> )
604 ?(
LOW ?
min(i+1UL,kbegin) : kbegin )
605 :(
LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
606 :(
LOW ? i+1UL : N ) );
// Columns in front of jbegin.
608 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
609 for(
size_t j=0UL; j<jbegin; ++j ) {
613 else if( IsStrictlyUpper_v<MT5> ) {
// The first product initialises C(i,j) by plain assignment.
616 for(
size_t j=jbegin; j<jend; ++j ) {
617 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards.
619 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
620 for(
size_t j=jend; j<N; ++j ) {
624 else if( IsStrictlyLower_v<MT5> ) {
// The remaining inner indices accumulate into C(i,j).
629 for(
size_t k=kbegin+1UL; k<kend; ++k )
631 const size_t jbegin( ( IsUpper_v<MT5> )
632 ?( ( IsStrictlyUpper_v<MT5> )
636 const size_t jend( ( IsLower_v<MT5> )
637 ?( ( IsStrictlyLower_v<MT5> )
638 ?(
LOW ?
min(i+1UL,k-1UL) : k-1UL )
639 :(
LOW ?
min(i+1UL,k) : k ) )
640 :(
LOW ? i+1UL : N ) );
// With any structure flag set, the column range can be inverted; such k are skipped.
642 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
645 for(
size_t j=jbegin; j<jend; ++j ) {
646 C(i,j) += A(i,k) * B(k,j);
// For a lower B the element at column jend receives its first contribution here,
// hence assignment instead of accumulation.
648 if( IsLower_v<MT5> ) {
649 C(i,jend) = A(i,k) * B(k,jend);
// Fills the strictly lower part of C from the upper part (conjugated when HERM).
// NOTE(review): the condition guarding this loop is not visible in this excerpt.
655 for(
size_t i=1UL; i<M; ++i ) {
656 for(
size_t j=0UL; j<i; ++j ) {
657 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default (scalar) assignment kernel C = A * B, enabled for a column-major target C when
// neither A nor B is diagonal. Mirrors the row-major variant with the roles of rows and
// columns (and of A and B in the range computations) exchanged.
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side operand.
// NOTE(review): several lines of this function (braces, some ternary branches and the
// reset statements of the zeroing loops) are not visible in this excerpt.
679 template<
typename MT3
682 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
683 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
685 const size_t M( A.rows() );
686 const size_t N( B.columns() );
687 const size_t K( A.columns() );
// Column-wise traversal of the target matrix.
691 for(
size_t j=0UL; j<N; ++j )
// Range [kbegin,kend) of the inner dimension for column j, narrowed by the
// lower/upper structure of B.
693 const size_t kbegin( ( IsLower_v<MT5> )
694 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
696 const size_t kend( ( IsUpper_v<MT5> )
697 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Strictly triangular B with an empty inner range: the whole column is visited
// (loop body not visible here).
701 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
702 for(
size_t i=0UL; i<M; ++i ) {
// Row range [ibegin,iend) for the first inner index k == kbegin, limited by the
// structure of A and by the LOW / UPP flags.
709 const size_t ibegin( ( IsLower_v<MT4> )
710 ?( ( IsStrictlyLower_v<MT4> )
711 ?(
LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
712 :(
LOW ?
max(j,kbegin) : kbegin ) )
713 :(
LOW ? j : 0UL ) );
714 const size_t iend( ( IsUpper_v<MT4> )
715 ?( ( IsStrictlyUpper_v<MT4> )
716 ?(
UPP ?
min(j+1UL,kbegin) : kbegin )
717 :(
UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
718 :(
UPP ? j+1UL : M ) );
// Rows in front of ibegin.
720 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
721 for(
size_t i=0UL; i<ibegin; ++i ) {
725 else if( IsStrictlyLower_v<MT4> ) {
// The first product initialises C(i,j) by plain assignment.
728 for(
size_t i=ibegin; i<iend; ++i ) {
729 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend onwards.
731 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
732 for(
size_t i=iend; i<M; ++i ) {
736 else if( IsStrictlyUpper_v<MT4> ) {
// The remaining inner indices accumulate into C(i,j).
741 for(
size_t k=kbegin+1UL; k<kend; ++k )
743 const size_t ibegin( ( IsLower_v<MT4> )
744 ?( ( IsStrictlyLower_v<MT4> )
748 const size_t iend( ( IsUpper_v<MT4> )
749 ?( ( IsStrictlyUpper_v<MT4> )
750 ?(
UPP ?
min(j+1UL,k-1UL) : k-1UL )
751 :(
UPP ?
min(j+1UL,k) : k ) )
752 :(
UPP ? j+1UL : M ) );
// With any structure flag set, the row range can be inverted; such k are skipped.
754 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
757 for(
size_t i=ibegin; i<iend; ++i ) {
758 C(i,j) += A(i,k) * B(k,j);
// For an upper A the element at row iend receives its first contribution here,
// hence assignment instead of accumulation.
760 if( IsUpper_v<MT4> ) {
761 C(iend,j) = A(iend,k) * B(k,j);
// Fills the strictly upper part of C from the lower part (conjugated when HERM).
// NOTE(review): the condition guarding this loop is not visible in this excerpt.
767 for(
size_t j=1UL; j<N; ++j ) {
768 for(
size_t i=0UL; i<j; ++i ) {
769 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel C = A * B, enabled for a row-major target C when B is
// diagonal and A is not. Each target element is a single product A(i,j) * B(j,j).
// The traversal is tiled into blocks of BLOCK_SIZE x BLOCK_SIZE elements.
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side (diagonal) operand.
// NOTE(review): braces, the fallback branches of the range computations and the bodies
// of the two zeroing loops are not visible in this excerpt.
791 template<
typename MT3
794 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
795 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
797 constexpr
size_t block( BLOCK_SIZE );
799 const size_t M( A.rows() );
800 const size_t N( B.columns() );
// Outer loops over row blocks [ii,iend) and column blocks [jj,jend).
802 for(
size_t ii=0UL; ii<M; ii+=block ) {
803 const size_t iend(
min( M, ii+block ) );
804 for(
size_t jj=0UL; jj<N; jj+=block ) {
805 const size_t jend(
min( N, jj+block ) );
806 for(
size_t i=ii; i<iend; ++i )
// Column sub-range [jbegin,jpos) of the current block selected by the
// upper/lower structure of A.
808 const size_t jbegin( ( IsUpper_v<MT4> )
809 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
811 const size_t jpos( ( IsLower_v<MT4> )
812 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
// Block columns in front of jbegin (upper A only).
815 if( IsUpper_v<MT4> ) {
816 for(
size_t j=jj; j<jbegin; ++j ) {
820 for(
size_t j=jbegin; j<jpos; ++j ) {
821 C(i,j) = A(i,j) * B(j,j);
// Block columns from jpos onwards (lower A only).
823 if( IsLower_v<MT4> ) {
824 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel C = A * B, enabled for a column-major target C when B is
// diagonal and A is not. Each target element is a single product A(i,j) * B(j,j).
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side (diagonal) operand.
// NOTE(review): braces, the fallback branches of the range computations and the bodies
// of the two zeroing loops are not visible in this excerpt.
849 template<
typename MT3
852 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
853 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
855 const size_t M( A.rows() );
856 const size_t N( B.columns() );
858 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j selected by the lower/upper structure of A.
860 const size_t ibegin( ( IsLower_v<MT4> )
861 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
863 const size_t iend( ( IsUpper_v<MT4> )
864 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows in front of ibegin (lower A only).
868 if( IsLower_v<MT4> ) {
869 for(
size_t i=0UL; i<ibegin; ++i ) {
873 for(
size_t i=ibegin; i<iend; ++i ) {
874 C(i,j) = A(i,j) * B(j,j);
// Rows from iend onwards (upper A only).
876 if( IsUpper_v<MT4> ) {
877 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B, enabled for a row-major target C when A is
// diagonal and B is not. Each target element is a single product A(i,i) * B(i,j).
// \param C  Target matrix.
// \param A  Left-hand side (diagonal) operand.
// \param B  Right-hand side operand.
// NOTE(review): braces, the fallback branches of the range computations and the bodies
// of the two zeroing loops are not visible in this excerpt.
900 template<
typename MT3
903 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
904 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
906 const size_t M( A.rows() );
907 const size_t N( B.columns() );
909 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i selected by the upper/lower structure of B.
911 const size_t jbegin( ( IsUpper_v<MT5> )
912 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
914 const size_t jend( ( IsLower_v<MT5> )
915 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Columns in front of jbegin (upper B only).
919 if( IsUpper_v<MT5> ) {
920 for(
size_t j=0UL; j<jbegin; ++j ) {
924 for(
size_t j=jbegin; j<jend; ++j ) {
925 C(i,j) = A(i,i) * B(i,j);
// Columns from jend onwards (lower B only).
927 if( IsLower_v<MT5> ) {
928 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel C = A * B, enabled for a column-major target C when A is
// diagonal and B is not. Each target element is a single product A(i,i) * B(i,j).
// The traversal is tiled into blocks of BLOCK_SIZE x BLOCK_SIZE elements.
// \param C  Target matrix.
// \param A  Left-hand side (diagonal) operand.
// \param B  Right-hand side operand.
// NOTE(review): braces, the fallback branches of the range computations and the bodies
// of the two zeroing loops are not visible in this excerpt.
951 template<
typename MT3
954 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
955 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
957 constexpr
size_t block( BLOCK_SIZE );
959 const size_t M( A.rows() );
960 const size_t N( B.columns() );
// Outer loops over column blocks [jj,jend) and row blocks [ii,iend).
962 for(
size_t jj=0UL; jj<N; jj+=block ) {
963 const size_t jend(
min( N, jj+block ) );
964 for(
size_t ii=0UL; ii<M; ii+=block ) {
965 const size_t iend(
min( M, ii+block ) );
966 for(
size_t j=jj; j<jend; ++j )
// Row sub-range [ibegin,ipos) of the current block selected by the
// lower/upper structure of B.
968 const size_t ibegin( ( IsLower_v<MT5> )
969 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
971 const size_t ipos( ( IsUpper_v<MT5> )
972 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
// Block rows in front of ibegin (lower B only).
975 if( IsLower_v<MT5> ) {
976 for(
size_t i=ii; i<ibegin; ++i ) {
980 for(
size_t i=ibegin; i<ipos; ++i ) {
981 C(i,j) = A(i,i) * B(i,j);
// Block rows from ipos onwards (upper B only).
983 if( IsUpper_v<MT5> ) {
984 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel C = A * B, enabled when both operands are diagonal.
// Only the diagonal of C is written in the visible loop: C(i,i) = A(i,i) * B(i,i).
// \param C  Target matrix.
// \param A  Left-hand side (diagonal) operand.
// \param B  Right-hand side (diagonal) operand.
// NOTE(review): original lines 1014-1016 are not visible in this excerpt; whatever happens
// to the off-diagonal elements of C before this loop must be checked in the full file.
1009 template<
typename MT3
1012 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1013 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1017 for(
size_t i=0UL; i<A.rows(); ++i ) {
1018 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel used when the vectorized default kernel is NOT
// applicable for the given type combination (DisableIf_t on UseVectorizedDefaultKernel_v).
// Simply forwards to the default assignment kernel.
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side operand.
1038 template<
typename MT3
1041 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1042 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1044 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix assignment kernel C = A * B, enabled for a row-major target C
// when the vectorized default kernel is applicable for the type combination.
// \param C  Target matrix.
// \param A  Left-hand side operand.
// \param B  Right-hand side operand.
//
// The visible code processes the columns of C in SIMD-wide panels of decreasing width
// (8, 5, 4, 3, 2 and 1 SIMD vectors per row, judging from the load/store offsets) and
// finishes with a scalar loop over the remaining columns. Within each panel the inner
// dimension range [kbegin,kend) is narrowed according to the triangular structure of
// A and B.
// NOTE(review): the panel loop headers over j, the row-range (iend) computations, braces,
// several loads/stores and the reset statements are not visible in this excerpt; the
// section comments below describe only what the visible lines show.
1064 template<
typename MT3
1067 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1068 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Remainder handling is needed unless both C and B are padded.
1070 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
1072 const size_t M( A.rows() );
1073 const size_t N( B.columns() );
1074 const size_t K( A.columns() );
// Last column index reachable by full SIMD vectors: N with the low bits masked off
// (a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two) when remainder handling
// is required, N otherwise.
1078 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// ---- Panel of 8 SIMD vectors per row, one row at a time -----------------------------
// NOTE(review): only the condition is visible; presumably this whole 8-vector panel is
// restricted to integral element types -- confirm in the full file.
1083 if( IsIntegral_v<ElementType> )
1086 for(
size_t i=0UL; i<M; ++i )
1088 const size_t kbegin( ( IsUpper_v<MT4> )
1089 ?( ( IsLower_v<MT5> )
1090 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1091 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1092 :( IsLower_v<MT5> ? j : 0UL ) );
1093 const size_t kend( ( IsLower_v<MT4> )
1094 ?( ( IsUpper_v<MT5> )
1095 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
1096 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1097 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
1099 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1101 for(
size_t k=kbegin; k<kend; ++k ) {
1103 xmm1 += a1 * B.load(k,j );
1104 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1105 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1106 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1107 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1108 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
1109 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
1110 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
1113 C.store( i, j , xmm1 );
1115 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1116 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1117 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
1118 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
1119 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
1120 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// ---- Panel of 5 SIMD vectors per row: two rows at a time, then a single leftover row -
1129 for( ; (i+2UL) <= M; i+=2UL )
1131 const size_t kbegin( ( IsUpper_v<MT4> )
1132 ?( ( IsLower_v<MT5> )
1133 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1134 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1135 :( IsLower_v<MT5> ? j : 0UL ) );
1136 const size_t kend( ( IsLower_v<MT4> )
1137 ?( ( IsUpper_v<MT5> )
1138 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
1139 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1140 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
1142 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1144 for(
size_t k=kbegin; k<kend; ++k ) {
1164 C.store( i , j , xmm1 );
1166 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1167 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1168 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
1169 C.store( i+1UL, j , xmm6 );
1170 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
1171 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
1172 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
1173 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
1178 const size_t kbegin( ( IsUpper_v<MT4> )
1179 ?( ( IsLower_v<MT5> )
1180 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1181 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1182 :( IsLower_v<MT5> ? j : 0UL ) );
1183 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
1185 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1187 for(
size_t k=kbegin; k<kend; ++k ) {
1189 xmm1 += a1 * B.load(k,j );
1190 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1191 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1192 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1193 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1196 C.store( i, j , xmm1 );
1198 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1199 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1200 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// ---- Panel of 4 SIMD vectors per row ------------------------------------------------
// Rows outside the computed row range are filled per element: either mirrored from
// C(jj,i) (conjugated when HERM) or handled by a loop whose body is not visible here.
1212 for(
size_t jj=j; jj<jjend; ++jj ) {
1213 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1220 for(
size_t jj=j; jj<jjend; ++jj ) {
1226 for( ; (i+2UL) <= iend; i+=2UL )
1228 const size_t kbegin( ( IsUpper_v<MT4> )
1229 ?( ( IsLower_v<MT5> )
1230 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1231 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1232 :( IsLower_v<MT5> ? j : 0UL ) );
1233 const size_t kend( ( IsLower_v<MT4> )
1234 ?( ( IsUpper_v<MT5> )
1235 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
1236 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1237 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
1239 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1241 for(
size_t k=kbegin; k<kend; ++k ) {
1258 C.store( i , j , xmm1 );
1260 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1261 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1262 C.store( i+1UL, j , xmm5 );
1263 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
1264 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
1265 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
1270 const size_t kbegin( ( IsUpper_v<MT4> )
1271 ?( ( IsLower_v<MT5> )
1272 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1273 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1274 :( IsLower_v<MT5> ? j : 0UL ) );
1275 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
1279 for(
size_t k=kbegin; k<kend; ++k ) {
1281 xmm1 += a1 * B.load(k,j );
1282 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1283 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1284 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1287 C.store( i, j , xmm1 );
1289 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1290 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1298 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Panel of 3 SIMD vectors per row ------------------------------------------------
1313 for(
size_t jj=j; jj<jjend; ++jj ) {
1314 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1321 for(
size_t jj=j; jj<jjend; ++jj ) {
1327 for( ; (i+2UL) <= iend; i+=2UL )
1329 const size_t kbegin( ( IsUpper_v<MT4> )
1330 ?( ( IsLower_v<MT5> )
1331 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1332 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1333 :( IsLower_v<MT5> ? j : 0UL ) );
1334 const size_t kend( ( IsLower_v<MT4> )
1335 ?( ( IsUpper_v<MT5> )
1336 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
1337 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1338 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
1340 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1342 for(
size_t k=kbegin; k<kend; ++k ) {
1356 C.store( i , j , xmm1 );
1358 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1359 C.store( i+1UL, j , xmm4 );
1360 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
1361 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
1366 const size_t kbegin( ( IsUpper_v<MT4> )
1367 ?( ( IsLower_v<MT5> )
1368 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1369 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1370 :( IsLower_v<MT5> ? j : 0UL ) );
1371 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
1375 for(
size_t k=kbegin; k<kend; ++k ) {
1377 xmm1 += a1 * B.load(k,j );
1378 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1379 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1382 C.store( i, j , xmm1 );
1384 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1392 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Panel of 2 SIMD vectors per row: 4, 3, 2 rows at a time, then a single row -----
1407 for(
size_t jj=j; jj<jjend; ++jj ) {
1408 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1415 for(
size_t jj=j; jj<jjend; ++jj ) {
1421 for( ; (i+4UL) <= iend; i+=4UL )
1423 const size_t kbegin( ( IsUpper_v<MT4> )
1424 ?( ( IsLower_v<MT5> )
1425 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1426 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1427 :( IsLower_v<MT5> ? j : 0UL ) );
1428 const size_t kend( ( IsLower_v<MT4> )
1429 ?( ( IsUpper_v<MT5> )
1430 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
1431 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1432 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1434 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1436 for(
size_t k=kbegin; k<kend; ++k ) {
1453 C.store( i , j , xmm1 );
1455 C.store( i+1UL, j , xmm3 );
1456 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1457 C.store( i+2UL, j , xmm5 );
1458 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1459 C.store( i+3UL, j , xmm7 );
1460 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
1463 for( ; (i+3UL) <= iend; i+=3UL )
1465 const size_t kbegin( ( IsUpper_v<MT4> )
1466 ?( ( IsLower_v<MT5> )
1467 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1468 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1469 :( IsLower_v<MT5> ? j : 0UL ) );
1470 const size_t kend( ( IsLower_v<MT4> )
1471 ?( ( IsUpper_v<MT5> )
1472 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
1473 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1474 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1476 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1478 for(
size_t k=kbegin; k<kend; ++k ) {
1492 C.store( i , j , xmm1 );
1494 C.store( i+1UL, j , xmm3 );
1495 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1496 C.store( i+2UL, j , xmm5 );
1497 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// From here on the inner loop over k is unrolled by two with separate accumulators for
// even and odd steps; the partial sums are added when storing.
1500 for( ; (i+2UL) <= iend; i+=2UL )
1502 const size_t kbegin( ( IsUpper_v<MT4> )
1503 ?( ( IsLower_v<MT5> )
1504 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1505 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1506 :( IsLower_v<MT5> ? j : 0UL ) );
1507 const size_t kend( ( IsLower_v<MT4> )
1508 ?( ( IsUpper_v<MT5> )
1509 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
1510 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1511 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1513 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1516 for( ; (k+2UL) <= kend; k+=2UL ) {
1521 const SIMDType b1( B.load(k ,j ) );
1523 const SIMDType b3( B.load(k+1UL,j ) );
1535 for( ; k<kend; ++k ) {
1546 C.store( i , j , xmm1+xmm5 );
1547 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
1548 C.store( i+1UL, j , xmm3+xmm7 );
1549 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
1554 const size_t kbegin( ( IsUpper_v<MT4> )
1555 ?( ( IsLower_v<MT5> )
1556 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1557 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1558 :( IsLower_v<MT5> ? j : 0UL ) );
1559 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
1564 for( ; (k+2UL) <= kend; k+=2UL ) {
1567 xmm1 += a1 * B.load(k ,j );
1568 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
1569 xmm3 += a2 * B.load(k+1UL,j );
1570 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
1573 for( ; k<kend; ++k ) {
1575 xmm1 += a1 * B.load(k,j );
1579 C.store( i, j , xmm1+xmm3 );
1580 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
1588 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Panel of 1 SIMD vector per row: 4, 3, 2 rows at a time, then a single row ------
1603 for(
size_t jj=j; jj<jjend; ++jj ) {
1604 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1611 for(
size_t jj=j; jj<jjend; ++jj ) {
1617 for( ; (i+4UL) <= iend; i+=4UL )
1619 const size_t kbegin( ( IsUpper_v<MT4> )
1620 ?( ( IsLower_v<MT5> )
1621 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1622 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1623 :( IsLower_v<MT5> ? j : 0UL ) );
1624 const size_t kend( ( IsLower_v<MT4> )
1625 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1628 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1631 for( ; (k+2UL) <= kend; k+=2UL ) {
1633 const SIMDType b2( B.load(k+1UL,j) );
1634 xmm1 +=
set( A(i ,k ) ) * b1;
1635 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1636 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1637 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1638 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1639 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1640 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1641 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1644 for( ; k<kend; ++k ) {
1646 xmm1 +=
set( A(i ,k) ) * b1;
1647 xmm2 +=
set( A(i+1UL,k) ) * b1;
1648 xmm3 +=
set( A(i+2UL,k) ) * b1;
1649 xmm4 +=
set( A(i+3UL,k) ) * b1;
1652 C.store( i , j, xmm1+xmm5 );
1653 C.store( i+1UL, j, xmm2+xmm6 );
1654 C.store( i+2UL, j, xmm3+xmm7 );
1655 C.store( i+3UL, j, xmm4+xmm8 );
1658 for( ; (i+3UL) <= iend; i+=3UL )
1660 const size_t kbegin( ( IsUpper_v<MT4> )
1661 ?( ( IsLower_v<MT5> )
1662 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1663 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1664 :( IsLower_v<MT5> ? j : 0UL ) );
1665 const size_t kend( ( IsLower_v<MT4> )
1666 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1669 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1672 for( ; (k+2UL) <= kend; k+=2UL ) {
1674 const SIMDType b2( B.load(k+1UL,j) );
1675 xmm1 +=
set( A(i ,k ) ) * b1;
1676 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1677 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1678 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1679 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1680 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1683 for( ; k<kend; ++k ) {
1685 xmm1 +=
set( A(i ,k) ) * b1;
1686 xmm2 +=
set( A(i+1UL,k) ) * b1;
1687 xmm3 +=
set( A(i+2UL,k) ) * b1;
1690 C.store( i , j, xmm1+xmm4 );
1691 C.store( i+1UL, j, xmm2+xmm5 );
1692 C.store( i+2UL, j, xmm3+xmm6 );
1695 for( ; (i+2UL) <= iend; i+=2UL )
1697 const size_t kbegin( ( IsUpper_v<MT4> )
1698 ?( ( IsLower_v<MT5> )
1699 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1700 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1701 :( IsLower_v<MT5> ? j : 0UL ) );
1702 const size_t kend( ( IsLower_v<MT4> )
1703 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1709 for( ; (k+2UL) <= kend; k+=2UL ) {
1711 const SIMDType b2( B.load(k+1UL,j) );
1712 xmm1 +=
set( A(i ,k ) ) * b1;
1713 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1714 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1715 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1718 for( ; k<kend; ++k ) {
1720 xmm1 +=
set( A(i ,k) ) * b1;
1721 xmm2 +=
set( A(i+1UL,k) ) * b1;
1724 C.store( i , j, xmm1+xmm3 );
1725 C.store( i+1UL, j, xmm2+xmm4 );
// Single leftover row of the 1-vector panel: the inner loop runs up to K here, not kend.
1730 const size_t kbegin( ( IsUpper_v<MT4> )
1731 ?( ( IsLower_v<MT5> )
1732 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1733 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1734 :( IsLower_v<MT5> ? j : 0UL ) );
1739 for( ; (k+2UL) <= K; k+=2UL ) {
1740 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1741 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1745 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1748 C.store( i, j, xmm1+xmm2 );
1756 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Scalar remainder: columns not covered by full SIMD vectors ---------------------
// Only executed when remainder handling is required; two rows at a time, then one row.
1763 for( ; remainder && j<N; ++j )
1769 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1778 for( ; (i+2UL) <= M; i+=2UL )
1780 const size_t kbegin( ( IsUpper_v<MT4> )
1781 ?( ( IsLower_v<MT5> )
1782 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1783 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1784 :( IsLower_v<MT5> ? j : 0UL ) );
1785 const size_t kend( ( IsLower_v<MT4> )
1786 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1792 for(
size_t k=kbegin; k<kend; ++k ) {
1793 value1 += A(i ,k) * B(k,j);
1794 value2 += A(i+1UL,k) * B(k,j);
1798 C(i+1UL,j) = value2;
1803 const size_t kbegin( ( IsUpper_v<MT4> )
1804 ?( ( IsLower_v<MT5> )
1805 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1806 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1807 :( IsLower_v<MT5> ? j : 0UL ) );
1811 for(
size_t k=kbegin; k<K; ++k ) {
1812 value += A(i,k) * B(k,j);
1837 template<
typename MT3
1840 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1841 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1843 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
1845 const size_t M( A.rows() );
1846 const size_t N( B.columns() );
1847 const size_t K( A.columns() );
1851 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
1856 if( IsIntegral_v<ElementType> )
1859 for(
size_t j=0UL; j<N; ++j )
1861 const size_t kbegin( ( IsLower_v<MT5> )
1862 ?( ( IsUpper_v<MT4> )
1863 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1864 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1865 :( IsUpper_v<MT4> ? i : 0UL ) );
1866 const size_t kend( ( IsUpper_v<MT5> )
1867 ?( ( IsLower_v<MT4> )
1868 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1869 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
1870 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
1872 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1874 for(
size_t k=kbegin; k<kend; ++k ) {
1876 xmm1 += A.load(i ,k) * b1;
1877 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1878 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1879 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1880 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1881 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
1882 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
1883 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
1886 C.store( i , j, xmm1 );
1888 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1889 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1890 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
1891 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
1892 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
1893 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
1902 for( ; (j+2UL) <= N; j+=2UL )
1904 const size_t kbegin( ( IsLower_v<MT5> )
1905 ?( ( IsUpper_v<MT4> )
1906 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1907 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1908 :( IsUpper_v<MT4> ? i : 0UL ) );
1909 const size_t kend( ( IsUpper_v<MT5> )
1910 ?( ( IsLower_v<MT4> )
1911 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1912 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1913 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
1915 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
1917 for(
size_t k=kbegin; k<kend; ++k ) {
1937 C.store( i , j , xmm1 );
1939 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1940 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1941 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
1942 C.store( i , j+1UL, xmm6 );
1943 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
1944 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
1945 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
1946 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
1951 const size_t kbegin( ( IsLower_v<MT5> )
1952 ?( ( IsUpper_v<MT4> )
1953 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1954 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1955 :( IsUpper_v<MT4> ? i : 0UL ) );
1956 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
1958 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1960 for(
size_t k=kbegin; k<kend; ++k ) {
1962 xmm1 += A.load(i ,k) * b1;
1963 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1964 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1965 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1966 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1969 C.store( i , j, xmm1 );
1971 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1972 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1973 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
1985 for(
size_t ii=i; ii<iiend; ++ii ) {
1986 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1993 for(
size_t ii=i; ii<iiend; ++ii ) {
1999 for( ; (j+2UL) <= jend; j+=2UL )
2001 const size_t kbegin( ( IsLower_v<MT5> )
2002 ?( ( IsUpper_v<MT4> )
2003 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2004 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2005 :( IsUpper_v<MT4> ? i : 0UL ) );
2006 const size_t kend( ( IsUpper_v<MT5> )
2007 ?( ( IsLower_v<MT4> )
2008 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2009 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2010 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
2012 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2014 for(
size_t k=kbegin; k<kend; ++k ) {
2031 C.store( i , j , xmm1 );
2033 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2034 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2035 C.store( i , j+1UL, xmm5 );
2036 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
2037 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
2038 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
2043 const size_t kbegin( ( IsLower_v<MT5> )
2044 ?( ( IsUpper_v<MT4> )
2045 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2046 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2047 :( IsUpper_v<MT4> ? i : 0UL ) );
2048 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
2052 for(
size_t k=kbegin; k<kend; ++k ) {
2054 xmm1 += A.load(i ,k) * b1;
2055 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2056 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2057 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2060 C.store( i , j, xmm1 );
2062 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2063 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2071 for(
size_t ii=i; ii<iiend; ++ii ) {
2086 for(
size_t ii=i; ii<iiend; ++ii ) {
2087 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2094 for(
size_t ii=i; ii<iiend; ++ii ) {
2100 for( ; (j+2UL) <= jend; j+=2UL )
2102 const size_t kbegin( ( IsLower_v<MT5> )
2103 ?( ( IsUpper_v<MT4> )
2104 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2105 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2106 :( IsUpper_v<MT4> ? i : 0UL ) );
2107 const size_t kend( ( IsUpper_v<MT5> )
2108 ?( ( IsLower_v<MT4> )
2109 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2110 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2111 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
2113 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2115 for(
size_t k=kbegin; k<kend; ++k ) {
2129 C.store( i , j , xmm1 );
2131 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2132 C.store( i , j+1UL, xmm4 );
2133 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
2134 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
2139 const size_t kbegin( ( IsLower_v<MT5> )
2140 ?( ( IsUpper_v<MT4> )
2141 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2142 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2143 :( IsUpper_v<MT4> ? i : 0UL ) );
2144 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
2148 for(
size_t k=kbegin; k<kend; ++k ) {
2150 xmm1 += A.load(i ,k) * b1;
2151 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2152 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2155 C.store( i , j, xmm1 );
2157 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2165 for(
size_t ii=i; ii<iiend; ++ii ) {
2180 for(
size_t ii=i; ii<iiend; ++ii ) {
2181 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2188 for(
size_t ii=i; ii<iiend; ++ii ) {
2194 for( ; (j+4UL) <= jend; j+=4UL )
2196 const size_t kbegin( ( IsLower_v<MT5> )
2197 ?( ( IsUpper_v<MT4> )
2198 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2199 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2200 :( IsUpper_v<MT4> ? i : 0UL ) );
2201 const size_t kend( ( IsUpper_v<MT5> )
2202 ?( ( IsLower_v<MT4> )
2203 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2204 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2205 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2207 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2209 for(
size_t k=kbegin; k<kend; ++k ) {
2226 C.store( i , j , xmm1 );
2228 C.store( i , j+1UL, xmm3 );
2229 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2230 C.store( i , j+2UL, xmm5 );
2231 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2232 C.store( i , j+3UL, xmm7 );
2233 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
2236 for( ; (j+3UL) <= jend; j+=3UL )
2238 const size_t kbegin( ( IsLower_v<MT5> )
2239 ?( ( IsUpper_v<MT4> )
2240 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2241 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2242 :( IsUpper_v<MT4> ? i : 0UL ) );
2243 const size_t kend( ( IsUpper_v<MT5> )
2244 ?( ( IsLower_v<MT4> )
2245 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2246 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2247 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2249 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2251 for(
size_t k=kbegin; k<kend; ++k ) {
2265 C.store( i , j , xmm1 );
2267 C.store( i , j+1UL, xmm3 );
2268 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2269 C.store( i , j+2UL, xmm5 );
2270 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2273 for( ; (j+2UL) <= jend; j+=2UL )
2275 const size_t kbegin( ( IsLower_v<MT5> )
2276 ?( ( IsUpper_v<MT4> )
2277 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2278 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2279 :( IsUpper_v<MT4> ? i : 0UL ) );
2280 const size_t kend( ( IsUpper_v<MT5> )
2281 ?( ( IsLower_v<MT4> )
2282 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2283 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2284 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2286 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2289 for( ; (k+2UL) <= kend; k+=2UL ) {
2290 const SIMDType a1( A.load(i ,k ) );
2292 const SIMDType a3( A.load(i ,k+1UL) );
2308 for( ; k<kend; ++k ) {
2319 C.store( i , j , xmm1+xmm5 );
2320 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
2321 C.store( i , j+1UL, xmm3+xmm7 );
2322 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
2327 const size_t kbegin( ( IsLower_v<MT5> )
2328 ?( ( IsUpper_v<MT4> )
2329 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2330 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2331 :( IsUpper_v<MT4> ? i : 0UL ) );
2332 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
2337 for( ; (k+2UL) <= kend; k+=2UL ) {
2340 xmm1 += A.load(i ,k ) * b1;
2341 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
2342 xmm3 += A.load(i ,k+1UL) * b2;
2343 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
2346 for( ; k<kend; ++k ) {
2348 xmm1 += A.load(i ,k) * b1;
2352 C.store( i , j, xmm1+xmm3 );
2353 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
2361 for(
size_t ii=i; ii<iiend; ++ii ) {
2376 for(
size_t ii=i; ii<iiend; ++ii ) {
2377 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2384 for(
size_t ii=i; ii<iiend; ++ii ) {
2390 for( ; (j+4UL) <= jend; j+=4UL )
2392 const size_t kbegin( ( IsLower_v<MT5> )
2393 ?( ( IsUpper_v<MT4> )
2394 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2395 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2396 :( IsUpper_v<MT4> ? i : 0UL ) );
2397 const size_t kend( ( IsUpper_v<MT5> )
2398 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2401 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2404 for( ; (k+2UL) <= kend; k+=2UL ) {
2406 const SIMDType a2( A.load(i,k+1UL) );
2407 xmm1 += a1 *
set( B(k ,j ) );
2408 xmm2 += a1 *
set( B(k ,j+1UL) );
2409 xmm3 += a1 *
set( B(k ,j+2UL) );
2410 xmm4 += a1 *
set( B(k ,j+3UL) );
2411 xmm5 += a2 *
set( B(k+1UL,j ) );
2412 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2413 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2414 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2417 for( ; k<kend; ++k ) {
2419 xmm1 += a1 *
set( B(k,j ) );
2420 xmm2 += a1 *
set( B(k,j+1UL) );
2421 xmm3 += a1 *
set( B(k,j+2UL) );
2422 xmm4 += a1 *
set( B(k,j+3UL) );
2425 C.store( i, j , xmm1+xmm5 );
2426 C.store( i, j+1UL, xmm2+xmm6 );
2427 C.store( i, j+2UL, xmm3+xmm7 );
2428 C.store( i, j+3UL, xmm4+xmm8 );
2431 for( ; (j+3UL) <= jend; j+=3UL )
2433 const size_t kbegin( ( IsLower_v<MT5> )
2434 ?( ( IsUpper_v<MT4> )
2435 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2436 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2437 :( IsUpper_v<MT4> ? i : 0UL ) );
2438 const size_t kend( ( IsUpper_v<MT5> )
2439 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2442 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
2445 for( ; (k+2UL) <= kend; k+=2UL ) {
2447 const SIMDType a2( A.load(i,k+1UL) );
2448 xmm1 += a1 *
set( B(k ,j ) );
2449 xmm2 += a1 *
set( B(k ,j+1UL) );
2450 xmm3 += a1 *
set( B(k ,j+2UL) );
2451 xmm4 += a2 *
set( B(k+1UL,j ) );
2452 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2453 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2456 for( ; k<kend; ++k ) {
2458 xmm1 += a1 *
set( B(k,j ) );
2459 xmm2 += a1 *
set( B(k,j+1UL) );
2460 xmm3 += a1 *
set( B(k,j+2UL) );
2463 C.store( i, j , xmm1+xmm4 );
2464 C.store( i, j+1UL, xmm2+xmm5 );
2465 C.store( i, j+2UL, xmm3+xmm6 );
2468 for( ; (j+2UL) <= jend; j+=2UL )
2470 const size_t kbegin( ( IsLower_v<MT5> )
2471 ?( ( IsUpper_v<MT4> )
2472 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2473 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2474 :( IsUpper_v<MT4> ? i : 0UL ) );
2475 const size_t kend( ( IsUpper_v<MT5> )
2476 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2482 for( ; (k+2UL) <= kend; k+=2UL ) {
2484 const SIMDType a2( A.load(i,k+1UL) );
2485 xmm1 += a1 *
set( B(k ,j ) );
2486 xmm2 += a1 *
set( B(k ,j+1UL) );
2487 xmm3 += a2 *
set( B(k+1UL,j ) );
2488 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2491 for( ; k<kend; ++k ) {
2493 xmm1 += a1 *
set( B(k,j ) );
2494 xmm2 += a1 *
set( B(k,j+1UL) );
2497 C.store( i, j , xmm1+xmm3 );
2498 C.store( i, j+1UL, xmm2+xmm4 );
2503 const size_t kbegin( ( IsLower_v<MT5> )
2504 ?( ( IsUpper_v<MT4> )
2505 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2506 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2507 :( IsUpper_v<MT4> ? i : 0UL ) );
2512 for( ; (k+2UL) <= K; k+=2UL ) {
2513 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2514 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2518 xmm1 += A.load(i,k) *
set( B(k,j) );
2521 C.store( i, j, xmm1+xmm2 );
2529 for(
size_t ii=i; ii<iiend; ++ii ) {
2536 for( ; remainder && i<M; ++i )
2542 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
2551 for( ; (j+2UL) <= N; j+=2UL )
2553 const size_t kbegin( ( IsLower_v<MT5> )
2554 ?( ( IsUpper_v<MT4> )
2555 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2556 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2557 :( IsUpper_v<MT4> ? i : 0UL ) );
2558 const size_t kend( ( IsUpper_v<MT5> )
2559 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2565 for(
size_t k=kbegin; k<kend; ++k ) {
2566 value1 += A(i,k) * B(k,j );
2567 value2 += A(i,k) * B(k,j+1UL);
2571 C(i,j+1UL) = value2;
2576 const size_t kbegin( ( IsLower_v<MT5> )
2577 ?( ( IsUpper_v<MT4> )
2578 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2579 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2580 :( IsUpper_v<MT4> ? i : 0UL ) );
2584 for(
size_t k=kbegin; k<K; ++k ) {
2585 value += A(i,k) * B(k,j);
// Large-matrix assignment kernel, non-vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t); in that case
// the call is forwarded unchanged to selectDefaultAssignKernel( C, A, B ).
2609 template<
typename MT3
2612 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2613 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2615 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel, vectorized overload.
// Counterpart of the DisableIf_t overload: enabled only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not visible in this excerpt.
2635 template<
typename MT3
2638 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2639 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback overload.
// Selected when UseBlasKernel_v<MT3,MT4,MT5> is false (DisableIf_t); the work is
// then delegated to selectLargeAssignKernel( C, A, B ).
2669 template<
typename MT3
2672 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2673 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2675 selectLargeAssignKernel( C, A, B );
// BLAS assignment kernel, BLAS-backed overload.
// Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero, and only enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
2681 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2695 template<
typename MT3
2698 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2699 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2701 using ET = ElementType_t<MT3>;
// Left operand triangular: trmm with A on the left side; the lower/upper flag
// follows IsLower_v<MT4>.
// NOTE(review): the statement preceding trmm is not visible here.
2703 if( IsTriangular_v<MT4> ) {
2705 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Right operand triangular: trmm with B on the right side; the lower/upper flag
// follows IsLower_v<MT5>.
2707 else if( IsTriangular_v<MT5> ) {
2709 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// Remaining case: general product via gemm, passing ET(1) and ET(0)
// (presumably the BLAS alpha/beta factors -- confirm against the gemm wrapper).
2712 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix.
// The expression is first evaluated serially into a temporary of type TmpType
// (ResultType when SO is true, OppositeType otherwise); the temporary is then
// passed through the ForwardFunctor and assigned to the target.
2732 template<
typename MT
2734 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
2738 using TmpType = If_t< SO, ResultType, OppositeType >;
2750 const ForwardFunctor fwd;
// serial( rhs ) is what gets materialized into the temporary.
2752 const TmpType tmp(
serial( rhs ) );
2753 assign( ~lhs, fwd( tmp ) );
// Addition assignment of the multiplication expression to a dense matrix.
// Checks for an empty problem (target has zero rows or columns, or the left
// operand has zero columns) and otherwise dispatches to selectAddAssignKernel.
// NOTE(review): the body of the empty-size check and the definitions of A and B
// are not visible in this excerpt.
2771 template<
typename MT
2773 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
2780 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2794 TDMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for the addition assignment C += A * B.
// The small kernel is chosen when any of the following holds:
//   - both operands are diagonal,
//   - not in debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - not in debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// The BLAS kernel call follows; presumably it is the else branch (the `else`
// line itself is not visible in this excerpt).
2810 template<
typename MT3
2813 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2815 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
2816 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
2817 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
2818 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
2819 selectSmallAddAssignKernel( C, A, B );
2821 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition assignment kernel C += A * B for a row-major target
// where neither operand is diagonal.
// Loop order is i (rows of C), k (inner dimension), j (columns of C).
2840 template<
typename MT3
2843 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2844 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2846 const size_t M( A.rows() );
2847 const size_t N( B.columns() );
2848 const size_t K( A.columns() );
2852 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed by the triangular structure of A: an upper A
// starts at i (i+1 if strictly upper), a lower A ends at i+1 (i if strictly
// lower). NOTE(review): the non-triangular alternatives are not visible here.
2854 const size_t kbegin( ( IsUpper_v<MT4> )
2855 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2857 const size_t kend( ( IsLower_v<MT4> )
2858 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2862 for(
size_t k=kbegin; k<kend; ++k )
// j range for this (i,k), narrowed by the triangular structure of B and,
// when UPP / LOW is set, additionally clipped against row index i.
2864 const size_t jbegin( ( IsUpper_v<MT5> )
2865 ?( ( IsStrictlyUpper_v<MT5> )
2866 ?(
UPP ?
max(i,k+1UL) : k+1UL )
2867 :(
UPP ?
max(i,k) : k ) )
2868 :(
UPP ? i : 0UL ) );
2869 const size_t jend( ( IsLower_v<MT5> )
2870 ?( ( IsStrictlyLower_v<MT5> )
2871 ?(
LOW ?
min(i+1UL,k) : k )
2872 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
2873 :(
LOW ? i+1UL : N ) );
// With LOW or UPP the clipped range may be empty; skip it so that the
// unsigned subtraction below cannot wrap around.
2875 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos = jbegin + (jnum rounded down to an even count): end of the part
// processed two columns at a time.
2878 const size_t jnum( jend - jbegin );
2879 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2881 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2882 C(i,j ) += A(i,k) * B(k,j );
2883 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update for the single column at jpos.
// NOTE(review): the guarding condition is not visible in this excerpt.
2886 C(i,jpos) += A(i,k) * B(k,jpos);
// Default (scalar) addition assignment kernel C += A * B for a column-major
// target where neither operand is diagonal.
// Loop order is j (columns of C), k (inner dimension), i (rows of C).
2908 template<
typename MT3
2911 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2912 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2914 const size_t M( A.rows() );
2915 const size_t N( B.columns() );
2916 const size_t K( A.columns() );
2920 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed by the triangular structure of B: a lower B
// starts at j (j+1 if strictly lower), an upper B ends at j+1 (j if strictly
// upper). NOTE(review): the non-triangular alternatives are not visible here.
2922 const size_t kbegin( ( IsLower_v<MT5> )
2923 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2925 const size_t kend( ( IsUpper_v<MT5> )
2926 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2930 for(
size_t k=kbegin; k<kend; ++k )
// i range for this (k,j), narrowed by the triangular structure of A and,
// when LOW / UPP is set, additionally clipped against column index j.
2932 const size_t ibegin( ( IsLower_v<MT4> )
2933 ?( ( IsStrictlyLower_v<MT4> )
2934 ?(
LOW ?
max(j,k+1UL) : k+1UL )
2935 :(
LOW ?
max(j,k) : k ) )
2936 :(
LOW ? j : 0UL ) );
2937 const size_t iend( ( IsUpper_v<MT4> )
2938 ?( ( IsStrictlyUpper_v<MT4> )
2939 ?(
UPP ?
min(j+1UL,k) : k )
2940 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
2941 :(
UPP ? j+1UL : M ) );
// With LOW or UPP the clipped range may be empty; skip it so that the
// unsigned subtraction below cannot wrap around.
2943 if( (
LOW ||
UPP ) && ibegin >= iend )
continue;
// ipos = ibegin + (inum rounded down to an even count): end of the part
// processed two rows at a time.
2946 const size_t inum( iend - ibegin );
2947 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2949 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2950 C(i ,j) += A(i ,k) * B(k,j);
2951 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Update for the single row at ipos.
// NOTE(review): the guarding condition is not visible in this excerpt.
2954 C(ipos,j) += A(ipos,k) * B(k,j);
// Default addition assignment kernel for a row-major target where only the
// right operand B is diagonal. Since only B(j,j) is read, each element reduces
// to C(i,j) += A(i,j) * B(j,j). The iteration space is tiled in blocks of
// BLOCK_SIZE x BLOCK_SIZE.
2976 template<
typename MT3
2979 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2980 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2982 constexpr
size_t block( BLOCK_SIZE );
2984 const size_t M( A.rows() );
2985 const size_t N( B.columns() );
// Outer tiling over rows (ii) and columns (jj); iend/jend clamp the last tile.
2987 for(
size_t ii=0UL; ii<M; ii+=block ) {
2988 const size_t iend(
min( M, ii+block ) );
2989 for(
size_t jj=0UL; jj<N; jj+=block ) {
2990 const size_t jend(
min( N, jj+block ) );
2991 for(
size_t i=ii; i<iend; ++i )
// Column range inside the tile, further restricted by the triangular
// structure of A (upper: start at i or i+1; lower: stop at i+1 or i).
// NOTE(review): the non-triangular alternatives are not visible here.
2993 const size_t jbegin( ( IsUpper_v<MT4> )
2994 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
2996 const size_t jpos( ( IsLower_v<MT4> )
2997 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
3000 for(
size_t j=jbegin; j<jpos; ++j ) {
3001 C(i,j) += A(i,j) * B(j,j);
// Default addition assignment kernel for a column-major target where only the
// right operand B is diagonal: C(i,j) += A(i,j) * B(j,j), processed column by
// column with the row loop unrolled by two.
3024 template<
typename MT3
3027 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3028 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3030 const size_t M( A.rows() );
3031 const size_t N( B.columns() );
3033 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, restricted by the triangular structure of A.
// NOTE(review): the non-triangular alternatives are not visible here.
3035 const size_t ibegin( ( IsLower_v<MT4> )
3036 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3038 const size_t iend( ( IsUpper_v<MT4> )
3039 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part handled two rows per iteration
// (inum rounded down to an even count).
3043 const size_t inum( iend - ibegin );
3044 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3046 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3047 C(i ,j) += A(i ,j) * B(j,j);
3048 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Update for the single row at ipos.
// NOTE(review): the guarding condition is not visible in this excerpt.
3051 C(ipos,j) += A(ipos,j) * B(j,j);
// Default addition assignment kernel for a row-major target where only the
// left operand A is diagonal: C(i,j) += A(i,i) * B(i,j), processed row by row
// with the column loop unrolled by two.
3072 template<
typename MT3
3075 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3076 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3078 const size_t M( A.rows() );
3079 const size_t N( B.columns() );
3081 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, restricted by the triangular structure of B.
// NOTE(review): the non-triangular alternatives are not visible here.
3083 const size_t jbegin( ( IsUpper_v<MT5> )
3084 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3086 const size_t jend( ( IsLower_v<MT5> )
3087 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part handled two columns per iteration
// (jnum rounded down to an even count).
3091 const size_t jnum( jend - jbegin );
3092 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3094 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3095 C(i,j ) += A(i,i) * B(i,j );
3096 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Update for the single column at jpos.
// NOTE(review): the guarding condition is not visible in this excerpt.
3099 C(i,jpos) += A(i,i) * B(i,jpos);
// Default addition assignment kernel for a column-major target where only the
// left operand A is diagonal: C(i,j) += A(i,i) * B(i,j). The iteration space is
// tiled in blocks of BLOCK_SIZE x BLOCK_SIZE.
3120 template<
typename MT3
3123 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3124 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3126 constexpr
size_t block( BLOCK_SIZE );
3128 const size_t M( A.rows() );
3129 const size_t N( B.columns() );
// Outer tiling over columns (jj) and rows (ii); jend/iend clamp the last tile.
3131 for(
size_t jj=0UL; jj<N; jj+=block ) {
3132 const size_t jend(
min( N, jj+block ) );
3133 for(
size_t ii=0UL; ii<M; ii+=block ) {
3134 const size_t iend(
min( M, ii+block ) );
3135 for(
size_t j=jj; j<jend; ++j )
// Row range inside the tile, further restricted by the triangular
// structure of B (lower: start at j or j+1; upper: stop at j+1 or j).
// NOTE(review): the non-triangular alternatives are not visible here.
3137 const size_t ibegin( ( IsLower_v<MT5> )
3138 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
3140 const size_t ipos( ( IsUpper_v<MT5> )
3141 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
3144 for(
size_t i=ibegin; i<ipos; ++i ) {
3145 C(i,j) += A(i,i) * B(i,j);
// Default addition assignment kernel for the case that both operands are
// diagonal: only the diagonal of C is touched, C(i,i) += A(i,i) * B(i,i).
3168 template<
typename MT3
3171 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3172 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3174 for(
size_t i=0UL; i<A.rows(); ++i ) {
3175 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel, non-vectorized overload.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t);
// the call is forwarded unchanged to selectDefaultAddAssignKernel( C, A, B ).
3195 template<
typename MT3
3198 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3199 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3201 selectDefaultAddAssignKernel( C, A, B );
// Small-matrix addition assignment kernel C += A * B, vectorized overload for a
// row-major target (enabled when UseVectorizedDefaultKernel_v is true).
// Columns of C are processed in panels of 8, 5, 4, 3, 2 and 1 SIMD vectors
// (see the j+SIMDSIZE*n offsets below); within a panel several rows are handled
// per iteration. In every section the k range [kbegin,kend) is narrowed
// according to the triangular structure of A (MT4) and B (MT5).
// NOTE(review): many lines (accumulator declarations, panel loop headers,
// braces) are not visible in this excerpt; comments describe visible code only.
3221 template<
typename MT3
3224 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3225 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed unless both C and B are padded.
3227 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3229 const size_t M( A.rows() );
3230 const size_t N( B.columns() );
3231 const size_t K( A.columns() );
// jpos: N rounded down to a multiple of SIMDSIZE when a remainder is required,
// otherwise N itself.
3235 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Panel of 8 SIMD vectors, one row per iteration; this section follows an
// IsIntegral_v<ElementType> check.
3240 if( IsIntegral_v<ElementType> )
3243 for(
size_t i=0UL; i<M; ++i )
3245 const size_t kbegin( ( IsUpper_v<MT4> )
3246 ?( ( IsLower_v<MT5> )
3247 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3248 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3249 :( IsLower_v<MT5> ? j : 0UL ) );
3250 const size_t kend( ( IsLower_v<MT4> )
3251 ?( ( IsUpper_v<MT5> )
3252 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
3253 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3254 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
3265 for(
size_t k=kbegin; k<kend; ++k ) {
3267 xmm1 += a1 * B.load(k,j );
3268 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3269 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3270 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3271 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
3272 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
3273 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
3274 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
3277 C.store( i, j , xmm1 );
3279 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3280 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3281 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3282 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
3283 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
3284 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Panel of 5 SIMD vectors, two rows per iteration: xmm1..xmm5 are stored to
// row i and xmm6..xmm10 to row i+1. The accumulators are seeded from C
// (see the C.load below), which is what makes this an addition assignment.
3293 for( ; (i+2UL) <= M; i+=2UL )
3295 const size_t kbegin( ( IsUpper_v<MT4> )
3296 ?( ( IsLower_v<MT5> )
3297 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3298 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3299 :( IsLower_v<MT5> ? j : 0UL ) );
3300 const size_t kend( ( IsLower_v<MT4> )
3301 ?( ( IsUpper_v<MT5> )
3302 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
3303 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3304 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
3311 SIMDType xmm6 ( C.load(i+1UL,j ) );
3317 for(
size_t k=kbegin; k<kend; ++k ) {
3337 C.store( i , j , xmm1 );
3339 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3340 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3341 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
3342 C.store( i+1UL, j , xmm6 );
3343 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
3344 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
3345 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
3346 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Panel of 5 SIMD vectors, single row (presumably the leftover row after the
// two-row loop above -- the guard is not visible here).
3351 const size_t kbegin( ( IsUpper_v<MT4> )
3352 ?( ( IsLower_v<MT5> )
3353 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3354 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3355 :( IsLower_v<MT5> ? j : 0UL ) );
3356 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
3364 for(
size_t k=kbegin; k<kend; ++k ) {
3366 xmm1 += a1 * B.load(k,j );
3367 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3368 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3369 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3370 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
3373 C.store( i, j , xmm1 );
3375 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3376 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3377 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Panel of 4 SIMD vectors, two rows per iteration: xmm1..xmm4 go to row i,
// xmm5..xmm8 to row i+1.
3385 for( ; (i+2UL) <= M; i+=2UL )
3387 const size_t kbegin( ( IsUpper_v<MT4> )
3388 ?( ( IsLower_v<MT5> )
3389 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3390 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3391 :( IsLower_v<MT5> ? j : 0UL ) );
3392 const size_t kend( ( IsLower_v<MT4> )
3393 ?( ( IsUpper_v<MT5> )
3394 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
3395 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3396 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
3407 for(
size_t k=kbegin; k<kend; ++k ) {
3424 C.store( i , j , xmm1 );
3426 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3427 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3428 C.store( i+1UL, j , xmm5 );
3429 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
3430 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
3431 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Panel of 4 SIMD vectors, single row.
3436 const size_t kbegin( ( IsUpper_v<MT4> )
3437 ?( ( IsLower_v<MT5> )
3438 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3439 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3440 :( IsLower_v<MT5> ? j : 0UL ) );
3441 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
3448 for(
size_t k=kbegin; k<kend; ++k ) {
3450 xmm1 += a1 * B.load(k,j );
3451 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3452 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3453 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3456 C.store( i, j , xmm1 );
3458 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3459 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Panel of 3 SIMD vectors, two rows per iteration: xmm1..xmm3 go to row i,
// xmm4..xmm6 to row i+1.
3467 for( ; (i+2UL) <= M; i+=2UL )
3469 const size_t kbegin( ( IsUpper_v<MT4> )
3470 ?( ( IsLower_v<MT5> )
3471 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3472 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3473 :( IsLower_v<MT5> ? j : 0UL ) );
3474 const size_t kend( ( IsLower_v<MT4> )
3475 ?( ( IsUpper_v<MT5> )
3476 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
3477 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3478 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
3487 for(
size_t k=kbegin; k<kend; ++k ) {
3501 C.store( i , j , xmm1 );
3503 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3504 C.store( i+1UL, j , xmm4 );
3505 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
3506 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Panel of 3 SIMD vectors, single row.
3511 const size_t kbegin( ( IsUpper_v<MT4> )
3512 ?( ( IsLower_v<MT5> )
3513 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3514 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3515 :( IsLower_v<MT5> ? j : 0UL ) );
3516 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
3522 for(
size_t k=kbegin; k<kend; ++k ) {
3524 xmm1 += a1 * B.load(k,j );
3525 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3526 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3529 C.store( i, j , xmm1 );
3531 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Panel of 2 SIMD vectors. The row loop starts at j when LOW is set, else at 0.
// Four rows per iteration: xmm1/2 -> row i, xmm3/4 -> row i+1,
// xmm5/6 -> row i+2, xmm7/8 -> row i+3.
3538 size_t i(
LOW ? j : 0UL );
3540 for( ; (i+4UL) <= iend; i+=4UL )
3542 const size_t kbegin( ( IsUpper_v<MT4> )
3543 ?( ( IsLower_v<MT5> )
3544 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3545 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3546 :( IsLower_v<MT5> ? j : 0UL ) );
3547 const size_t kend( ( IsLower_v<MT4> )
3548 ?( ( IsUpper_v<MT5> )
3549 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
3550 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3551 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3562 for(
size_t k=kbegin; k<kend; ++k ) {
3579 C.store( i , j , xmm1 );
3581 C.store( i+1UL, j , xmm3 );
3582 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3583 C.store( i+2UL, j , xmm5 );
3584 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
3585 C.store( i+3UL, j , xmm7 );
3586 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Panel of 2 SIMD vectors, three rows per iteration.
3589 for( ; (i+3UL) <= iend; i+=3UL )
3591 const size_t kbegin( ( IsUpper_v<MT4> )
3592 ?( ( IsLower_v<MT5> )
3593 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3594 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3595 :( IsLower_v<MT5> ? j : 0UL ) );
3596 const size_t kend( ( IsLower_v<MT4> )
3597 ?( ( IsUpper_v<MT5> )
3598 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
3599 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3600 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3609 for(
size_t k=kbegin; k<kend; ++k ) {
3623 C.store( i , j , xmm1 );
3625 C.store( i+1UL, j , xmm3 );
3626 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3627 C.store( i+2UL, j , xmm5 );
3628 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Panel of 2 SIMD vectors, two rows per iteration. The k loop is unrolled by
// two with a second set of accumulators (xmm5..xmm8); both sets are summed
// when storing.
3631 for( ; (i+2UL) <= iend; i+=2UL )
3633 const size_t kbegin( ( IsUpper_v<MT4> )
3634 ?( ( IsLower_v<MT5> )
3635 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3636 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3637 :( IsLower_v<MT5> ? j : 0UL ) );
3638 const size_t kend( ( IsLower_v<MT4> )
3639 ?( ( IsUpper_v<MT5> )
3640 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
3641 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3642 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3651 for( ; (k+2UL) <= kend; k+=2UL ) {
3656 const SIMDType b1( B.load(k ,j ) );
3658 const SIMDType b3( B.load(k+1UL,j ) );
// Leftover k iteration when the k range has odd length.
3670 for( ; k<kend; ++k ) {
3681 C.store( i , j , xmm1+xmm5 );
3682 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
3683 C.store( i+1UL, j , xmm3+xmm7 );
3684 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Panel of 2 SIMD vectors, single row, k unrolled by two (xmm1/2 and xmm3/4
// are summed on store).
3689 const size_t kbegin( ( IsUpper_v<MT4> )
3690 ?( ( IsLower_v<MT5> )
3691 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3692 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3693 :( IsLower_v<MT5> ? j : 0UL ) );
3694 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
3701 for( ; (k+2UL) <= kend; k+=2UL ) {
3704 xmm1 += a1 * B.load(k ,j );
3705 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
3706 xmm3 += a2 * B.load(k+1UL,j );
3707 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
3710 for( ; k<kend; ++k ) {
3712 xmm1 += a1 * B.load(k,j );
3716 C.store( i, j , xmm1+xmm3 );
3717 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Panel of a single SIMD vector. The row loop again starts at j when LOW is
// set. Four rows per iteration with k unrolled by two: b1/b2 are the B vectors
// for k and k+1, and the scalars A(i..i+3,k) are broadcast via set().
3724 size_t i(
LOW ? j : 0UL );
3726 for( ; (i+4UL) <= iend; i+=4UL )
3728 const size_t kbegin( ( IsUpper_v<MT4> )
3729 ?( ( IsLower_v<MT5> )
3730 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3731 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3732 :( IsLower_v<MT5> ? j : 0UL ) );
3733 const size_t kend( ( IsLower_v<MT4> )
3734 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3744 for( ; (k+2UL) <= kend; k+=2UL ) {
3746 const SIMDType b2( B.load(k+1UL,j) );
3747 xmm1 +=
set( A(i ,k ) ) * b1;
3748 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3749 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3750 xmm4 +=
set( A(i+3UL,k ) ) * b1;
3751 xmm5 +=
set( A(i ,k+1UL) ) * b2;
3752 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
3753 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
3754 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
3757 for( ; k<kend; ++k ) {
3759 xmm1 +=
set( A(i ,k) ) * b1;
3760 xmm2 +=
set( A(i+1UL,k) ) * b1;
3761 xmm3 +=
set( A(i+2UL,k) ) * b1;
3762 xmm4 +=
set( A(i+3UL,k) ) * b1;
3765 C.store( i , j, xmm1+xmm5 );
3766 C.store( i+1UL, j, xmm2+xmm6 );
3767 C.store( i+2UL, j, xmm3+xmm7 );
3768 C.store( i+3UL, j, xmm4+xmm8 );
// Single SIMD vector, three rows per iteration, k unrolled by two.
3771 for( ; (i+3UL) <= iend; i+=3UL )
3773 const size_t kbegin( ( IsUpper_v<MT4> )
3774 ?( ( IsLower_v<MT5> )
3775 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3776 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3777 :( IsLower_v<MT5> ? j : 0UL ) );
3778 const size_t kend( ( IsLower_v<MT4> )
3779 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3788 for( ; (k+2UL) <= kend; k+=2UL ) {
3790 const SIMDType b2( B.load(k+1UL,j) );
3791 xmm1 +=
set( A(i ,k ) ) * b1;
3792 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3793 xmm3 +=
set( A(i+2UL,k ) ) * b1;
3794 xmm4 +=
set( A(i ,k+1UL) ) * b2;
3795 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
3796 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
3799 for( ; k<kend; ++k ) {
3801 xmm1 +=
set( A(i ,k) ) * b1;
3802 xmm2 +=
set( A(i+1UL,k) ) * b1;
3803 xmm3 +=
set( A(i+2UL,k) ) * b1;
3806 C.store( i , j, xmm1+xmm4 );
3807 C.store( i+1UL, j, xmm2+xmm5 );
3808 C.store( i+2UL, j, xmm3+xmm6 );
// Single SIMD vector, two rows per iteration, k unrolled by two.
3811 for( ; (i+2UL) <= iend; i+=2UL )
3813 const size_t kbegin( ( IsUpper_v<MT4> )
3814 ?( ( IsLower_v<MT5> )
3815 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3816 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3817 :( IsLower_v<MT5> ? j : 0UL ) );
3818 const size_t kend( ( IsLower_v<MT4> )
3819 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3827 for( ; (k+2UL) <= kend; k+=2UL ) {
3829 const SIMDType b2( B.load(k+1UL,j) );
3830 xmm1 +=
set( A(i ,k ) ) * b1;
3831 xmm2 +=
set( A(i+1UL,k ) ) * b1;
3832 xmm3 +=
set( A(i ,k+1UL) ) * b2;
3833 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
3836 for( ; k<kend; ++k ) {
3838 xmm1 +=
set( A(i ,k) ) * b1;
3839 xmm2 +=
set( A(i+1UL,k) ) * b1;
3842 C.store( i , j, xmm1+xmm3 );
3843 C.store( i+1UL, j, xmm2+xmm4 );
// Single SIMD vector, single row; here the k loop runs up to K directly.
3848 const size_t kbegin( ( IsUpper_v<MT4> )
3849 ?( ( IsLower_v<MT5> )
3850 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3851 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3852 :( IsLower_v<MT5> ? j : 0UL ) );
3858 for( ; (k+2UL) <= K; k+=2UL ) {
3859 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
3860 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
3864 xmm1 +=
set( A(i,k) ) * B.load(k,j);
3867 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: columns that did not fill a SIMD vector (only executed
// when `remainder` is true). The row range is [j,...) for LOW and (...,j+1)
// for UPP; rows are processed in pairs, then singly.
3871 for( ; remainder && j<N; ++j )
3873 const size_t iend(
UPP ? j+1UL : M );
3874 size_t i(
LOW ? j : 0UL );
3876 for( ; (i+2UL) <= iend; i+=2UL )
3878 const size_t kbegin( ( IsUpper_v<MT4> )
3879 ?( ( IsLower_v<MT5> )
3880 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3881 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3882 :( IsLower_v<MT5> ? j : 0UL ) );
3883 const size_t kend( ( IsLower_v<MT4> )
3884 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3890 for(
size_t k=kbegin; k<kend; ++k ) {
3891 value1 += A(i ,k) * B(k,j);
3892 value2 += A(i+1UL,k) * B(k,j);
// NOTE(review): the initialization of value1/value2 and the store of value1
// are not visible in this excerpt.
3896 C(i+1UL,j) = value2;
// Single leftover row of the scalar remainder.
3901 const size_t kbegin( ( IsUpper_v<MT4> )
3902 ?( ( IsLower_v<MT5> )
3903 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3904 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3905 :( IsLower_v<MT5> ? j : 0UL ) );
3909 for(
size_t k=kbegin; k<K; ++k ) {
3910 value += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel for column-major targets (C += A * B).
// Enabled only when MT3 is column-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
// NOTE(review): this excerpt omits many original lines (braces, accumulator declarations,
// several loop headers); the comments below describe only what the visible statements show.
3935 template<
typename MT3
3938 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3939 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A remainder pass over single rows is only needed unless both C and A are padded.
3941 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3943 const size_t M( A.rows() );
3944 const size_t N( B.columns() );
3945 const size_t K( A.columns() );
// With a remainder, ipos is M with the low bits masked off (M rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise it is M itself.
3949 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Integral element types only: block of 8*SIMDSIZE rows, one column j per iteration.
3954 if( IsIntegral_v<ElementType> )
3957 for(
size_t j=0UL; j<N; ++j )
// The k range is narrowed according to the triangular structure of A (MT4) and B (MT5).
3959 const size_t kbegin( ( IsLower_v<MT5> )
3960 ?( ( IsUpper_v<MT4> )
3961 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3962 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3963 :( IsUpper_v<MT4> ? i : 0UL ) );
3964 const size_t kend( ( IsUpper_v<MT5> )
3965 ?( ( IsLower_v<MT4> )
3966 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3967 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3968 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Accumulate eight SIMD row-chunks of column j; b1 is declared on a line not shown here.
3979 for(
size_t k=kbegin; k<kend; ++k ) {
3981 xmm1 += A.load(i ,k) * b1;
3982 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
3983 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
3984 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
3985 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
3986 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
3987 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
3988 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
3991 C.store( i , j, xmm1 );
3993 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3994 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3995 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
3996 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
3997 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
3998 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Block of 5*SIMDSIZE rows, two columns (j, j+1) per iteration.
4007 for( ; (j+2UL) <= N; j+=2UL )
4009 const size_t kbegin( ( IsLower_v<MT5> )
4010 ?( ( IsUpper_v<MT4> )
4011 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4012 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4013 :( IsUpper_v<MT4> ? i : 0UL ) );
4014 const size_t kend( ( IsUpper_v<MT5> )
4015 ?( ( IsLower_v<MT4> )
4016 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4017 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4018 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// xmm6..xmm10 hold the accumulators for column j+1 (see the stores below).
4025 SIMDType xmm6 ( C.load(i ,j+1UL) );
4031 for(
size_t k=kbegin; k<kend; ++k ) {
4051 C.store( i , j , xmm1 );
4053 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4054 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
4055 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
4056 C.store( i , j+1UL, xmm6 );
4057 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
4058 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
4059 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
4060 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Block of 5*SIMDSIZE rows, single column (presumably the leftover column; the guarding
// branch is not visible in this excerpt).
4065 const size_t kbegin( ( IsLower_v<MT5> )
4066 ?( ( IsUpper_v<MT4> )
4067 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4068 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4069 :( IsUpper_v<MT4> ? i : 0UL ) );
4070 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
4078 for(
size_t k=kbegin; k<kend; ++k ) {
4080 xmm1 += A.load(i ,k) * b1;
4081 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4082 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4083 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
4084 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
4087 C.store( i , j, xmm1 );
4089 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
4090 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
4091 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Block of 4*SIMDSIZE rows, two columns per iteration.
4099 for( ; (j+2UL) <= N; j+=2UL )
4101 const size_t kbegin( ( IsLower_v<MT5> )
4102 ?( ( IsUpper_v<MT4> )
4103 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4104 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4105 :( IsUpper_v<MT4> ? i : 0UL ) );
4106 const size_t kend( ( IsUpper_v<MT5> )
4107 ?( ( IsLower_v<MT4> )
4108 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4109 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4110 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
4121 for(
size_t k=kbegin; k<kend; ++k ) {
4138 C.store( i , j , xmm1 );
4140 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4141 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
4142 C.store( i , j+1UL, xmm5 );
4143 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
4144 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
4145 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Block of 4*SIMDSIZE rows, single column.
4150 const size_t kbegin( ( IsLower_v<MT5> )
4151 ?( ( IsUpper_v<MT4> )
4152 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4153 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4154 :( IsUpper_v<MT4> ? i : 0UL ) );
4155 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
4162 for(
size_t k=kbegin; k<kend; ++k ) {
4164 xmm1 += A.load(i ,k) * b1;
4165 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4166 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4167 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
4170 C.store( i , j, xmm1 );
4172 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
4173 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Block of 3*SIMDSIZE rows, two columns per iteration.
4181 for( ; (j+2UL) <= N; j+=2UL )
4183 const size_t kbegin( ( IsLower_v<MT5> )
4184 ?( ( IsUpper_v<MT4> )
4185 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4186 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4187 :( IsUpper_v<MT4> ? i : 0UL ) );
4188 const size_t kend( ( IsUpper_v<MT5> )
4189 ?( ( IsLower_v<MT4> )
4190 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4191 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4192 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
4201 for(
size_t k=kbegin; k<kend; ++k ) {
4215 C.store( i , j , xmm1 );
4217 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4218 C.store( i , j+1UL, xmm4 );
4219 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
4220 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Block of 3*SIMDSIZE rows, single column.
4225 const size_t kbegin( ( IsLower_v<MT5> )
4226 ?( ( IsUpper_v<MT4> )
4227 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4228 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4229 :( IsUpper_v<MT4> ? i : 0UL ) );
4230 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
4236 for(
size_t k=kbegin; k<kend; ++k ) {
4238 xmm1 += A.load(i ,k) * b1;
4239 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4240 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4243 C.store( i , j, xmm1 );
4245 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Block of 2*SIMDSIZE rows. Columns start at i when only the upper part of the result is
// computed (UPP); jend is declared on a line not shown here.
4252 size_t j(
UPP ? i : 0UL );
// Four columns per iteration.
4254 for( ; (j+4UL) <= jend; j+=4UL )
4256 const size_t kbegin( ( IsLower_v<MT5> )
4257 ?( ( IsUpper_v<MT4> )
4258 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4259 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4260 :( IsUpper_v<MT4> ? i : 0UL ) );
4261 const size_t kend( ( IsUpper_v<MT5> )
4262 ?( ( IsLower_v<MT4> )
4263 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
4264 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
4265 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4276 for(
size_t k=kbegin; k<kend; ++k ) {
4293 C.store( i , j , xmm1 );
4295 C.store( i , j+1UL, xmm3 );
4296 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4297 C.store( i , j+2UL, xmm5 );
4298 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
4299 C.store( i , j+3UL, xmm7 );
4300 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
4303 for( ; (j+3UL) <= jend; j+=3UL )
4305 const size_t kbegin( ( IsLower_v<MT5> )
4306 ?( ( IsUpper_v<MT4> )
4307 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4308 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4309 :( IsUpper_v<MT4> ? i : 0UL ) );
4310 const size_t kend( ( IsUpper_v<MT5> )
4311 ?( ( IsLower_v<MT4> )
4312 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
4313 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
4314 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4323 for(
size_t k=kbegin; k<kend; ++k ) {
4337 C.store( i , j , xmm1 );
4339 C.store( i , j+1UL, xmm3 );
4340 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4341 C.store( i , j+2UL, xmm5 );
4342 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration, with the k loop unrolled by two into a second accumulator set.
4345 for( ; (j+2UL) <= jend; j+=2UL )
4347 const size_t kbegin( ( IsLower_v<MT5> )
4348 ?( ( IsUpper_v<MT4> )
4349 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4350 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4351 :( IsUpper_v<MT4> ? i : 0UL ) );
4352 const size_t kend( ( IsUpper_v<MT5> )
4353 ?( ( IsLower_v<MT4> )
4354 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4355 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4356 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// NOTE(review): this unrolled loop tests `(k+2UL) < kend`, whereas the other visible
// unrolled k loops in this kernel test `(k+2UL) <= kend`. The tail loop below still covers
// every remaining k, so the coverage of k is unaffected; confirm whether `<` is intended.
4365 for( ; (k+2UL) < kend; k+=2UL ) {
4366 const SIMDType a1( A.load(i ,k ) );
4368 const SIMDType a3( A.load(i ,k+1UL) );
// Tail loop for the k values the unrolled loop did not consume.
4384 for( ; k<kend; ++k ) {
// The two accumulator sets are summed before being written back.
4395 C.store( i , j , xmm1+xmm5 );
4396 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
4397 C.store( i , j+1UL, xmm3+xmm7 );
4398 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// Block of 2*SIMDSIZE rows, single column, k unrolled by two.
4403 const size_t kbegin( ( IsLower_v<MT5> )
4404 ?( ( IsUpper_v<MT4> )
4405 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4406 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4407 :( IsUpper_v<MT4> ? i : 0UL ) );
4408 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
4415 for( ; (k+2UL) <= kend; k+=2UL ) {
4418 xmm1 += A.load(i ,k ) * b1;
4419 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
4420 xmm3 += A.load(i ,k+1UL) * b2;
4421 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
4424 for( ; k<kend; ++k ) {
4426 xmm1 += A.load(i ,k) * b1;
4430 C.store( i , j, xmm1+xmm3 );
4431 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Block of a single SIMD row-chunk: four, three, two, then one column at a time.
4438 size_t j(
UPP ? i : 0UL );
4440 for( ; (j+4UL) <= jend; j+=4UL )
4442 const size_t kbegin( ( IsLower_v<MT5> )
4443 ?( ( IsUpper_v<MT4> )
4444 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4445 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4446 :( IsUpper_v<MT4> ? i : 0UL ) );
4447 const size_t kend( ( IsUpper_v<MT5> )
4448 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
// Each scalar of B is broadcast with set() and multiplied with a SIMD chunk of A.
4458 for( ; (k+2UL) <= kend; k+=2UL ) {
4460 const SIMDType a2( A.load(i,k+1UL) );
4461 xmm1 += a1 *
set( B(k ,j ) );
4462 xmm2 += a1 *
set( B(k ,j+1UL) );
4463 xmm3 += a1 *
set( B(k ,j+2UL) );
4464 xmm4 += a1 *
set( B(k ,j+3UL) );
4465 xmm5 += a2 *
set( B(k+1UL,j ) );
4466 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
4467 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
4468 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
4471 for( ; k<kend; ++k ) {
4473 xmm1 += a1 *
set( B(k,j ) );
4474 xmm2 += a1 *
set( B(k,j+1UL) );
4475 xmm3 += a1 *
set( B(k,j+2UL) );
4476 xmm4 += a1 *
set( B(k,j+3UL) );
4479 C.store( i, j , xmm1+xmm5 );
4480 C.store( i, j+1UL, xmm2+xmm6 );
4481 C.store( i, j+2UL, xmm3+xmm7 );
4482 C.store( i, j+3UL, xmm4+xmm8 );
4485 for( ; (j+3UL) <= jend; j+=3UL )
4487 const size_t kbegin( ( IsLower_v<MT5> )
4488 ?( ( IsUpper_v<MT4> )
4489 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4490 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4491 :( IsUpper_v<MT4> ? i : 0UL ) );
4492 const size_t kend( ( IsUpper_v<MT5> )
4493 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4502 for( ; (k+2UL) <= kend; k+=2UL ) {
4504 const SIMDType a2( A.load(i,k+1UL) );
4505 xmm1 += a1 *
set( B(k ,j ) );
4506 xmm2 += a1 *
set( B(k ,j+1UL) );
4507 xmm3 += a1 *
set( B(k ,j+2UL) );
4508 xmm4 += a2 *
set( B(k+1UL,j ) );
4509 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
4510 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
4513 for( ; k<kend; ++k ) {
4515 xmm1 += a1 *
set( B(k,j ) );
4516 xmm2 += a1 *
set( B(k,j+1UL) );
4517 xmm3 += a1 *
set( B(k,j+2UL) );
4520 C.store( i, j , xmm1+xmm4 );
4521 C.store( i, j+1UL, xmm2+xmm5 );
4522 C.store( i, j+2UL, xmm3+xmm6 );
4525 for( ; (j+2UL) <= jend; j+=2UL )
4527 const size_t kbegin( ( IsLower_v<MT5> )
4528 ?( ( IsUpper_v<MT4> )
4529 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4530 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4531 :( IsUpper_v<MT4> ? i : 0UL ) );
4532 const size_t kend( ( IsUpper_v<MT5> )
4533 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4541 for( ; (k+2UL) <= kend; k+=2UL ) {
4543 const SIMDType a2( A.load(i,k+1UL) );
4544 xmm1 += a1 *
set( B(k ,j ) );
4545 xmm2 += a1 *
set( B(k ,j+1UL) );
4546 xmm3 += a2 *
set( B(k+1UL,j ) );
4547 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
4550 for( ; k<kend; ++k ) {
4552 xmm1 += a1 *
set( B(k,j ) );
4553 xmm2 += a1 *
set( B(k,j+1UL) );
4556 C.store( i, j , xmm1+xmm3 );
4557 C.store( i, j+1UL, xmm2+xmm4 );
// Single SIMD row-chunk, single column; here k runs up to K.
4562 const size_t kbegin( ( IsLower_v<MT5> )
4563 ?( ( IsUpper_v<MT4> )
4564 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4565 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4566 :( IsUpper_v<MT4> ? i : 0UL ) );
4572 for( ; (k+2UL) <= K; k+=2UL ) {
4573 xmm1 += A.load(i,k ) *
set( B(k ,j) );
4574 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
4578 xmm1 += A.load(i,k) *
set( B(k,j) );
4581 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: rows not handled by the SIMD blocks, processed element-wise.
4585 for( ; remainder && i<M; ++i )
// Column range limited to the lower (LOW) and/or upper (UPP) part of the result.
4587 const size_t jend(
LOW ? i+1UL : N );
4588 size_t j(
UPP ? i : 0UL );
4590 for( ; (j+2UL) <= jend; j+=2UL )
4592 const size_t kbegin( ( IsLower_v<MT5> )
4593 ?( ( IsUpper_v<MT4> )
4594 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4595 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4596 :( IsUpper_v<MT4> ? i : 0UL ) );
4597 const size_t kend( ( IsUpper_v<MT5> )
4598 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4604 for(
size_t k=kbegin; k<kend; ++k ) {
4605 value1 += A(i,k) * B(k,j );
4606 value2 += A(i,k) * B(k,j+1UL);
4610 C(i,j+1UL) = value2;
4615 const size_t kbegin( ( IsLower_v<MT5> )
4616 ?( ( IsUpper_v<MT4> )
4617 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4618 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4619 :( IsUpper_v<MT4> ? i : 0UL ) );
4623 for(
size_t k=kbegin; k<K; ++k ) {
4624 value += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel, fallback overload.
// Participates in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// is false (DisableIf_t) and simply forwards C, A and B to selectDefaultAddAssignKernel.
4648 template<
typename MT3
4651 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4652 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4654 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel, vectorized overload.
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true (EnableIf_t).
// NOTE(review): the function body is not part of this excerpt, so nothing is stated about it.
4674 template<
typename MT3
4677 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4678 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS addition-assignment kernel, fallback overload.
// Selected when UseBlasKernel_v<MT3,MT4,MT5> is false (DisableIf_t); it forwards C, A and B
// to selectLargeAddAssignKernel instead of calling into BLAS.
4704 template<
typename MT3
4707 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4708 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4710 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel (C += A * B).
// Compiled only when BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION is set,
// and enabled only when UseBlasKernel_v<MT3,MT4,MT5> is true.
4716 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4730 template<
typename MT3
4733 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4734 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4736 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, apply trmm with A on the left side, then add the
// temporary to C. (Presumably trmm overwrites tmp with the product - confirm against the
// trmm wrapper's documentation.)
4738 if( IsTriangular_v<MT4> ) {
4739 ResultType_t<MT3> tmp(
serial( B ) );
4740 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4741 addAssign( C, tmp );
// Triangular B: same scheme with a copy of A and B applied from the right side.
4743 else if( IsTriangular_v<MT5> ) {
4744 ResultType_t<MT3> tmp(
serial( A ) );
4745 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4746 addAssign( C, tmp );
// General case: a single gemm call with both scalar factors equal to ET(1)
// (presumably alpha and beta, i.e. C = A*B + C - confirm against the gemm wrapper).
4749 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of a transpose dense matrix/dense matrix product to a dense matrix.
// NOTE(review): only the empty-operand check and the final kernel dispatch are visible in
// this excerpt; the definitions of A and B are on lines not shown here.
4773 template<
typename MT
4775 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Guard for degenerate sizes: no rows, no columns, or an empty inner dimension
// (the body of this branch is not visible here; presumably an early return).
4782 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hand over to the kernel selection with the unwrapped target matrix.
4796 TDMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses the subtraction-assignment kernel for C -= A * B.
// The small kernel is used when any of the following holds:
//   - both A and B are diagonal,
//   - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel selection is called (the `else` line itself is not visible in
// this excerpt).
4812 template<
typename MT3
4815 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4817 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
4818 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
4819 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
4820 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
4821 selectSmallSubAssignKernel( C, A, B );
4823 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel, C -= A * B, for row-major C when neither
// A nor B is diagonal. Loop order is i (rows of C), k (inner dimension), j (columns of C).
// NOTE(review): braces and the fallback branches of some ternaries are on lines not shown.
4842 template<
typename MT3
4845 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4846 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4848 const size_t M( A.rows() );
4849 const size_t N( B.columns() );
4850 const size_t K( A.columns() );
4854 for(
size_t i=0UL; i<M; ++i )
// k range of row i, narrowed by the (strictly) upper/lower structure of A.
4856 const size_t kbegin( ( IsUpper_v<MT4> )
4857 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4859 const size_t kend( ( IsLower_v<MT4> )
4860 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
4864 for(
size_t k=kbegin; k<kend; ++k )
// j range of row k of B, additionally clipped to the upper (UPP) / lower (LOW) part of C.
4866 const size_t jbegin( ( IsUpper_v<MT5> )
4867 ?( ( IsStrictlyUpper_v<MT5> )
4868 ?(
UPP ?
max(i,k+1UL) : k+1UL )
4869 :(
UPP ?
max(i,k) : k ) )
4870 :(
UPP ? i : 0UL ) );
4871 const size_t jend( ( IsLower_v<MT5> )
4872 ?( ( IsStrictlyLower_v<MT5> )
4873 ?(
LOW ?
min(i+1UL,k) : k )
4874 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
4875 :(
LOW ? i+1UL : N ) );
// Skip empty column ranges, which can only arise when C is restricted to a triangle.
4877 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos marks the end of the part of [jbegin,jend) that can be processed in pairs.
4880 const size_t jnum( jend - jbegin );
4881 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
4883 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4884 C(i,j ) -= A(i,k) * B(k,j );
4885 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover column for an odd-sized range (the guarding condition is on a line not shown).
4888 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default (scalar) subtraction-assignment kernel, C -= A * B, for column-major C when
// neither A nor B is diagonal. Loop order is j (columns of C), k (inner dimension), i (rows).
// NOTE(review): braces and the fallback branches of some ternaries are on lines not shown.
4910 template<
typename MT3
4913 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4914 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4916 const size_t M( A.rows() );
4917 const size_t N( B.columns() );
4918 const size_t K( A.columns() );
4922 for(
size_t j=0UL; j<N; ++j )
// k range of column j, narrowed by the (strictly) lower/upper structure of B.
4924 const size_t kbegin( ( IsLower_v<MT5> )
4925 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4927 const size_t kend( ( IsUpper_v<MT5> )
4928 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
4932 for(
size_t k=kbegin; k<kend; ++k )
// i range of column k of A, additionally clipped to the lower (LOW) / upper (UPP) part of C.
4934 const size_t ibegin( ( IsLower_v<MT4> )
4935 ?( ( IsStrictlyLower_v<MT4> )
4936 ?(
LOW ?
max(j,k+1UL) : k+1UL )
4937 :(
LOW ?
max(j,k) : k ) )
4938 :(
LOW ? j : 0UL ) );
4939 const size_t iend( ( IsUpper_v<MT4> )
4940 ?( ( IsStrictlyUpper_v<MT4> )
4941 ?(
UPP ?
min(j+1UL,k) : k )
4942 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
4943 :(
UPP ? j+1UL : M ) );
// Skip empty row ranges, which can only arise when C is restricted to a triangle.
4945 if( (
LOW ||
UPP ) && ( ibegin >= iend ) )
continue;
// ipos marks the end of the part of [ibegin,iend) that can be processed in pairs.
4948 const size_t inum( iend - ibegin );
4949 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
4951 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
4952 C(i ,j) -= A(i ,k) * B(k,j);
4953 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover row for an odd-sized range (the guarding condition is on a line not shown).
4956 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel for row-major C with a general A and a diagonal B.
// Since B is diagonal, only B(j,j) contributes: C(i,j) -= A(i,j) * B(j,j).
// The iteration space is tiled into BLOCK_SIZE x BLOCK_SIZE blocks.
4978 template<
typename MT3
4981 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4982 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4984 constexpr
size_t block( BLOCK_SIZE );
4986 const size_t M( A.rows() );
4987 const size_t N( B.columns() );
// Outer loops walk the row blocks and column blocks; iend/jend clamp the last block.
4989 for(
size_t ii=0UL; ii<M; ii+=block ) {
4990 const size_t iend(
min( M, ii+block ) );
4991 for(
size_t jj=0UL; jj<N; jj+=block ) {
4992 const size_t jend(
min( N, jj+block ) );
4993 for(
size_t i=ii; i<iend; ++i )
// Column range inside the block, narrowed by the triangular structure of A
// (the non-triangular alternatives are on lines not shown).
4995 const size_t jbegin( ( IsUpper_v<MT4> )
4996 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
4998 const size_t jpos( ( IsLower_v<MT4> )
4999 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
5002 for(
size_t j=jbegin; j<jpos; ++j ) {
5003 C(i,j) -= A(i,j) * B(j,j);
// Default subtraction-assignment kernel for column-major C with a general A and a diagonal B.
// Each column j of C is updated with column j of A scaled by B(j,j), two rows at a time.
5026 template<
typename MT3
5029 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5030 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5032 const size_t M( A.rows() );
5033 const size_t N( B.columns() );
5035 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of A
// (the non-triangular alternatives are on lines not shown).
5037 const size_t ibegin( ( IsLower_v<MT4> )
5038 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5040 const size_t iend( ( IsUpper_v<MT4> )
5041 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of [ibegin,iend) that can be processed in pairs.
5045 const size_t inum( iend - ibegin );
5046 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
5048 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5049 C(i ,j) -= A(i ,j) * B(j,j);
5050 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Leftover row for an odd-sized range (the guarding condition is on a line not shown).
5053 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel for row-major C with a diagonal A and a general B.
// Each row i of C is updated with row i of B scaled by A(i,i), two columns at a time.
5074 template<
typename MT3
5077 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5078 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5080 const size_t M( A.rows() );
5081 const size_t N( B.columns() );
5083 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the triangular structure of B
// (the non-triangular alternatives are on lines not shown).
5085 const size_t jbegin( ( IsUpper_v<MT5> )
5086 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5088 const size_t jend( ( IsLower_v<MT5> )
5089 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part of [jbegin,jend) that can be processed in pairs.
5093 const size_t jnum( jend - jbegin );
5094 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
5096 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5097 C(i,j ) -= A(i,i) * B(i,j );
5098 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Leftover column for an odd-sized range (the guarding condition is on a line not shown).
5101 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel for column-major C with a diagonal A and a general B.
// Since A is diagonal, only A(i,i) contributes: C(i,j) -= A(i,i) * B(i,j).
// The iteration space is tiled into BLOCK_SIZE x BLOCK_SIZE blocks.
5122 template<
typename MT3
5125 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5126 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5128 constexpr
size_t block( BLOCK_SIZE );
5130 const size_t M( A.rows() );
5131 const size_t N( B.columns() );
// Outer loops walk the column blocks and row blocks; jend/iend clamp the last block.
5133 for(
size_t jj=0UL; jj<N; jj+=block ) {
5134 const size_t jend(
min( N, jj+block ) );
5135 for(
size_t ii=0UL; ii<M; ii+=block ) {
5136 const size_t iend(
min( M, ii+block ) );
5137 for(
size_t j=jj; j<jend; ++j )
// Row range inside the block, narrowed by the triangular structure of B
// (the non-triangular alternatives are on lines not shown).
5139 const size_t ibegin( ( IsLower_v<MT5> )
5140 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
5142 const size_t ipos( ( IsUpper_v<MT5> )
5143 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
5146 for(
size_t i=ibegin; i<ipos; ++i ) {
5147 C(i,j) -= A(i,i) * B(i,j);
// Default subtraction-assignment kernel for two diagonal operands.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every row i of A.
5170 template<
typename MT3
5173 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5174 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5176 for(
size_t i=0UL; i<A.rows(); ++i ) {
5177 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, fallback overload.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t); it
// forwards C, A and B to selectDefaultSubAssignKernel.
5197 template<
typename MT3
5200 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5201 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5203 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel for row-major targets (C -= A * B).
// Enabled only when MT3 is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
// NOTE(review): this excerpt omits many original lines (braces, accumulator declarations,
// several loop headers); the comments below describe only what the visible statements show.
5223 template<
typename MT3
5226 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5227 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A remainder pass over single columns is only needed unless both C and B are padded.
5229 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5231 const size_t M( A.rows() );
5232 const size_t N( B.columns() );
5233 const size_t K( A.columns() );
// With a remainder, jpos is N with the low bits masked off (N rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise it is N itself.
5237 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Integral element types only: block of 8*SIMDSIZE columns, one row i per iteration.
5242 if( IsIntegral_v<ElementType> )
5245 for(
size_t i=0UL; i<M; ++i )
// The k range is narrowed according to the triangular structure of A (MT4) and B (MT5).
5247 const size_t kbegin( ( IsUpper_v<MT4> )
5248 ?( ( IsLower_v<MT5> )
5249 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5250 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5251 :( IsLower_v<MT5> ? j : 0UL ) );
5252 const size_t kend( ( IsLower_v<MT4> )
5253 ?( ( IsUpper_v<MT5> )
5254 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
5255 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5256 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// Subtract from eight SIMD column-chunks of row i; a1 is declared on a line not shown here.
5267 for(
size_t k=kbegin; k<kend; ++k ) {
5269 xmm1 -= a1 * B.load(k,j );
5270 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5271 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5272 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5273 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
5274 xmm6 -= a1 * B.load(k,j+
SIMDSIZE*5UL);
5275 xmm7 -= a1 * B.load(k,j+
SIMDSIZE*6UL);
5276 xmm8 -= a1 * B.load(k,j+
SIMDSIZE*7UL);
5279 C.store( i, j , xmm1 );
5281 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5282 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
5283 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
5284 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
5285 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
5286 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Block of 5*SIMDSIZE columns, two rows (i, i+1) per iteration.
5295 for( ; (i+2UL) <= M; i+=2UL )
5297 const size_t kbegin( ( IsUpper_v<MT4> )
5298 ?( ( IsLower_v<MT5> )
5299 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5300 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5301 :( IsLower_v<MT5> ? j : 0UL ) );
5302 const size_t kend( ( IsLower_v<MT4> )
5303 ?( ( IsUpper_v<MT5> )
5304 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
5305 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5306 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// xmm6..xmm10 hold the accumulators for row i+1 (see the stores below).
5313 SIMDType xmm6 ( C.load(i+1UL,j ) );
5319 for(
size_t k=kbegin; k<kend; ++k ) {
5339 C.store( i , j , xmm1 );
5341 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5342 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
5343 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
5344 C.store( i+1UL, j , xmm6 );
5345 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
5346 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
5347 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
5348 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Block of 5*SIMDSIZE columns, single row (presumably the leftover row; the guarding
// branch is not visible in this excerpt).
5353 const size_t kbegin( ( IsUpper_v<MT4> )
5354 ?( ( IsLower_v<MT5> )
5355 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5356 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5357 :( IsLower_v<MT5> ? j : 0UL ) );
5358 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
5366 for(
size_t k=kbegin; k<kend; ++k ) {
5368 xmm1 -= a1 * B.load(k,j );
5369 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5370 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5371 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5372 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
5375 C.store( i, j , xmm1 );
5377 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5378 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
5379 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Block of 4*SIMDSIZE columns, two rows per iteration.
5387 for( ; (i+2UL) <= M; i+=2UL )
5389 const size_t kbegin( ( IsUpper_v<MT4> )
5390 ?( ( IsLower_v<MT5> )
5391 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5392 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5393 :( IsLower_v<MT5> ? j : 0UL ) );
5394 const size_t kend( ( IsLower_v<MT4> )
5395 ?( ( IsUpper_v<MT5> )
5396 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
5397 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5398 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
5409 for(
size_t k=kbegin; k<kend; ++k ) {
5426 C.store( i , j , xmm1 );
5428 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5429 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
5430 C.store( i+1UL, j , xmm5 );
5431 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
5432 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
5433 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Block of 4*SIMDSIZE columns, single row.
5438 const size_t kbegin( ( IsUpper_v<MT4> )
5439 ?( ( IsLower_v<MT5> )
5440 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5441 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5442 :( IsLower_v<MT5> ? j : 0UL ) );
5443 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
5450 for(
size_t k=kbegin; k<kend; ++k ) {
5452 xmm1 -= a1 * B.load(k,j );
5453 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5454 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5455 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5458 C.store( i, j , xmm1 );
5460 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5461 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Block of 3*SIMDSIZE columns, two rows per iteration.
5469 for( ; (i+2UL) <= M; i+=2UL )
5471 const size_t kbegin( ( IsUpper_v<MT4> )
5472 ?( ( IsLower_v<MT5> )
5473 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5474 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5475 :( IsLower_v<MT5> ? j : 0UL ) );
5476 const size_t kend( ( IsLower_v<MT4> )
5477 ?( ( IsUpper_v<MT5> )
5478 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
5479 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5480 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
5489 for(
size_t k=kbegin; k<kend; ++k ) {
5503 C.store( i , j , xmm1 );
5505 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5506 C.store( i+1UL, j , xmm4 );
5507 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
5508 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Block of 3*SIMDSIZE columns, single row.
5513 const size_t kbegin( ( IsUpper_v<MT4> )
5514 ?( ( IsLower_v<MT5> )
5515 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5516 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5517 :( IsLower_v<MT5> ? j : 0UL ) );
5518 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
5524 for(
size_t k=kbegin; k<kend; ++k ) {
5526 xmm1 -= a1 * B.load(k,j );
5527 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5528 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5531 C.store( i, j , xmm1 );
5533 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Block of 2*SIMDSIZE columns. Rows start at j when only the lower part of the result is
// computed (LOW); iend is declared on a line not shown here.
5540 size_t i(
LOW ? j : 0UL );
// Four rows per iteration.
5542 for( ; (i+4UL) <= iend; i+=4UL )
5544 const size_t kbegin( ( IsUpper_v<MT4> )
5545 ?( ( IsLower_v<MT5> )
5546 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5547 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5548 :( IsLower_v<MT5> ? j : 0UL ) );
5549 const size_t kend( ( IsLower_v<MT4> )
5550 ?( ( IsUpper_v<MT5> )
5551 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
5552 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5553 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5564 for(
size_t k=kbegin; k<kend; ++k ) {
5581 C.store( i , j , xmm1 );
5583 C.store( i+1UL, j , xmm3 );
5584 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
5585 C.store( i+2UL, j , xmm5 );
5586 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
5587 C.store( i+3UL, j , xmm7 );
5588 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Three rows per iteration.
5591 for( ; (i+3UL) <= iend; i+=3UL )
5593 const size_t kbegin( ( IsUpper_v<MT4> )
5594 ?( ( IsLower_v<MT5> )
5595 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5596 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5597 :( IsLower_v<MT5> ? j : 0UL ) );
5598 const size_t kend( ( IsLower_v<MT4> )
5599 ?( ( IsUpper_v<MT5> )
5600 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
5601 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5602 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5611 for(
size_t k=kbegin; k<kend; ++k ) {
5625 C.store( i , j , xmm1 );
5627 C.store( i+1UL, j , xmm3 );
5628 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
5629 C.store( i+2UL, j , xmm5 );
5630 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two rows per iteration, with the k loop unrolled by two into a second accumulator set.
5633 for( ; (i+2UL) <= iend; i+=2UL )
5635 const size_t kbegin( ( IsUpper_v<MT4> )
5636 ?( ( IsLower_v<MT5> )
5637 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5638 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5639 :( IsLower_v<MT5> ? j : 0UL ) );
5640 const size_t kend( ( IsLower_v<MT4> )
5641 ?( ( IsUpper_v<MT5> )
5642 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
5643 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5644 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5653 for( ; (k+2UL) <= kend; k+=2UL ) {
5658 const SIMDType b1( B.load(k ,j ) );
5660 const SIMDType b3( B.load(k+1UL,j ) );
// Tail loop for the k values the unrolled loop did not consume.
5672 for( ; k<kend; ++k ) {
// The two accumulator sets are summed before being written back.
5683 C.store( i , j , xmm1+xmm5 );
5684 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
5685 C.store( i+1UL, j , xmm3+xmm7 );
5686 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Block of 2*SIMDSIZE columns, single row, k unrolled by two.
5691 const size_t kbegin( ( IsUpper_v<MT4> )
5692 ?( ( IsLower_v<MT5> )
5693 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5694 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5695 :( IsLower_v<MT5> ? j : 0UL ) );
5696 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
5703 for( ; (k+2UL) <= kend; k+=2UL ) {
5706 xmm1 -= a1 * B.load(k ,j );
5707 xmm2 -= a1 * B.load(k ,j+
SIMDSIZE);
5708 xmm3 -= a2 * B.load(k+1UL,j );
5709 xmm4 -= a2 * B.load(k+1UL,j+
SIMDSIZE);
5712 for( ; k<kend; ++k ) {
5714 xmm1 -= a1 * B.load(k,j );
5718 C.store( i, j , xmm1+xmm3 );
5719 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Block of a single SIMD column-chunk: four, three, two, then one row at a time.
5726 size_t i(
LOW ? j : 0UL );
5728 for( ; (i+4UL) <= iend; i+=4UL )
5730 const size_t kbegin( ( IsUpper_v<MT4> )
5731 ?( ( IsLower_v<MT5> )
5732 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5733 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5734 :( IsLower_v<MT5> ? j : 0UL ) );
5735 const size_t kend( ( IsLower_v<MT4> )
5736 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
// Each scalar of A is broadcast with set() and multiplied with a SIMD chunk of B.
5746 for( ; (k+2UL) <= kend; k+=2UL ) {
5748 const SIMDType b2( B.load(k+1UL,j) );
5749 xmm1 -=
set( A(i ,k ) ) * b1;
5750 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5751 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5752 xmm4 -=
set( A(i+3UL,k ) ) * b1;
5753 xmm5 -=
set( A(i ,k+1UL) ) * b2;
5754 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
5755 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
5756 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
5759 for( ; k<kend; ++k ) {
5761 xmm1 -=
set( A(i ,k) ) * b1;
5762 xmm2 -=
set( A(i+1UL,k) ) * b1;
5763 xmm3 -=
set( A(i+2UL,k) ) * b1;
5764 xmm4 -=
set( A(i+3UL,k) ) * b1;
5767 C.store( i , j, xmm1+xmm5 );
5768 C.store( i+1UL, j, xmm2+xmm6 );
5769 C.store( i+2UL, j, xmm3+xmm7 );
5770 C.store( i+3UL, j, xmm4+xmm8 );
5773 for( ; (i+3UL) <= iend; i+=3UL )
5775 const size_t kbegin( ( IsUpper_v<MT4> )
5776 ?( ( IsLower_v<MT5> )
5777 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5778 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5779 :( IsLower_v<MT5> ? j : 0UL ) );
5780 const size_t kend( ( IsLower_v<MT4> )
5781 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
5790 for( ; (k+2UL) <= kend; k+=2UL ) {
5792 const SIMDType b2( B.load(k+1UL,j) );
5793 xmm1 -=
set( A(i ,k ) ) * b1;
5794 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5795 xmm3 -=
set( A(i+2UL,k ) ) * b1;
5796 xmm4 -=
set( A(i ,k+1UL) ) * b2;
5797 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
5798 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
5801 for( ; k<kend; ++k ) {
5803 xmm1 -=
set( A(i ,k) ) * b1;
5804 xmm2 -=
set( A(i+1UL,k) ) * b1;
5805 xmm3 -=
set( A(i+2UL,k) ) * b1;
5808 C.store( i , j, xmm1+xmm4 );
5809 C.store( i+1UL, j, xmm2+xmm5 );
5810 C.store( i+2UL, j, xmm3+xmm6 );
5813 for( ; (i+2UL) <= iend; i+=2UL )
5815 const size_t kbegin( ( IsUpper_v<MT4> )
5816 ?( ( IsLower_v<MT5> )
5817 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5818 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5819 :( IsLower_v<MT5> ? j : 0UL ) );
5820 const size_t kend( ( IsLower_v<MT4> )
5821 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5829 for( ; (k+2UL) <= kend; k+=2UL ) {
5831 const SIMDType b2( B.load(k+1UL,j) );
5832 xmm1 -=
set( A(i ,k ) ) * b1;
5833 xmm2 -=
set( A(i+1UL,k ) ) * b1;
5834 xmm3 -=
set( A(i ,k+1UL) ) * b2;
5835 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
5838 for( ; k<kend; ++k ) {
5840 xmm1 -=
set( A(i ,k) ) * b1;
5841 xmm2 -=
set( A(i+1UL,k) ) * b1;
5844 C.store( i , j, xmm1+xmm3 );
5845 C.store( i+1UL, j, xmm2+xmm4 );
// Single SIMD column-chunk, single row; here k runs up to K.
5850 const size_t kbegin( ( IsUpper_v<MT4> )
5851 ?( ( IsLower_v<MT5> )
5852 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5853 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5854 :( IsLower_v<MT5> ? j : 0UL ) );
5860 for( ; (k+2UL) <= K; k+=2UL ) {
5861 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
5862 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
5866 xmm1 -=
set( A(i,k) ) * B.load(k,j);
5869 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: columns not handled by the SIMD blocks, processed element-wise.
5873 for( ; remainder && j<N; ++j )
// Row range limited to the upper (UPP) and/or lower (LOW) part of the result.
5875 const size_t iend(
UPP ? j+1UL : M );
5876 size_t i(
LOW ? j : 0UL );
5878 for( ; (i+2UL) <= iend; i+=2UL )
5880 const size_t kbegin( ( IsUpper_v<MT4> )
5881 ?( ( IsLower_v<MT5> )
5882 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5883 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5884 :( IsLower_v<MT5> ? j : 0UL ) );
5885 const size_t kend( ( IsLower_v<MT4> )
5886 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5892 for(
size_t k=kbegin; k<kend; ++k ) {
5893 value1 -= A(i ,k) * B(k,j);
5894 value2 -= A(i+1UL,k) * B(k,j);
5898 C(i+1UL,j) = value2;
5903 const size_t kbegin( ( IsUpper_v<MT4> )
5904 ?( ( IsLower_v<MT5> )
5905 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5906 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5907 :( IsLower_v<MT5> ? j : 0UL ) );
5911 for(
size_t k=kbegin; k<K; ++k ) {
5912 value -= A(i,k) * B(k,j);
// Vectorized small-matrix kernel for the subtraction assignment C -= A * B.
// Enabled (trailing EnableIf_t) when the target C is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds. All visible updates subtract products of
// SIMD vectors loaded from a column of A into accumulators that are then stored into C.
// The load/store offsets show row panels of SIMDSIZE*8, *5, *4, *3, *2 and 1 SIMD vectors,
// followed by a scalar cleanup loop.
5937 template<
typename MT3
5940 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5941 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar cleanup pass is needed unless both C and A are padded.
5943 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
// M x N is the shape of the product, K the shared inner dimension.
5945 const size_t M( A.rows() );
5946 const size_t N( B.columns() );
5947 const size_t K( A.columns() );
// With a remainder, ipos is M & size_t(-SIMDSIZE) (M rounded down to a multiple of SIMDSIZE,
// assuming SIMDSIZE is a power of two); otherwise it is M itself.
5951 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// NOTE(review): the SIMDSIZE*8-row panel below appears to be reserved for integral element
// types - confirm the extent of this branch.
5956 if( IsIntegral_v<ElementType> )
5959 for(
size_t j=0UL; j<N; ++j )
// Restrict the k-range using the operand structure: a lower B starts at row j (j+1 if
// strictly lower), an upper A starts at column i; an upper B ends before row j+1 (before
// row j if strictly upper), a lower A ends before column i+SIMDSIZE*8, capped at K.
5961 const size_t kbegin( ( IsLower_v<MT5> )
5962 ?( ( IsUpper_v<MT4> )
5963 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5964 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5965 :( IsUpper_v<MT4> ? i : 0UL ) );
5966 const size_t kend( ( IsUpper_v<MT5> )
5967 ?( ( IsLower_v<MT4> )
5968 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5969 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5970 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMDSIZE-row slice of the panel.
5981 for(
size_t k=kbegin; k<kend; ++k ) {
5983 xmm1 -= A.load(i ,k) * b1;
5984 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
5985 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
5986 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
5987 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
5988 xmm6 -= A.load(i+
SIMDSIZE*5UL,k) * b1;
5989 xmm7 -= A.load(i+
SIMDSIZE*6UL,k) * b1;
5990 xmm8 -= A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the accumulators back to column j of C.
5993 C.store( i , j, xmm1 );
5995 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
5996 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
5997 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
5998 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
5999 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
6000 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// SIMDSIZE*5-row panel: two columns of C (j and j+1) per iteration.
6009 for( ; (j+2UL) <= N; j+=2UL )
6011 const size_t kbegin( ( IsLower_v<MT5> )
6012 ?( ( IsUpper_v<MT4> )
6013 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6014 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6015 :( IsUpper_v<MT4> ? i : 0UL ) );
6016 const size_t kend( ( IsUpper_v<MT5> )
6017 ?( ( IsLower_v<MT4> )
6018 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6019 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6020 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// Accumulators start from the current contents of C (subtraction assignment).
6027 SIMDType xmm6 ( C.load(i ,j+1UL) );
6033 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm5 belong to column j, xmm6..xmm10 to column j+1.
6053 C.store( i , j , xmm1 );
6055 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6056 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
6057 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
6058 C.store( i , j+1UL, xmm6 );
6059 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
6060 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
6061 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
6062 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Remaining single column of the SIMDSIZE*5-row panel; kend no longer depends on B.
6067 const size_t kbegin( ( IsLower_v<MT5> )
6068 ?( ( IsUpper_v<MT4> )
6069 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6070 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6071 :( IsUpper_v<MT4> ? i : 0UL ) );
6072 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
6080 for(
size_t k=kbegin; k<kend; ++k ) {
6082 xmm1 -= A.load(i ,k) * b1;
6083 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6084 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6085 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
6086 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
6089 C.store( i , j, xmm1 );
6091 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
6092 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
6093 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// SIMDSIZE*4-row panel, two columns per iteration.
6101 for( ; (j+2UL) <= N; j+=2UL )
6103 const size_t kbegin( ( IsLower_v<MT5> )
6104 ?( ( IsUpper_v<MT4> )
6105 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6106 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6107 :( IsUpper_v<MT4> ? i : 0UL ) );
6108 const size_t kend( ( IsUpper_v<MT5> )
6109 ?( ( IsLower_v<MT4> )
6110 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6111 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6112 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
6123 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
6140 C.store( i , j , xmm1 );
6142 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6143 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
6144 C.store( i , j+1UL, xmm5 );
6145 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
6146 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
6147 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Remaining single column of the SIMDSIZE*4-row panel.
6152 const size_t kbegin( ( IsLower_v<MT5> )
6153 ?( ( IsUpper_v<MT4> )
6154 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6155 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6156 :( IsUpper_v<MT4> ? i : 0UL ) );
6157 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
6164 for(
size_t k=kbegin; k<kend; ++k ) {
6166 xmm1 -= A.load(i ,k) * b1;
6167 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6168 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6169 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
6172 C.store( i , j, xmm1 );
6174 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
6175 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// SIMDSIZE*3-row panel, two columns per iteration.
6183 for( ; (j+2UL) <= N; j+=2UL )
6185 const size_t kbegin( ( IsLower_v<MT5> )
6186 ?( ( IsUpper_v<MT4> )
6187 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6188 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6189 :( IsUpper_v<MT4> ? i : 0UL ) );
6190 const size_t kend( ( IsUpper_v<MT5> )
6191 ?( ( IsLower_v<MT4> )
6192 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6193 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6194 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
6203 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm3 belong to column j, xmm4..xmm6 to column j+1.
6217 C.store( i , j , xmm1 );
6219 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6220 C.store( i , j+1UL, xmm4 );
6221 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
6222 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Remaining single column of the SIMDSIZE*3-row panel.
6227 const size_t kbegin( ( IsLower_v<MT5> )
6228 ?( ( IsUpper_v<MT4> )
6229 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6230 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6231 :( IsUpper_v<MT4> ? i : 0UL ) );
6232 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
6238 for(
size_t k=kbegin; k<kend; ++k ) {
6240 xmm1 -= A.load(i ,k) * b1;
6241 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6242 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6245 C.store( i , j, xmm1 );
6247 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// SIMDSIZE*2-row panel. For an upper result (UPP) the column loop starts at j = i.
6254 size_t j(
UPP ? i : 0UL );
// Four columns per iteration.
6256 for( ; (j+4UL) <= jend; j+=4UL )
6258 const size_t kbegin( ( IsLower_v<MT5> )
6259 ?( ( IsUpper_v<MT4> )
6260 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6261 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6262 :( IsUpper_v<MT4> ? i : 0UL ) );
6263 const size_t kend( ( IsUpper_v<MT5> )
6264 ?( ( IsLower_v<MT4> )
6265 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6266 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6267 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6278 for(
size_t k=kbegin; k<kend; ++k ) {
6295 C.store( i , j , xmm1 );
6297 C.store( i , j+1UL, xmm3 );
6298 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
6299 C.store( i , j+2UL, xmm5 );
6300 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
6301 C.store( i , j+3UL, xmm7 );
6302 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
6305 for( ; (j+3UL) <= jend; j+=3UL )
6307 const size_t kbegin( ( IsLower_v<MT5> )
6308 ?( ( IsUpper_v<MT4> )
6309 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6310 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6311 :( IsUpper_v<MT4> ? i : 0UL ) );
6312 const size_t kend( ( IsUpper_v<MT5> )
6313 ?( ( IsLower_v<MT4> )
6314 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6315 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6316 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6325 for(
size_t k=kbegin; k<kend; ++k ) {
6339 C.store( i , j , xmm1 );
6341 C.store( i , j+1UL, xmm3 );
6342 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
6343 C.store( i , j+2UL, xmm5 );
6344 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration; the k-loop is unrolled by two with a single-step tail, and the
// two partial sums per target vector are combined when storing.
6347 for( ; (j+2UL) <= jend; j+=2UL )
6349 const size_t kbegin( ( IsLower_v<MT5> )
6350 ?( ( IsUpper_v<MT4> )
6351 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6352 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6353 :( IsUpper_v<MT4> ? i : 0UL ) );
6354 const size_t kend( ( IsUpper_v<MT5> )
6355 ?( ( IsLower_v<MT4> )
6356 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6357 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6358 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6367 for( ; (k+2UL) <= kend; k+=2UL ) {
6368 const SIMDType a1( A.load(i ,k ) );
6370 const SIMDType a3( A.load(i ,k+1UL) );
6386 for( ; k<kend; ++k ) {
6397 C.store( i , j , xmm1+xmm5 );
6398 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
6399 C.store( i , j+1UL, xmm3+xmm7 );
6400 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// Remaining single column of the SIMDSIZE*2-row panel.
6405 const size_t kbegin( ( IsLower_v<MT5> )
6406 ?( ( IsUpper_v<MT4> )
6407 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6408 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6409 :( IsUpper_v<MT4> ? i : 0UL ) );
6410 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
6417 for( ; (k+2UL) <= kend; k+=2UL ) {
6420 xmm1 -= A.load(i ,k ) * b1;
6421 xmm2 -= A.load(i+
SIMDSIZE,k ) * b1;
6422 xmm3 -= A.load(i ,k+1UL) * b2;
6423 xmm4 -= A.load(i+
SIMDSIZE,k+1UL) * b2;
6426 for( ; k<kend; ++k ) {
6428 xmm1 -= A.load(i ,k) * b1;
6432 C.store( i , j, xmm1+xmm3 );
6433 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Single SIMD vector of rows starting at i; again the column loop starts at the diagonal
// for an upper result.
6440 size_t j(
UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two.
6442 for( ; (j+4UL) <= jend; j+=4UL )
6444 const size_t kbegin( ( IsLower_v<MT5> )
6445 ?( ( IsUpper_v<MT4> )
6446 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6447 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6448 :( IsUpper_v<MT4> ? i : 0UL ) );
6449 const size_t kend( ( IsUpper_v<MT5> )
6450 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6460 for( ; (k+2UL) <= kend; k+=2UL ) {
6462 const SIMDType a2( A.load(i,k+1UL) );
6463 xmm1 -= a1 *
set( B(k ,j ) );
6464 xmm2 -= a1 *
set( B(k ,j+1UL) );
6465 xmm3 -= a1 *
set( B(k ,j+2UL) );
6466 xmm4 -= a1 *
set( B(k ,j+3UL) );
6467 xmm5 -= a2 *
set( B(k+1UL,j ) );
6468 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
6469 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
6470 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
6473 for( ; k<kend; ++k ) {
6475 xmm1 -= a1 *
set( B(k,j ) );
6476 xmm2 -= a1 *
set( B(k,j+1UL) );
6477 xmm3 -= a1 *
set( B(k,j+2UL) );
6478 xmm4 -= a1 *
set( B(k,j+3UL) );
6481 C.store( i, j , xmm1+xmm5 );
6482 C.store( i, j+1UL, xmm2+xmm6 );
6483 C.store( i, j+2UL, xmm3+xmm7 );
6484 C.store( i, j+3UL, xmm4+xmm8 );
// Three columns per iteration.
6487 for( ; (j+3UL) <= jend; j+=3UL )
6489 const size_t kbegin( ( IsLower_v<MT5> )
6490 ?( ( IsUpper_v<MT4> )
6491 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6492 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6493 :( IsUpper_v<MT4> ? i : 0UL ) );
6494 const size_t kend( ( IsUpper_v<MT5> )
6495 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6504 for( ; (k+2UL) <= kend; k+=2UL ) {
6506 const SIMDType a2( A.load(i,k+1UL) );
6507 xmm1 -= a1 *
set( B(k ,j ) );
6508 xmm2 -= a1 *
set( B(k ,j+1UL) );
6509 xmm3 -= a1 *
set( B(k ,j+2UL) );
6510 xmm4 -= a2 *
set( B(k+1UL,j ) );
6511 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
6512 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
6515 for( ; k<kend; ++k ) {
6517 xmm1 -= a1 *
set( B(k,j ) );
6518 xmm2 -= a1 *
set( B(k,j+1UL) );
6519 xmm3 -= a1 *
set( B(k,j+2UL) );
6522 C.store( i, j , xmm1+xmm4 );
6523 C.store( i, j+1UL, xmm2+xmm5 );
6524 C.store( i, j+2UL, xmm3+xmm6 );
// Two columns per iteration.
6527 for( ; (j+2UL) <= jend; j+=2UL )
6529 const size_t kbegin( ( IsLower_v<MT5> )
6530 ?( ( IsUpper_v<MT4> )
6531 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6532 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6533 :( IsUpper_v<MT4> ? i : 0UL ) );
6534 const size_t kend( ( IsUpper_v<MT5> )
6535 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6543 for( ; (k+2UL) <= kend; k+=2UL ) {
6545 const SIMDType a2( A.load(i,k+1UL) );
6546 xmm1 -= a1 *
set( B(k ,j ) );
6547 xmm2 -= a1 *
set( B(k ,j+1UL) );
6548 xmm3 -= a2 *
set( B(k+1UL,j ) );
6549 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
6552 for( ; k<kend; ++k ) {
6554 xmm1 -= a1 *
set( B(k,j ) );
6555 xmm2 -= a1 *
set( B(k,j+1UL) );
6558 C.store( i, j , xmm1+xmm3 );
6559 C.store( i, j+1UL, xmm2+xmm4 );
// Remaining single column; here the k-loop runs up to K.
6564 const size_t kbegin( ( IsLower_v<MT5> )
6565 ?( ( IsUpper_v<MT4> )
6566 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6567 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6568 :( IsUpper_v<MT4> ? i : 0UL ) );
6574 for( ; (k+2UL) <= K; k+=2UL ) {
6575 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
6576 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
6580 xmm1 -= A.load(i,k) *
set( B(k,j) );
6583 C.store( i, j, xmm1+xmm2 );
// Scalar cleanup over the remaining rows; only runs when 'remainder' is set.
6587 for( ; remainder && i<M; ++i )
// Column range of row i: ends at the diagonal for a lower result, starts there for an upper one.
6589 const size_t jend(
LOW ? i+1UL : N );
6590 size_t j(
UPP ? i : 0UL );
// Two columns per iteration.
6592 for( ; (j+2UL) <= jend; j+=2UL )
6594 const size_t kbegin( ( IsLower_v<MT5> )
6595 ?( ( IsUpper_v<MT4> )
6596 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6597 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6598 :( IsUpper_v<MT4> ? i : 0UL ) );
6599 const size_t kend( ( IsUpper_v<MT5> )
6600 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6606 for(
size_t k=kbegin; k<kend; ++k ) {
6607 value1 -= A(i,k) * B(k,j );
6608 value2 -= A(i,k) * B(k,j+1UL);
6612 C(i,j+1UL) = value2;
// Last column of the row, if any.
6617 const size_t kbegin( ( IsLower_v<MT5> )
6618 ?( ( IsUpper_v<MT4> )
6619 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6620 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6621 :( IsUpper_v<MT4> ? i : 0UL ) );
6625 for(
size_t k=kbegin; k<K; ++k ) {
6626 value -= A(i,k) * B(k,j);
// Fallback overload of the large-matrix subtraction kernel: when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false, the work is forwarded unchanged to
// selectDefaultSubAssignKernel( C, A, B ).
6650 template<
typename MT3
6653 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6654 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6656 selectDefaultSubAssignKernel( C, A, B );
// Overload of the large-matrix subtraction kernel that participates in overload resolution
// only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true (the counterpart of the
// DisableIf_t overload with the same name and parameters).
6676 template<
typename MT3
6679 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6680 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Fallback overload of the BLAS subtraction kernel: when UseBlasKernel_v<MT3,MT4,MT5> is
// false, the work is forwarded unchanged to selectLargeSubAssignKernel( C, A, B ).
6706 template<
typename MT3
6709 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6710 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6712 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed subtraction kernel. Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero, and only enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6732 template<
typename MT3
6735 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6736 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6738 using ET = ElementType_t<MT3>;
// Triangular left operand: copy B into a temporary, multiply it in place by A from the left
// via trmm() (lower/upper chosen from IsLower_v<MT4>, factor ET(1)), then subtract it from C.
6740 if( IsTriangular_v<MT4> ) {
6741 ResultType_t<MT3> tmp(
serial( B ) );
6742 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6743 subAssign( C, tmp );
// Triangular right operand: same scheme with a copy of A multiplied by B from the right.
6745 else if( IsTriangular_v<MT5> ) {
6746 ResultType_t<MT3> tmp(
serial( A ) );
6747 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6748 subAssign( C, tmp );
// General case: a single gemm() call with factors ET(-1) and ET(1).
// NOTE(review): presumably alpha=-1 on A*B and beta=1 on C, i.e. C = C - A*B; confirm
// against the gemm() wrapper's parameter order.
6751 gemm( C, A, B, ET(-1), ET(1) );
// Schur-product assignment of a TDMatDMatMultExpr to a dense matrix. The visible statement
// forwards to schurAssign( ~lhs, tmp ).
// NOTE(review): 'tmp' is presumably the evaluated product expression; confirm.
6775 template<
typename MT
6777 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
6789 schurAssign( ~lhs, tmp );
// Overload enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably one of the SMP assignment functions; confirm which.
6822 template<
typename MT
6825 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Degenerate shapes are handled first: an empty target, then an empty inner dimension
// (left operand without columns).
6832 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
6835 else if( rhs.lhs_.columns() == 0UL ) {
// Overload enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably an SMP assignment that goes through a temporary; confirm which.
6871 template<
typename MT
6874 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// The temporary is ResultType when SO is true and OppositeType otherwise.
6878 using TmpType = If_t< SO, ResultType, OppositeType >;
6890 const ForwardFunctor fwd;
// Evaluate the product expression into the temporary.
6892 const TmpType tmp( rhs );
// Overload enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably an SMP compound assignment; confirm which.
6914 template<
typename MT
6917 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Nothing to do for an empty target or an empty inner dimension.
6924 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Overload enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably an SMP compound assignment; confirm which.
6963 template<
typename MT
6966 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Nothing to do for an empty target or an empty inner dimension.
6973 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
7009 template<
typename MT
// Specialization of DMatScalarMultExpr for the case where the scaled matrix is itself a
// TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> and the scalar has type ST.
7069 template<
typename MT1
7076 class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
7077 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
7078 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression.
7083 using MMM = TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Result, element and composite types of the product and of its two operands.
7085 using RES = ResultType_t<MMM>;
7086 using RT1 = ResultType_t<MT1>;
7087 using RT2 = ResultType_t<MT2>;
7088 using ET1 = ElementType_t<RT1>;
7089 using ET2 = ElementType_t<RT2>;
7090 using CT1 = CompositeType_t<MT1>;
7091 using CT2 = CompositeType_t<MT2>;
// True if the left operand is a computation or requires an intermediate evaluation.
7096 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// True if the right operand is a computation or requires an intermediate evaluation.
7101 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure flags derived from the SF/HF/LF/UF template flags
// (presumably symmetric/Hermitian/lower/upper - confirm against the class documentation).
// SYM: SF set and none of HF, LF, UF.
7105 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF set and neither LF nor UF.
7106 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF set, or UF combined with SF or HF.
7107 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF set, or LF combined with SF or HF.
7108 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// True if either operand has to be evaluated first. The value does not depend on the
// template parameters T1, T2 and T3.
7116 template<
typename T1,
typename T2,
typename T3 >
7117 static constexpr
bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
// Compile-time switch for the BLAS kernel; T1 is the target, T2/T3 the operands, T4 the
// scalar type. Visible requirements: no SYM/HERM/LOW/UPP structure, contiguous storage with
// data access (mutable for T1, const for T2/T3), no diagonal operand, SIMD enabled for all
// three matrices, identical BLAS-compatible element types, and not the combination of a
// builtin element type with a complex scalar.
7124 template<
typename T1,
typename T2,
typename T3,
typename T4 >
7125 static constexpr
bool UseBlasKernel_v =
7127 !SYM && !HERM && !LOW && !UPP &&
7128 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
7129 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
7130 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
7131 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
7132 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7133 IsBLASCompatible_v< ElementType_t<T1> > &&
7134 IsBLASCompatible_v< ElementType_t<T2> > &&
7135 IsBLASCompatible_v< ElementType_t<T3> > &&
7136 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
7137 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
7138 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch for the vectorized default kernels; T1 is the target, T2/T3 the
// operands, T4 the scalar type. Visible requirements: useOptimizedKernels, not both operands
// diagonal, no diagonal left operand with a column-major target, no diagonal right operand
// with a row-major target, SIMD enabled for all three matrices, and SIMD addition available
// for the operand element types.
7145 template<
typename T1,
typename T2,
typename T3,
typename T4 >
7146 static constexpr
bool UseVectorizedDefaultKernel_v =
7147 ( useOptimizedKernels &&
7148 !( IsDiagonal_v<T2> && IsDiagonal_v<T3> ) &&
7149 !( IsDiagonal_v<T2> && IsColumnMajorMatrix_v<T1> ) &&
7150 !( IsDiagonal_v<T3> && IsRowMajorMatrix_v<T1> ) &&
7151 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7156 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
// Functor type selected at compile time, starting with a test on HERM.
7164 using ForwardFunctor =
If_t< HERM
// Type of this expression and its dense-matrix base class.
7180 using This = DMatScalarMultExpr<MMM,ST,true>;
7183 using BaseType = DenseMatrix<This,true>;
// Candidate result types: MultTrait_t<RES,ST> wrapped in a DeclHerm/DeclSym/DeclDiag/DeclLow/
// DeclUpp trait, with plain MultTrait<RES,ST> as the last alternative.
// NOTE(review): presumably chosen according to HERM/SYM/LOW/UPP; confirm.
7187 , DeclHermTrait< MultTrait_t<RES,ST> >
7189 , DeclSymTrait< MultTrait_t<RES,ST> >
7192 , DeclDiagTrait< MultTrait_t<RES,ST> >
7193 , DeclLowTrait< MultTrait_t<RES,ST> > >
7195 , DeclUppTrait< MultTrait_t<RES,ST> >
7196 , MultTrait<RES,ST> > > > >::Type;
// SIMD vector type matching the element type of the result.
7201 using SIMDType = SIMDTrait_t<ElementType>;
// The left operand of the scaling is the matrix/matrix multiplication expression.
7206 using LeftOperand =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Operand types used inside the kernels: the evaluated result type if the operand has to be
// evaluated, otherwise its composite type.
7212 using LT = If_t< evaluateLeft, const RT1, CT1 >;
7215 using RT = If_t< evaluateRight, const RT2, CT2 >;
// Initializer of a compile-time flag (presumably simdEnabled - confirm): not both operands
// diagonal, both operands SIMD-enabled, and the element/scalar types SIMD-combinable with
// SIMD addition and multiplication available.
7221 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
7222 MT1::simdEnabled && MT2::simdEnabled &&
7223 IsSIMDCombinable_v<ET1,ET2,ST> &&
7224 HasSIMDAdd_v<ET1,ET2> &&
7225 HasSIMDMult_v<ET1,ET2> );
// Initializer of a compile-time flag (presumably smpAssignable - confirm): neither operand
// needs evaluation and both are SMP-assignable.
7229 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
// Column index check against matrix_.columns(); a valid index is passed on to operator()(i,j).
// NOTE(review): the failure branch presumably throws; confirm.
7275 if( j >=
matrix_.columns() ) {
7278 return (*
this)(i,j);
// Const accessor for the number of rows of the expression.
7287 inline size_t rows()
const {
// Const accessor for the number of columns of the expression.
7297 inline size_t columns()
const {
// Reports whether the expression can alias with the given address; the question is delegated
// to the wrapped multiplication expression matrix_.
7328 template<
typename T >
7329 inline bool canAlias(
const T* alias )
const {
7330 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the given address; the question is
// delegated to the wrapped multiplication expression matrix_.
7340 template<
typename T >
7341 inline bool isAliased(
const T* alias )
const {
7342 return matrix_.isAliased( alias );
// Terms of a boolean size predicate: (BLAS matrix multiplication disabled, or fewer than
// TDMATDMATMULT_THRESHOLD result elements) and at least SMP_TDMATDMATMULT_THRESHOLD result
// elements.
// NOTE(review): presumably the return value of the SMP-assignability query; confirm.
7363 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7365 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7366 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// Assignment of the scaled product to lhs: extracts both operands of the wrapped
// multiplication, handles degenerate shapes, and otherwise dispatches to selectAssignKernel()
// together with the scalar.
7388 template<
typename MT
7397 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7398 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Empty target first, then an empty inner dimension.
7400 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7403 else if( left.columns() == 0UL ) {
// NOTE(review): A and B are presumably the (possibly evaluated) left/right operands; confirm.
7418 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the scaled assignment. The small kernel is chosen when
//  - both operands are diagonal, or
//  - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns, or
//  - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows, or
//  - C has fewer than TDMATDMATMULT_THRESHOLD elements;
// the BLAS kernel selection is the other alternative.
7433 template<
typename MT3
7437 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7439 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
7440 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
7441 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
7442 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7443 selectSmallAssignKernel( C, A, B, scalar );
7445 selectBlasAssignKernel( C, A, B, scalar );
// Default (element-wise) kernel for the scaled assignment with a row-major target and two
// non-diagonal operands. Each row i of C is built from an initializing pass over the first
// relevant k followed by accumulation over the remaining k.
// NOTE(review): 'scalar' is presumably applied in the final per-row loop; confirm.
7463 template<
typename MT3
7467 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7468 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x N is the shape of the product, K the shared inner dimension.
7470 const size_t M( A.rows() );
7471 const size_t N( B.columns() );
7472 const size_t K( A.columns() );
7476 for(
size_t i=0UL; i<M; ++i )
// k-range of row i as limited by a triangular A.
7478 const size_t kbegin( ( IsUpper_v<MT4> )
7479 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7481 const size_t kend( ( IsLower_v<MT4> )
7482 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// A strictly triangular A can leave a row without any contribution.
7486 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
7487 for(
size_t j=0UL; j<N; ++j ) {
// Column range touched by the first k (= kbegin), limited by the structure of B and by the
// declared structure (UPP/LOW) of the result.
7494 const size_t jbegin( ( IsUpper_v<MT5> )
7495 ?( ( IsStrictlyUpper_v<MT5> )
7496 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
7497 :( UPP ?
max(i,kbegin) : kbegin ) )
7498 :( UPP ? i : 0UL ) );
7499 const size_t jend( ( IsLower_v<MT5> )
7500 ?( ( IsStrictlyLower_v<MT5> )
7501 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
7502 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
7503 :( LOW ? i+1UL : N ) );
// Leading part of the row, [0,jbegin).
7505 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7506 for(
size_t j=0UL; j<jbegin; ++j ) {
7510 else if( IsStrictlyUpper_v<MT5> ) {
// The first contribution is assigned rather than accumulated.
7513 for(
size_t j=jbegin; j<jend; ++j ) {
7514 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Trailing part of the row, [jend,N).
7516 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7517 for(
size_t j=jend; j<N; ++j ) {
7521 else if( IsStrictlyLower_v<MT5> ) {
7522 reset( C(i,N-1UL) );
// Accumulate the remaining k contributions.
7526 for(
size_t k=kbegin+1UL; k<kend; ++k )
7528 const size_t jbegin( ( IsUpper_v<MT5> )
7529 ?( ( IsStrictlyUpper_v<MT5> )
7530 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
7531 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
7532 :( SYM || HERM || UPP ? i : 0UL ) );
7533 const size_t jend( ( IsLower_v<MT5> )
7534 ?( ( IsStrictlyLower_v<MT5> )
7535 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
7536 :( LOW ?
min(i+1UL,k) : k ) )
7537 :( LOW ? i+1UL : N ) );
// Skip empty ranges that can arise with a structured result.
7539 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7542 for(
size_t j=jbegin; j<jend; ++j ) {
7543 C(i,j) += A(i,k) * B(k,j);
// For a lower B, column jend receives its first contribution here and is assigned.
7545 if( IsLower_v<MT5> ) {
7546 C(i,jend) = A(i,k) * B(k,jend);
// Final per-row column range.
7551 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7552 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
7553 :( SYM || HERM || UPP ? i : 0UL ) );
7554 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7555 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
7556 :( LOW ? i+1UL : N ) );
7558 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7561 for(
size_t j=jbegin; j<jend; ++j ) {
// Mirror the upper part into the strictly lower part (conjugated for HERM).
7568 for(
size_t i=1UL; i<M; ++i ) {
7569 for(
size_t j=0UL; j<i; ++j ) {
7570 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default (element-wise) kernel for the scaled assignment with a column-major target and two
// non-diagonal operands. It mirrors the row-major variant with the roles of rows and columns
// exchanged: each column j of C is built from an initializing pass over the first relevant k
// followed by accumulation over the remaining k.
// NOTE(review): 'scalar' is presumably applied in the final per-column loop; confirm.
7591 template<
typename MT3
7595 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7596 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x N is the shape of the product, K the shared inner dimension.
7598 const size_t M( A.rows() );
7599 const size_t N( B.columns() );
7600 const size_t K( A.columns() );
7604 for(
size_t j=0UL; j<N; ++j )
// k-range of column j as limited by a triangular B.
7606 const size_t kbegin( ( IsLower_v<MT5> )
7607 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7609 const size_t kend( ( IsUpper_v<MT5> )
7610 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// A strictly triangular B can leave a column without any contribution.
7614 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
7615 for(
size_t i=0UL; i<M; ++i ) {
// Row range touched by the first k (= kbegin), limited by the structure of A and by the
// declared structure (LOW/UPP) of the result.
7622 const size_t ibegin( ( IsLower_v<MT4> )
7623 ?( ( IsStrictlyLower_v<MT4> )
7624 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
7625 :( LOW ?
max(j,kbegin) : kbegin ) )
7626 :( LOW ? j : 0UL ) );
7627 const size_t iend( ( IsUpper_v<MT4> )
7628 ?( ( IsStrictlyUpper_v<MT4> )
7629 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
7630 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
7631 :( UPP ? j+1UL : M ) );
// Leading part of the column, [0,ibegin).
7633 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7634 for(
size_t i=0UL; i<ibegin; ++i ) {
7638 else if( IsStrictlyLower_v<MT4> ) {
// The first contribution is assigned rather than accumulated.
7641 for(
size_t i=ibegin; i<iend; ++i ) {
7642 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Trailing part of the column, [iend,M).
7644 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7645 for(
size_t i=iend; i<M; ++i ) {
7649 else if( IsStrictlyUpper_v<MT4> ) {
7650 reset( C(M-1UL,j) );
// Accumulate the remaining k contributions.
7654 for(
size_t k=kbegin+1UL; k<kend; ++k )
7656 const size_t ibegin( ( IsLower_v<MT4> )
7657 ?( ( IsStrictlyLower_v<MT4> )
7658 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
7659 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
7660 :( SYM || HERM || LOW ? j : 0UL ) );
7661 const size_t iend( ( IsUpper_v<MT4> )
7662 ?( ( IsStrictlyUpper_v<MT4> )
7663 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
7664 :( UPP ?
min(j+1UL,k) : k ) )
7665 :( UPP ? j+1UL : M ) );
// Skip empty ranges that can arise with a structured result.
7667 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7670 for(
size_t i=ibegin; i<iend; ++i ) {
7671 C(i,j) += A(i,k) * B(k,j);
// For an upper A, row iend receives its first contribution here and is assigned.
7673 if( IsUpper_v<MT4> ) {
7674 C(iend,j) = A(iend,k) * B(k,j);
// Final per-column row range.
7679 const size_t ibegin( ( ( IsLower_v<MT4> && IsLower_v<MT5> ) )
7680 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
7681 :( SYM || HERM || LOW ? j : 0UL ) );
7682 const size_t iend( ( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) )
7683 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
7684 :( UPP ? j+1UL : M ) );
7686 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7689 for(
size_t i=ibegin; i<iend; ++i ) {
// Mirror the lower part into the strictly upper part (conjugated for HERM).
7696 for(
size_t j=1UL; j<N; ++j ) {
7697 for(
size_t i=0UL; i<j; ++i ) {
7698 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for the scaled assignment with a row-major target, a general left operand
// and a diagonal right operand. Each element reduces to C(i,j) = A(i,j) * B(j,j) * scalar.
// C is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE.
7719 template<
typename MT3
7723 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7724 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7726 constexpr
size_t block( BLOCK_SIZE );
7728 const size_t M( A.rows() );
7729 const size_t N( B.columns() );
// Tile loops: rows [ii,iend), columns [jj,jend).
7731 for(
size_t ii=0UL; ii<M; ii+=block ) {
7732 const size_t iend(
min( M, ii+block ) );
7733 for(
size_t jj=0UL; jj<N; jj+=block ) {
7734 const size_t jend(
min( N, jj+block ) );
7735 for(
size_t i=ii; i<iend; ++i )
// Part of the tile row that lies inside the triangular structure of A.
7737 const size_t jbegin( ( IsUpper_v<MT4> )
7738 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
7740 const size_t jpos( ( IsLower_v<MT4> )
7741 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
// Columns in front of the structure (upper A only).
7744 if( IsUpper_v<MT4> ) {
7745 for(
size_t j=jj; j<jbegin; ++j ) {
7749 for(
size_t j=jbegin; j<jpos; ++j ) {
7750 C(i,j) = A(i,j) * B(j,j) * scalar;
// Columns behind the structure (lower A only).
7752 if( IsLower_v<MT4> ) {
7753 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for the scaled assignment with a column-major target, a general left
// operand and a diagonal right operand. Each element reduces to
// C(i,j) = A(i,j) * B(j,j) * scalar, computed column by column.
7777 template<
typename MT3
7781 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7782 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7784 const size_t M( A.rows() );
7785 const size_t N( B.columns() );
7787 for(
size_t j=0UL; j<N; ++j )
// Row range of column j that lies inside the triangular structure of A.
7789 const size_t ibegin( ( IsLower_v<MT4> )
7790 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7792 const size_t iend( ( IsUpper_v<MT4> )
7793 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows above the structure (lower A only).
7797 if( IsLower_v<MT4> ) {
7798 for(
size_t i=0UL; i<ibegin; ++i ) {
7802 for(
size_t i=ibegin; i<iend; ++i ) {
7803 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows below the structure (upper A only).
7805 if( IsUpper_v<MT4> ) {
7806 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for the scaled assignment with a row-major target, a diagonal left operand
// and a general right operand. Each element reduces to C(i,j) = A(i,i) * B(i,j) * scalar,
// computed row by row.
7828 template<
typename MT3
7832 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7833 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7835 const size_t M( A.rows() );
7836 const size_t N( B.columns() );
7838 for(
size_t i=0UL; i<M; ++i )
// Column range of row i that lies inside the triangular structure of B.
7840 const size_t jbegin( ( IsUpper_v<MT5> )
7841 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7843 const size_t jend( ( IsLower_v<MT5> )
7844 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Columns in front of the structure (upper B only).
7848 if( IsUpper_v<MT5> ) {
7849 for(
size_t j=0UL; j<jbegin; ++j ) {
7853 for(
size_t j=jbegin; j<jend; ++j ) {
7854 C(i,j) = A(i,i) * B(i,j) * scalar;
// Columns behind the structure (lower B only).
7856 if( IsLower_v<MT5> ) {
7857 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for the scaled assignment with a column-major target, a diagonal left
// operand and a general right operand. Each element reduces to
// C(i,j) = A(i,i) * B(i,j) * scalar. C is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE.
7879 template<
typename MT3
7883 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7884 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7886 constexpr
size_t block( BLOCK_SIZE );
7888 const size_t M( A.rows() );
7889 const size_t N( B.columns() );
// Tile loops: columns [jj,jend), rows [ii,iend).
7891 for(
size_t jj=0UL; jj<N; jj+=block ) {
7892 const size_t jend(
min( N, jj+block ) );
7893 for(
size_t ii=0UL; ii<M; ii+=block ) {
7894 const size_t iend(
min( M, ii+block ) );
7895 for(
size_t j=jj; j<jend; ++j )
// Part of the tile column that lies inside the triangular structure of B.
7897 const size_t ibegin( ( IsLower_v<MT5> )
7898 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
7900 const size_t ipos( ( IsUpper_v<MT5> )
7901 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
// Rows above the structure (lower B only).
7904 if( IsLower_v<MT5> ) {
7905 for(
size_t i=ii; i<ibegin; ++i ) {
7909 for(
size_t i=ibegin; i<ipos; ++i ) {
7910 C(i,j) = A(i,i) * B(i,j) * scalar;
// Rows below the structure (upper B only).
7912 if( IsUpper_v<MT5> ) {
7913 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for the scaled assignment when both operands are diagonal: only the
// diagonal of C is computed, as C(i,i) = A(i,i) * B(i,i) * scalar.
7937 template<
typename MT3
7941 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7942 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7946 for(
size_t i=0UL; i<A.rows(); ++i ) {
7947 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of the small-matrix scaled assignment kernel: when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false, the work is forwarded unchanged to
// selectDefaultAssignKernel( C, A, B, scalar ).
7966 template<
typename MT3
7970 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7971 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7973 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel for C = ( A * B ) * scalar, selected through
// the EnableIf_t condition when C is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//   C      - target matrix, written with C.store() in the SIMD sections and with C(i,j)
//            in the scalar column loop at the end
//   A      - left-hand side operand; single elements A(i,k) are broadcast with set()
//   B      - right-hand side operand; read with B.load(k,j) along row k
//   scalar - scaling factor applied to every result before it is written
// Visible structure: column blocks of 8 and 5 SIMD vectors (loop headers visible),
// followed by sections that load 4, 3, 2 and 1 vectors per row, followed by an
// element-wise loop over the columns j in [jpos,N).
// NOTE(review): this excerpt lacks many lines of the definition (braces, several loop
// headers, accumulator updates and the bodies of some jj loops). The comments below only
// describe what is visible and mark assumptions as such.
7992 template<
typename MT3
7996 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7997 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// True if C or B is not padded; it selects the rounded column limit jpos and enables
// the element-wise column loop at the end.
7999 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
// Problem sizes: A has M rows and K columns, B has N columns.
8001 const size_t M( A.rows() );
8002 const size_t N( B.columns() );
8003 const size_t K( A.columns() );
// Column limit of the SIMD sections: N masked with size_t(-SIMDSIZE) when remainder is
// set (rounds N down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two -
// TODO confirm), otherwise N itself.
8007 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// The scalar broadcast into a SIMD vector; multiplied into the accumulators at every store.
8010 const SIMDType factor(
set( scalar ) );
// Integral element types only: 8 SIMD vectors of columns per step, one row at a time.
// The loop condition disables this path whenever SYM, HERM, LOW or UPP is set.
8014 if( IsIntegral_v<ElementType> )
8016 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*7UL) < jpos; j+=
SIMDSIZE*8UL ) {
8017 for(
size_t i=0UL; i<M; ++i )
// k-range for row i and this column block, narrowed according to the compile-time
// IsUpper/IsLower/IsStrictly* properties of A (MT4) and B (MT5).
8019 const size_t kbegin( ( IsUpper_v<MT4> )
8020 ?( ( IsLower_v<MT5> )
8021 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8022 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8023 :( IsLower_v<MT5> ? j : 0UL ) );
8024 const size_t kend( ( IsLower_v<MT4> )
8025 ?( ( IsUpper_v<MT5> )
8026 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
8027 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
8028 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMD vector of the column block.
8030 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate A(i,k) (broadcast) times the 8 vectors of row k of B.
8032 for(
size_t k=kbegin; k<kend; ++k ) {
8033 const SIMDType a1(
set( A(i,k) ) );
8034 xmm1 += a1 * B.load(k,j );
8035 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8036 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8037 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8038 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
8039 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
8040 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
8041 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Scale the sums and write the 8 vectors of row i.
8044 C.store( i, j , xmm1 * factor );
8045 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8046 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8047 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
8048 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
8049 C.store( i, j+
SIMDSIZE*5UL, xmm6 * factor );
8050 C.store( i, j+
SIMDSIZE*6UL, xmm7 * factor );
8051 C.store( i, j+
SIMDSIZE*7UL, xmm8 * factor );
// 5 SIMD vectors of columns per step; again skipped when any of SYM, HERM, LOW or UPP
// is set.
8056 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*4UL) < jpos; j+=
SIMDSIZE*5UL )
// Two rows (i and i+1) per iteration.
8060 for( ; (i+2UL) <= M; i+=2UL )
8062 const size_t kbegin( ( IsUpper_v<MT4> )
8063 ?( ( IsLower_v<MT5> )
8064 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8065 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8066 :( IsLower_v<MT5> ? j : 0UL ) );
// For a lower A the upper k-limit follows the last row of the pair (i+2, or i+1 if
// strictly lower).
8067 const size_t kend( ( IsLower_v<MT4> )
8068 ?( ( IsUpper_v<MT5> )
8069 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
8070 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8071 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// xmm1..xmm5 are stored to row i, xmm6..xmm10 to row i+1 (see the stores below).
8073 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8075 for(
size_t k=kbegin; k<kend; ++k ) {
8076 const SIMDType a1(
set( A(i ,k) ) );
8077 const SIMDType a2(
set( A(i+1UL,k) ) );
8078 const SIMDType b1( B.load(k,j ) );
8079 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8080 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8081 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
8082 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
// NOTE(review): the accumulator updates between the loads above and the stores below are
// not visible in this excerpt.
8095 C.store( i , j , xmm1 * factor );
8096 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8097 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8098 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
8099 C.store( i , j+
SIMDSIZE*4UL, xmm5 * factor );
8100 C.store( i+1UL, j , xmm6 * factor );
8101 C.store( i+1UL, j+
SIMDSIZE , xmm7 * factor );
8102 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 * factor );
8103 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 * factor );
8104 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 * factor );
// Single-row variant of the 5-vector block. Its guarding condition is not visible here
// (presumably the leftover row when M is odd - TODO confirm).
8109 const size_t kbegin( ( IsUpper_v<MT4> )
8110 ?( ( IsLower_v<MT5> )
8111 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8112 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8113 :( IsLower_v<MT5> ? j : 0UL ) );
8114 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
8116 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8118 for(
size_t k=kbegin; k<kend; ++k ) {
8119 const SIMDType a1(
set( A(i,k) ) );
8120 xmm1 += a1 * B.load(k,j );
8121 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8122 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8123 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8124 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
8127 C.store( i, j , xmm1 * factor );
8128 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8129 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8130 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
8131 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
// ---- Section working on 4 SIMD vectors of columns (its loop header is not visible) ----
// Row limit: with UPP only rows below min(j+SIMDSIZE*4,M) are processed, otherwise all M.
8137 const size_t iend( UPP ?
min(j+
SIMDSIZE*4UL,M) : M );
// Fill C(i,jj) from the transposed position C(jj,i), conjugated when HERM is set. The
// enclosing condition and row loop are not visible here.
8143 for(
size_t jj=j; jj<jjend; ++jj ) {
8144 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
// Second jj loop of this section; its guard and body are not visible in this excerpt.
8151 for(
size_t jj=j; jj<jjend; ++jj ) {
// Two rows per iteration, 4 vectors each.
8157 for( ; (i+2UL) <= iend; i+=2UL )
8159 const size_t kbegin( ( IsUpper_v<MT4> )
8160 ?( ( IsLower_v<MT5> )
8161 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8162 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8163 :( IsLower_v<MT5> ? j : 0UL ) );
8164 const size_t kend( ( IsLower_v<MT4> )
8165 ?( ( IsUpper_v<MT5> )
8166 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
8167 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8168 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
// xmm1..xmm4 are stored to row i, xmm5..xmm8 to row i+1.
8170 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8172 for(
size_t k=kbegin; k<kend; ++k ) {
8173 const SIMDType a1(
set( A(i ,k) ) );
8174 const SIMDType a2(
set( A(i+1UL,k) ) );
8175 const SIMDType b1( B.load(k,j ) );
8176 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8177 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8178 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
8189 C.store( i , j , xmm1 * factor );
8190 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8191 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8192 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
8193 C.store( i+1UL, j , xmm5 * factor );
8194 C.store( i+1UL, j+
SIMDSIZE , xmm6 * factor );
8195 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 * factor );
8196 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 * factor );
// Single-row variant of the 4-vector section (guard not visible).
8201 const size_t kbegin( ( IsUpper_v<MT4> )
8202 ?( ( IsLower_v<MT5> )
8203 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8204 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8205 :( IsLower_v<MT5> ? j : 0UL ) );
8206 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
8208 SIMDType xmm1, xmm2, xmm3, xmm4;
8210 for(
size_t k=kbegin; k<kend; ++k ) {
8211 const SIMDType a1(
set( A(i,k) ) );
8212 xmm1 += a1 * B.load(k,j );
8213 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8214 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8215 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8218 C.store( i, j , xmm1 * factor );
8219 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8220 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8221 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
// Trailing jj loop of the 4-vector section; guard and body are not visible here.
8229 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Section working on 3 SIMD vectors of columns (its loop header is not visible) ----
8238 const size_t iend( UPP ?
min(j+
SIMDSIZE*3UL,M) : M );
// Fill C(i,jj) from C(jj,i), conjugated when HERM is set (guard not visible).
8244 for(
size_t jj=j; jj<jjend; ++jj ) {
8245 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
// Second jj loop of this section; guard and body not visible.
8252 for(
size_t jj=j; jj<jjend; ++jj ) {
// Two rows per iteration, 3 vectors each.
8258 for( ; (i+2UL) <= iend; i+=2UL )
8260 const size_t kbegin( ( IsUpper_v<MT4> )
8261 ?( ( IsLower_v<MT5> )
8262 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8263 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8264 :( IsLower_v<MT5> ? j : 0UL ) );
8265 const size_t kend( ( IsLower_v<MT4> )
8266 ?( ( IsUpper_v<MT5> )
8267 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
8268 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8269 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
// xmm1..xmm3 are stored to row i, xmm4..xmm6 to row i+1.
8271 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8273 for(
size_t k=kbegin; k<kend; ++k ) {
8274 const SIMDType a1(
set( A(i ,k) ) );
8275 const SIMDType a2(
set( A(i+1UL,k) ) );
8276 const SIMDType b1( B.load(k,j ) );
8277 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8278 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8287 C.store( i , j , xmm1 * factor );
8288 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8289 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8290 C.store( i+1UL, j , xmm4 * factor );
8291 C.store( i+1UL, j+
SIMDSIZE , xmm5 * factor );
8292 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 * factor );
// Single-row variant of the 3-vector section (guard not visible).
8297 const size_t kbegin( ( IsUpper_v<MT4> )
8298 ?( ( IsLower_v<MT5> )
8299 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8300 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8301 :( IsLower_v<MT5> ? j : 0UL ) );
8302 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
8304 SIMDType xmm1, xmm2, xmm3;
8306 for(
size_t k=kbegin; k<kend; ++k ) {
8307 const SIMDType a1(
set( A(i,k) ) );
8308 xmm1 += a1 * B.load(k,j );
8309 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8310 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8313 C.store( i, j , xmm1 * factor );
8314 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8315 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
// Trailing jj loop of the 3-vector section; guard and body are not visible here.
8323 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Section working on 2 SIMD vectors of columns (its loop header is not visible) ----
8332 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
// Fill C(i,jj) from C(jj,i), conjugated when HERM is set (guard not visible).
8338 for(
size_t jj=j; jj<jjend; ++jj ) {
8339 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
// Second jj loop of this section; guard and body not visible.
8346 for(
size_t jj=j; jj<jjend; ++jj ) {
// Four rows per iteration, 2 vectors each.
8352 for( ; (i+4UL) <= iend; i+=4UL )
8354 const size_t kbegin( ( IsUpper_v<MT4> )
8355 ?( ( IsLower_v<MT5> )
8356 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8357 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8358 :( IsLower_v<MT5> ? j : 0UL ) );
8359 const size_t kend( ( IsLower_v<MT4> )
8360 ?( ( IsUpper_v<MT5> )
8361 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
8362 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8363 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
// Two accumulators per row: (xmm1,xmm2) row i ... (xmm7,xmm8) row i+3.
8365 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8367 for(
size_t k=kbegin; k<kend; ++k ) {
8368 const SIMDType a1(
set( A(i ,k) ) );
8369 const SIMDType a2(
set( A(i+1UL,k) ) );
8370 const SIMDType a3(
set( A(i+2UL,k) ) );
8371 const SIMDType a4(
set( A(i+3UL,k) ) );
8372 const SIMDType b1( B.load(k,j ) );
8373 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8384 C.store( i , j , xmm1 * factor );
8385 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8386 C.store( i+1UL, j , xmm3 * factor );
8387 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8388 C.store( i+2UL, j , xmm5 * factor );
8389 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
8390 C.store( i+3UL, j , xmm7 * factor );
8391 C.store( i+3UL, j+
SIMDSIZE, xmm8 * factor );
// Three rows per iteration, 2 vectors each.
8394 for( ; (i+3UL) <= iend; i+=3UL )
8396 const size_t kbegin( ( IsUpper_v<MT4> )
8397 ?( ( IsLower_v<MT5> )
8398 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8399 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8400 :( IsLower_v<MT5> ? j : 0UL ) );
8401 const size_t kend( ( IsLower_v<MT4> )
8402 ?( ( IsUpper_v<MT5> )
8403 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
8404 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8405 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8407 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8409 for(
size_t k=kbegin; k<kend; ++k ) {
8410 const SIMDType a1(
set( A(i ,k) ) );
8411 const SIMDType a2(
set( A(i+1UL,k) ) );
8412 const SIMDType a3(
set( A(i+2UL,k) ) );
8413 const SIMDType b1( B.load(k,j ) );
8414 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8423 C.store( i , j , xmm1 * factor );
8424 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8425 C.store( i+1UL, j , xmm3 * factor );
8426 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8427 C.store( i+2UL, j , xmm5 * factor );
8428 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
// Two rows per iteration, 2 vectors each; here the k loop is additionally unrolled by 2.
8431 for( ; (i+2UL) <= iend; i+=2UL )
8433 const size_t kbegin( ( IsUpper_v<MT4> )
8434 ?( ( IsLower_v<MT5> )
8435 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8436 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8437 :( IsLower_v<MT5> ? j : 0UL ) );
8438 const size_t kend( ( IsLower_v<MT4> )
8439 ?( ( IsUpper_v<MT5> )
8440 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
8441 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8442 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
// xmm1..xmm4 and xmm5..xmm8 are summed pairwise at the stores below.
8444 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Two k steps (k and k+1) per iteration.
8447 for( ; (k+2UL) <= kend; k+=2UL ) {
8448 const SIMDType a1(
set( A(i ,k ) ) );
8449 const SIMDType a2(
set( A(i+1UL,k ) ) );
8450 const SIMDType a3(
set( A(i ,k+1UL) ) );
8451 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
8452 const SIMDType b1( B.load(k ,j ) );
8453 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
8454 const SIMDType b3( B.load(k+1UL,j ) );
8455 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Remaining k steps of the unrolled loop above.
8466 for( ; k<kend; ++k ) {
8467 const SIMDType a1(
set( A(i ,k) ) );
8468 const SIMDType a2(
set( A(i+1UL,k) ) );
8469 const SIMDType b1( B.load(k,j ) );
8470 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
// Combine the two accumulator sets, scale and store.
8477 C.store( i , j , (xmm1+xmm5) * factor );
8478 C.store( i , j+
SIMDSIZE, (xmm2+xmm6) * factor );
8479 C.store( i+1UL, j , (xmm3+xmm7) * factor );
8480 C.store( i+1UL, j+
SIMDSIZE, (xmm4+xmm8) * factor );
// Single-row variant of the 2-vector section (guard not visible), k unrolled by 2.
8485 const size_t kbegin( ( IsUpper_v<MT4> )
8486 ?( ( IsLower_v<MT5> )
8487 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8488 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8489 :( IsLower_v<MT5> ? j : 0UL ) );
8490 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
8492 SIMDType xmm1, xmm2, xmm3, xmm4;
8495 for( ; (k+2UL) <= kend; k+=2UL ) {
8496 const SIMDType a1(
set( A(i,k ) ) );
8497 const SIMDType a2(
set( A(i,k+1UL) ) );
8498 xmm1 += a1 * B.load(k ,j );
8499 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
8500 xmm3 += a2 * B.load(k+1UL,j );
8501 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
// Remaining k steps (the update of xmm2 is not visible in this excerpt).
8504 for( ; k<kend; ++k ) {
8505 const SIMDType a1(
set( A(i,k) ) );
8506 xmm1 += a1 * B.load(k,j );
8510 C.store( i, j , (xmm1+xmm3) * factor );
8511 C.store( i, j+
SIMDSIZE, (xmm2+xmm4) * factor );
// Trailing jj loop of the 2-vector section; guard and body are not visible here.
8519 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Section working on a single SIMD vector of columns (loop header and the
// definition of iend are not visible) ----
// Fill C(i,jj) from C(jj,i), conjugated when HERM is set (guard not visible).
8534 for(
size_t jj=j; jj<jjend; ++jj ) {
8535 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
// Second jj loop of this section; guard and body not visible.
8542 for(
size_t jj=j; jj<jjend; ++jj ) {
// Four rows per iteration, one vector each, k unrolled by 2.
8548 for( ; (i+4UL) <= iend; i+=4UL )
8550 const size_t kbegin( ( IsUpper_v<MT4> )
8551 ?( ( IsLower_v<MT5> )
8552 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8553 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8554 :( IsLower_v<MT5> ? j : 0UL ) );
8555 const size_t kend( ( IsLower_v<MT4> )
8556 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
// xmm1..xmm4 accumulate the even k steps, xmm5..xmm8 the odd ones; both sets are summed
// at the stores.
8559 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8562 for( ; (k+2UL) <= kend; k+=2UL ) {
8563 const SIMDType b1( B.load(k ,j) );
8564 const SIMDType b2( B.load(k+1UL,j) );
8565 xmm1 +=
set( A(i ,k ) ) * b1;
8566 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8567 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8568 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8569 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8570 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8571 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8572 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
// Remaining k steps.
8575 for( ; k<kend; ++k ) {
8576 const SIMDType b1( B.load(k,j) );
8577 xmm1 +=
set( A(i ,k) ) * b1;
8578 xmm2 +=
set( A(i+1UL,k) ) * b1;
8579 xmm3 +=
set( A(i+2UL,k) ) * b1;
8580 xmm4 +=
set( A(i+3UL,k) ) * b1;
8583 C.store( i , j, (xmm1+xmm5) * factor );
8584 C.store( i+1UL, j, (xmm2+xmm6) * factor );
8585 C.store( i+2UL, j, (xmm3+xmm7) * factor );
8586 C.store( i+3UL, j, (xmm4+xmm8) * factor );
// Three rows per iteration, one vector each, k unrolled by 2.
8589 for( ; (i+3UL) <= iend; i+=3UL )
8591 const size_t kbegin( ( IsUpper_v<MT4> )
8592 ?( ( IsLower_v<MT5> )
8593 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8594 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8595 :( IsLower_v<MT5> ? j : 0UL ) );
8596 const size_t kend( ( IsLower_v<MT4> )
8597 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8600 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8603 for( ; (k+2UL) <= kend; k+=2UL ) {
8604 const SIMDType b1( B.load(k ,j) );
8605 const SIMDType b2( B.load(k+1UL,j) );
8606 xmm1 +=
set( A(i ,k ) ) * b1;
8607 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8608 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8609 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8610 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8611 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
// Remaining k steps.
8614 for( ; k<kend; ++k ) {
8615 const SIMDType b1( B.load(k,j) );
8616 xmm1 +=
set( A(i ,k) ) * b1;
8617 xmm2 +=
set( A(i+1UL,k) ) * b1;
8618 xmm3 +=
set( A(i+2UL,k) ) * b1;
8621 C.store( i , j, (xmm1+xmm4) * factor );
8622 C.store( i+1UL, j, (xmm2+xmm5) * factor );
8623 C.store( i+2UL, j, (xmm3+xmm6) * factor );
// Two rows per iteration, one vector each, k unrolled by 2.
8626 for( ; (i+2UL) <= iend; i+=2UL )
8628 const size_t kbegin( ( IsUpper_v<MT4> )
8629 ?( ( IsLower_v<MT5> )
8630 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8631 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8632 :( IsLower_v<MT5> ? j : 0UL ) );
8633 const size_t kend( ( IsLower_v<MT4> )
8634 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8637 SIMDType xmm1, xmm2, xmm3, xmm4;
8640 for( ; (k+2UL) <= kend; k+=2UL ) {
8641 const SIMDType b1( B.load(k ,j) );
8642 const SIMDType b2( B.load(k+1UL,j) );
8643 xmm1 +=
set( A(i ,k ) ) * b1;
8644 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8645 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8646 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
// Remaining k steps.
8649 for( ; k<kend; ++k ) {
8650 const SIMDType b1( B.load(k,j) );
8651 xmm1 +=
set( A(i ,k) ) * b1;
8652 xmm2 +=
set( A(i+1UL,k) ) * b1;
8655 C.store( i , j, (xmm1+xmm3) * factor );
8656 C.store( i+1UL, j, (xmm2+xmm4) * factor );
// Single-row variant of the 1-vector section (guard not visible); the k loop runs up to
// K directly, no kend is computed here.
8661 const size_t kbegin( ( IsUpper_v<MT4> )
8662 ?( ( IsLower_v<MT5> )
8663 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8664 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8665 :( IsLower_v<MT5> ? j : 0UL ) );
8667 SIMDType xmm1, xmm2;
8670 for( ; (k+2UL) <= K; k+=2UL ) {
8671 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8672 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
// Leftover k step (its loop header is not visible in this excerpt).
8676 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8679 C.store( i, j, (xmm1+xmm2) * factor );
// Trailing jj loop of the 1-vector section; guard and body are not visible here.
8687 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Element-wise loop over the columns not covered by the SIMD sections; it only runs
// when remainder is true ----
8694 for( ; remainder && j<N; ++j )
// Take C(i,j) from the transposed position C(j,i), conjugated when HERM is set (the
// enclosing condition and loop are not visible).
8700 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Two rows per iteration, plain scalar dot products.
8709 for( ; (i+2UL) <= M; i+=2UL )
8711 const size_t kbegin( ( IsUpper_v<MT4> )
8712 ?( ( IsLower_v<MT5> )
8713 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8714 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8715 :( IsLower_v<MT5> ? j : 0UL ) );
8716 const size_t kend( ( IsLower_v<MT4> )
8717 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
// value1/value2 are declared on lines not visible in this excerpt.
8723 for(
size_t k=kbegin; k<kend; ++k ) {
8724 value1 += A(i ,k) * B(k,j);
8725 value2 += A(i+1UL,k) * B(k,j);
// The scalar code path multiplies by scalar directly instead of the SIMD factor.
8728 C(i ,j) = value1 * scalar;
8729 C(i+1UL,j) = value2 * scalar;
// Single-row variant of the element-wise loop (guard not visible); k runs up to K.
8734 const size_t kbegin( ( IsUpper_v<MT4> )
8735 ?( ( IsLower_v<MT5> )
8736 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8737 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8738 :( IsLower_v<MT5> ? j : 0UL ) );
8742 for(
size_t k=kbegin; k<K; ++k ) {
8743 value += A(i,k) * B(k,j);
8746 C(i,j) = value * scalar;
8767 template<
typename MT3
8771 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8772 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8774 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
8776 const size_t M( A.rows() );
8777 const size_t N( B.columns() );
8778 const size_t K( A.columns() );
8782 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
8785 const SIMDType factor(
set( scalar ) );
8789 if( IsIntegral_v<ElementType> )
8791 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*7UL) < ipos; i+=
SIMDSIZE*8UL ) {
8792 for(
size_t j=0UL; j<N; ++j )
8794 const size_t kbegin( ( IsLower_v<MT5> )
8795 ?( ( IsUpper_v<MT4> )
8796 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8797 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8798 :( IsUpper_v<MT4> ? i : 0UL ) );
8799 const size_t kend( ( IsUpper_v<MT5> )
8800 ?( ( IsLower_v<MT4> )
8801 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
8802 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
8803 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
8805 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8807 for(
size_t k=kbegin; k<kend; ++k ) {
8808 const SIMDType b1(
set( B(k,j) ) );
8809 xmm1 += A.load(i ,k) * b1;
8810 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8811 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8812 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8813 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
8814 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
8815 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
8816 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
8819 C.store( i , j, xmm1 * factor );
8820 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8821 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
8822 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
8823 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
8824 C.store( i+
SIMDSIZE*5UL, j, xmm6 * factor );
8825 C.store( i+
SIMDSIZE*6UL, j, xmm7 * factor );
8826 C.store( i+
SIMDSIZE*7UL, j, xmm8 * factor );
8831 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*4UL) < ipos; i+=
SIMDSIZE*5UL )
8835 for( ; (j+2UL) <= N; j+=2UL )
8837 const size_t kbegin( ( IsLower_v<MT5> )
8838 ?( ( IsUpper_v<MT4> )
8839 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8840 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8841 :( IsUpper_v<MT4> ? i : 0UL ) );
8842 const size_t kend( ( IsUpper_v<MT5> )
8843 ?( ( IsLower_v<MT4> )
8844 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8845 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8846 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
8848 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
8850 for(
size_t k=kbegin; k<kend; ++k ) {
8851 const SIMDType a1( A.load(i ,k) );
8852 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8853 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8854 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8855 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
8856 const SIMDType b1(
set( B(k,j ) ) );
8857 const SIMDType b2(
set( B(k,j+1UL) ) );
8870 C.store( i , j , xmm1 * factor );
8871 C.store( i+
SIMDSIZE , j , xmm2 * factor );
8872 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
8873 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
8874 C.store( i+
SIMDSIZE*4UL, j , xmm5 * factor );
8875 C.store( i , j+1UL, xmm6 * factor );
8876 C.store( i+
SIMDSIZE , j+1UL, xmm7 * factor );
8877 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 * factor );
8878 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 * factor );
8879 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 * factor );
8884 const size_t kbegin( ( IsLower_v<MT5> )
8885 ?( ( IsUpper_v<MT4> )
8886 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8887 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8888 :( IsUpper_v<MT4> ? i : 0UL ) );
8889 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
8891 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
8893 for(
size_t k=kbegin; k<kend; ++k ) {
8894 const SIMDType b1(
set( B(k,j) ) );
8895 xmm1 += A.load(i ,k) * b1;
8896 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8897 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8898 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8899 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
8902 C.store( i , j, xmm1 * factor );
8903 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8904 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
8905 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
8906 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
8912 const size_t jend( LOW ?
min(i+
SIMDSIZE*4UL,N) : N );
8918 for(
size_t ii=i; ii<iiend; ++ii ) {
8919 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
8926 for(
size_t ii=i; ii<iiend; ++ii ) {
8932 for( ; (j+2UL) <= jend; j+=2UL )
8934 const size_t kbegin( ( IsLower_v<MT5> )
8935 ?( ( IsUpper_v<MT4> )
8936 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8937 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8938 :( IsUpper_v<MT4> ? i : 0UL ) );
8939 const size_t kend( ( IsUpper_v<MT5> )
8940 ?( ( IsLower_v<MT4> )
8941 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8942 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8943 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
8945 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8947 for(
size_t k=kbegin; k<kend; ++k ) {
8948 const SIMDType a1( A.load(i ,k) );
8949 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8950 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8951 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8952 const SIMDType b1(
set( B(k,j ) ) );
8953 const SIMDType b2(
set( B(k,j+1UL) ) );
8964 C.store( i , j , xmm1 * factor );
8965 C.store( i+
SIMDSIZE , j , xmm2 * factor );
8966 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
8967 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
8968 C.store( i , j+1UL, xmm5 * factor );
8969 C.store( i+
SIMDSIZE , j+1UL, xmm6 * factor );
8970 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 * factor );
8971 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 * factor );
8976 const size_t kbegin( ( IsLower_v<MT5> )
8977 ?( ( IsUpper_v<MT4> )
8978 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8979 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8980 :( IsUpper_v<MT4> ? i : 0UL ) );
8981 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
8983 SIMDType xmm1, xmm2, xmm3, xmm4;
8985 for(
size_t k=kbegin; k<kend; ++k ) {
8986 const SIMDType b1(
set( B(k,j) ) );
8987 xmm1 += A.load(i ,k) * b1;
8988 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8989 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8990 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8993 C.store( i , j, xmm1 * factor );
8994 C.store( i+
SIMDSIZE , j, xmm2 * factor );
8995 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
8996 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
9004 for(
size_t ii=i; ii<iiend; ++ii ) {
9013 const size_t jend( LOW ?
min(i+
SIMDSIZE*3UL,N) : N );
9019 for(
size_t ii=i; ii<iiend; ++ii ) {
9020 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
9027 for(
size_t ii=i; ii<iiend; ++ii ) {
9033 for( ; (j+2UL) <= jend; j+=2UL )
9035 const size_t kbegin( ( IsLower_v<MT5> )
9036 ?( ( IsUpper_v<MT4> )
9037 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9038 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9039 :( IsUpper_v<MT4> ? i : 0UL ) );
9040 const size_t kend( ( IsUpper_v<MT5> )
9041 ?( ( IsLower_v<MT4> )
9042 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9043 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9044 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
9046 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9048 for(
size_t k=kbegin; k<kend; ++k ) {
9049 const SIMDType a1( A.load(i ,k) );
9050 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
9051 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
9052 const SIMDType b1(
set( B(k,j ) ) );
9053 const SIMDType b2(
set( B(k,j+1UL) ) );
9062 C.store( i , j , xmm1 * factor );
9063 C.store( i+
SIMDSIZE , j , xmm2 * factor );
9064 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
9065 C.store( i , j+1UL, xmm4 * factor );
9066 C.store( i+
SIMDSIZE , j+1UL, xmm5 * factor );
9067 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 * factor );
9072 const size_t kbegin( ( IsLower_v<MT5> )
9073 ?( ( IsUpper_v<MT4> )
9074 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9075 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9076 :( IsUpper_v<MT4> ? i : 0UL ) );
9077 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
9079 SIMDType xmm1, xmm2, xmm3;
9081 for(
size_t k=kbegin; k<kend; ++k ) {
9082 const SIMDType b1(
set( B(k,j) ) );
9083 xmm1 += A.load(i ,k) * b1;
9084 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
9085 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
9088 C.store( i , j, xmm1 * factor );
9089 C.store( i+
SIMDSIZE , j, xmm2 * factor );
9090 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
9098 for(
size_t ii=i; ii<iiend; ++ii ) {
9107 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
9113 for(
size_t ii=i; ii<iiend; ++ii ) {
9114 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
9121 for(
size_t ii=i; ii<iiend; ++ii ) {
9127 for( ; (j+4UL) <= jend; j+=4UL )
9129 const size_t kbegin( ( IsLower_v<MT5> )
9130 ?( ( IsUpper_v<MT4> )
9131 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9132 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9133 :( IsUpper_v<MT4> ? i : 0UL ) );
9134 const size_t kend( ( IsUpper_v<MT5> )
9135 ?( ( IsLower_v<MT4> )
9136 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
9137 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
9138 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
9140 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9142 for(
size_t k=kbegin; k<kend; ++k ) {
9143 const SIMDType a1( A.load(i ,k) );
9144 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
9145 const SIMDType b1(
set( B(k,j ) ) );
9146 const SIMDType b2(
set( B(k,j+1UL) ) );
9147 const SIMDType b3(
set( B(k,j+2UL) ) );
9148 const SIMDType b4(
set( B(k,j+3UL) ) );
9159 C.store( i , j , xmm1 * factor );
9160 C.store( i+
SIMDSIZE, j , xmm2 * factor );
9161 C.store( i , j+1UL, xmm3 * factor );
9162 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
9163 C.store( i , j+2UL, xmm5 * factor );
9164 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
9165 C.store( i , j+3UL, xmm7 * factor );
9166 C.store( i+
SIMDSIZE, j+3UL, xmm8 * factor );
9169 for( ; (j+3UL) <= jend; j+=3UL )
9171 const size_t kbegin( ( IsLower_v<MT5> )
9172 ?( ( IsUpper_v<MT4> )
9173 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9174 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9175 :( IsUpper_v<MT4> ? i : 0UL ) );
9176 const size_t kend( ( IsUpper_v<MT5> )
9177 ?( ( IsLower_v<MT4> )
9178 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
9179 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
9180 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
9182 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9184 for(
size_t k=kbegin; k<kend; ++k ) {
9185 const SIMDType a1( A.load(i ,k) );
9186 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
9187 const SIMDType b1(
set( B(k,j ) ) );
9188 const SIMDType b2(
set( B(k,j+1UL) ) );
9189 const SIMDType b3(
set( B(k,j+2UL) ) );
9198 C.store( i , j , xmm1 * factor );
9199 C.store( i+
SIMDSIZE, j , xmm2 * factor );
9200 C.store( i , j+1UL, xmm3 * factor );
9201 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
9202 C.store( i , j+2UL, xmm5 * factor );
9203 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
9206 for( ; (j+2UL) <= jend; j+=2UL )
9208 const size_t kbegin( ( IsLower_v<MT5> )
9209 ?( ( IsUpper_v<MT4> )
9210 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9211 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9212 :( IsUpper_v<MT4> ? i : 0UL ) );
9213 const size_t kend( ( IsUpper_v<MT5> )
9214 ?( ( IsLower_v<MT4> )
9215 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9216 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9217 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
9219 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9222 for( ; (k+2UL) <= kend; k+=2UL ) {
9223 const SIMDType a1( A.load(i ,k ) );
9224 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
9225 const SIMDType a3( A.load(i ,k+1UL) );
9226 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
9227 const SIMDType b1(
set( B(k ,j ) ) );
9228 const SIMDType b2(
set( B(k ,j+1UL) ) );
9229 const SIMDType b3(
set( B(k+1UL,j ) ) );
9230 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
9241 for( ; k<kend; ++k ) {
9242 const SIMDType a1( A.load(i ,k) );
9243 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
9244 const SIMDType b1(
set( B(k,j ) ) );
9245 const SIMDType b2(
set( B(k,j+1UL) ) );
9252 C.store( i , j , (xmm1+xmm5) * factor );
9253 C.store( i+
SIMDSIZE, j , (xmm2+xmm6) * factor );
9254 C.store( i , j+1UL, (xmm3+xmm7) * factor );
9255 C.store( i+
SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
9260 const size_t kbegin( ( IsLower_v<MT5> )
9261 ?( ( IsUpper_v<MT4> )
9262 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9263 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9264 :( IsUpper_v<MT4> ? i : 0UL ) );
9265 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
9267 SIMDType xmm1, xmm2, xmm3, xmm4;
9270 for( ; (k+2UL) <= kend; k+=2UL ) {
9271 const SIMDType b1(
set( B(k ,j) ) );
9272 const SIMDType b2(
set( B(k+1UL,j) ) );
9273 xmm1 += A.load(i ,k ) * b1;
9274 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
9275 xmm3 += A.load(i ,k+1UL) * b2;
9276 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
9279 for( ; k<kend; ++k ) {
9280 const SIMDType b1(
set( B(k,j) ) );
9281 xmm1 += A.load(i ,k) * b1;
9285 C.store( i , j, (xmm1+xmm3) * factor );
9286 C.store( i+
SIMDSIZE, j, (xmm2+xmm4) * factor );
9294 for(
size_t ii=i; ii<iiend; ++ii ) {
9309 for(
size_t ii=i; ii<iiend; ++ii ) {
9310 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
9317 for(
size_t ii=i; ii<iiend; ++ii ) {
9323 for( ; (j+4UL) <= jend; j+=4UL )
9325 const size_t kbegin( ( IsLower_v<MT5> )
9326 ?( ( IsUpper_v<MT4> )
9327 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9328 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9329 :( IsUpper_v<MT4> ? i : 0UL ) );
9330 const size_t kend( ( IsUpper_v<MT5> )
9331 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
9334 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
9337 for( ; (k+2UL) <= kend; k+=2UL ) {
9338 const SIMDType a1( A.load(i,k ) );
9339 const SIMDType a2( A.load(i,k+1UL) );
9340 xmm1 += a1 *
set( B(k ,j ) );
9341 xmm2 += a1 *
set( B(k ,j+1UL) );
9342 xmm3 += a1 *
set( B(k ,j+2UL) );
9343 xmm4 += a1 *
set( B(k ,j+3UL) );
9344 xmm5 += a2 *
set( B(k+1UL,j ) );
9345 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
9346 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
9347 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
9350 for( ; k<kend; ++k ) {
9351 const SIMDType a1( A.load(i,k) );
9352 xmm1 += a1 *
set( B(k,j ) );
9353 xmm2 += a1 *
set( B(k,j+1UL) );
9354 xmm3 += a1 *
set( B(k,j+2UL) );
9355 xmm4 += a1 *
set( B(k,j+3UL) );
9358 C.store( i, j , (xmm1+xmm5) * factor );
9359 C.store( i, j+1UL, (xmm2+xmm6) * factor );
9360 C.store( i, j+2UL, (xmm3+xmm7) * factor );
9361 C.store( i, j+3UL, (xmm4+xmm8) * factor );
9364 for( ; (j+3UL) <= jend; j+=3UL )
9366 const size_t kbegin( ( IsLower_v<MT5> )
9367 ?( ( IsUpper_v<MT4> )
9368 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9369 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9370 :( IsUpper_v<MT4> ? i : 0UL ) );
9371 const size_t kend( ( IsUpper_v<MT5> )
9372 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
9375 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
9378 for( ; (k+2UL) <= kend; k+=2UL ) {
9379 const SIMDType a1( A.load(i,k ) );
9380 const SIMDType a2( A.load(i,k+1UL) );
9381 xmm1 += a1 *
set( B(k ,j ) );
9382 xmm2 += a1 *
set( B(k ,j+1UL) );
9383 xmm3 += a1 *
set( B(k ,j+2UL) );
9384 xmm4 += a2 *
set( B(k+1UL,j ) );
9385 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
9386 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
9389 for( ; k<kend; ++k ) {
9390 const SIMDType a1( A.load(i,k) );
9391 xmm1 += a1 *
set( B(k,j ) );
9392 xmm2 += a1 *
set( B(k,j+1UL) );
9393 xmm3 += a1 *
set( B(k,j+2UL) );
9396 C.store( i, j , (xmm1+xmm4) * factor );
9397 C.store( i, j+1UL, (xmm2+xmm5) * factor );
9398 C.store( i, j+2UL, (xmm3+xmm6) * factor );
9401 for( ; (j+2UL) <= jend; j+=2UL )
9403 const size_t kbegin( ( IsLower_v<MT5> )
9404 ?( ( IsUpper_v<MT4> )
9405 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9406 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9407 :( IsUpper_v<MT4> ? i : 0UL ) );
9408 const size_t kend( ( IsUpper_v<MT5> )
9409 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9412 SIMDType xmm1, xmm2, xmm3, xmm4;
9415 for( ; k<kend; ++k ) {
9416 const SIMDType a1( A.load(i,k) );
9417 xmm1 += a1 *
set( B(k,j ) );
9418 xmm2 += a1 *
set( B(k,j+1UL) );
9421 for( ; (k+2UL) <= kend; k+=2UL ) {
9422 const SIMDType a1( A.load(i,k ) );
9423 const SIMDType a2( A.load(i,k+1UL) );
9424 xmm1 += a1 *
set( B(k ,j ) );
9425 xmm2 += a1 *
set( B(k ,j+1UL) );
9426 xmm3 += a2 *
set( B(k+1UL,j ) );
9427 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
9430 C.store( i, j , (xmm1+xmm3) * factor );
9431 C.store( i, j+1UL, (xmm2+xmm4) * factor );
9436 const size_t kbegin( ( IsLower_v<MT5> )
9437 ?( ( IsUpper_v<MT4> )
9438 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9439 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9440 :( IsUpper_v<MT4> ? i : 0UL ) );
9442 SIMDType xmm1, xmm2;
9445 for( ; (k+2UL) <= K; k+=2UL ) {
9446 xmm1 += A.load(i,k ) *
set( B(k ,j) );
9447 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
9451 xmm1 += A.load(i,k) *
set( B(k,j) );
9454 C.store( i, j, (xmm1+xmm2) * factor );
9462 for(
size_t ii=i; ii<iiend; ++ii ) {
9469 for( ; remainder && i<M; ++i )
9475 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
9484 for( ; (j+2UL) <= N; j+=2UL )
9486 const size_t kbegin( ( IsLower_v<MT5> )
9487 ?( ( IsUpper_v<MT4> )
9488 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9489 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9490 :( IsUpper_v<MT4> ? i : 0UL ) );
9491 const size_t kend( ( IsUpper_v<MT5> )
9492 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9498 for(
size_t k=kbegin; k<kend; ++k ) {
9499 value1 += A(i,k) * B(k,j );
9500 value2 += A(i,k) * B(k,j+1UL);
9503 C(i,j ) = value1 * scalar;
9504 C(i,j+1UL) = value2 * scalar;
9509 const size_t kbegin( ( IsLower_v<MT5> )
9510 ?( ( IsUpper_v<MT4> )
9511 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9512 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9513 :( IsUpper_v<MT4> ? i : 0UL ) );
9517 for(
size_t k=kbegin; k<K; ++k ) {
9518 value += A(i,k) * B(k,j);
9521 C(i,j) = value * scalar;
// Large-matrix assignment kernel for the scaled product, fallback overload.
// Selected through DisableIf_t when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It forwards C, A, B and the scalar unchanged to selectDefaultAssignKernel().
// (The remaining template parameters and the braces are not visible in this excerpt.)
9541 template<
typename MT3
9545 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9546 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9548 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel for the scaled product, vectorized overload.
// Selected through EnableIf_t when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// Exactly one of the five multiplication routines below is meant to run; the conditions that
// choose between them are not visible in this excerpt.
// NOTE(review): presumably the branches test SYM / HERM / LOW / UPP in that order -- confirm.
9567 template<
typename MT3
9571 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9572 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// smmm() and hmmm() receive only the scaling factor ...
9575 smmm( C, A, B, scalar );
9577 hmmm( C, A, B, scalar );
// ... whereas lmmm(), ummm() and mmm() take an additional trailing ST2(0) argument.
9579 lmmm( C, A, B, scalar, ST2(0) );
9581 ummm( C, A, B, scalar, ST2(0) );
9583 mmm( C, A, B, scalar, ST2(0) );
// BLAS-based assignment kernel for the scaled product, fallback overload.
// Selected through DisableIf_t when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false; in that case
// the work is delegated to selectLargeAssignKernel() with the same arguments.
9601 template<
typename MT3
9605 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9606 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9608 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based assignment kernel for the scaled product, BLAS overload.
// Only compiled when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// non-zero, and only selected when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// (The matching #endif and the statements that precede each trmm() call are not visible here.)
9613 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 9627 template<
typename MT3
9631 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9632 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target matrix before it is handed to BLAS.
9634 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm() is applied from the left, with the lower/upper flag taken
// from IsLower_v<MT4>.
9636 if( IsTriangular_v<MT4> ) {
9638 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular right operand: trmm() is applied from the right, with the lower/upper flag taken
// from IsLower_v<MT5>.
9640 else if( IsTriangular_v<MT5> ) {
9642 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Otherwise a general gemm() call is issued with factors ET(scalar) and ET(0).
9645 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment of the scaled product via a temporary (the function signature itself is not
// visible in this excerpt).
// NOTE(review): presumably this is the overload for sparse left-hand sides -- confirm.
9663 template<
typename MT
// The temporary uses ResultType when SO is true and OppositeType otherwise.
9669 using TmpType = If_t< SO, ResultType, OppositeType >;
9681 const ForwardFunctor fwd;
// The expression is evaluated serially into the temporary, which is then passed through the
// forward functor and assigned to the target.
9683 const TmpType tmp(
serial( rhs ) );
9684 assign( ~lhs, fwd( tmp ) );
// Addition assignment of a scaled matrix product to a dense matrix (lhs += rhs).
// Several lines are missing from this excerpt, among them the definitions of A and B and the
// body of the empty-operand check.
9700 template<
typename MT
9702 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped multiplication expression.
9709 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9710 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or no columns, or the inner dimension is zero.
// NOTE(review): the body is elided; presumably it returns early -- confirm.
9712 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the scaling factor stored in the expression.
9726 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses between the small-matrix kernel and the BLAS kernel for C += scalar * A * B.
9741 template<
typename MT3
9745 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used when any of the following holds:
//  - both operands are diagonal,
//  - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//  - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
9747 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
9748 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
9749 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
9750 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
9751 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection is used (the 'else' line is not visible in this excerpt).
9753 selectBlasAddAssignKernel( C, A, B, scalar );
// Default add-assign kernel for the general case: enabled when neither A (MT4) nor B (MT5)
// is diagonal. A temporary named 'tmp' is added to C; its definition is elided in this excerpt.
// NOTE(review): presumably 'tmp' holds the evaluated scaled product -- confirm.
9771 template<
typename MT3
9775 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9776 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9779 addAssign( C, tmp );
// Default add-assign kernel for a row-major target with a general A and a diagonal B.
// Because B is diagonal, only B(j,j) is read: C(i,j) += A(i,j) * B(j,j) * scalar.
9797 template<
typename MT3
9801 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9802 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// The iteration space is traversed in square tiles of BLOCK_SIZE x BLOCK_SIZE.
9804 constexpr
size_t block( BLOCK_SIZE );
9806 const size_t M( A.rows() );
9807 const size_t N( B.columns() );
9809 for(
size_t ii=0UL; ii<M; ii+=block ) {
9810 const size_t iend(
min( M, ii+block ) );
9811 for(
size_t jj=0UL; jj<N; jj+=block ) {
9812 const size_t jend(
min( N, jj+block ) );
9813 for(
size_t i=ii; i<iend; ++i )
// For an upper A the column range of row i starts at the diagonal (one past it if strictly
// upper), but never before the tile start jj. The non-upper alternative is elided here.
9815 const size_t jbegin( ( IsUpper_v<MT4> )
9816 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
// For a lower A the column range of row i ends at the diagonal (exclusive if strictly lower),
// but never beyond the tile end jend. The non-lower alternative is elided here.
9818 const size_t jpos( ( IsLower_v<MT4> )
9819 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
9822 for(
size_t j=jbegin; j<jpos; ++j ) {
9823 C(i,j) += A(i,j) * B(j,j) * scalar;
// Default add-assign kernel for a column-major target with a general A and a diagonal B.
// Each column j of C is updated with C(i,j) += A(i,j) * B(j,j) * scalar.
9845 template<
typename MT3
9849 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9850 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9852 const size_t M( A.rows() );
9853 const size_t N( B.columns() );
9855 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of A; the alternatives for a
// non-lower / non-upper A are elided in this excerpt.
9857 const size_t ibegin( ( IsLower_v<MT4> )
9858 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
9860 const size_t iend( ( IsUpper_v<MT4> )
9861 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// The rows are processed two at a time: ipos is ibegin plus the row count rounded down to an
// even number (inum & size_t(-2)).
9865 const size_t inum( iend - ibegin );
9866 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
9868 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9869 C(i ,j) += A(i ,j) * B(j,j) * scalar;
9870 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the single leftover row at ipos; the guarding condition is elided in this excerpt.
9873 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default add-assign kernel for a row-major target with a diagonal A and a general B.
// Because A is diagonal, only A(i,i) is read: C(i,j) += A(i,i) * B(i,j) * scalar.
9893 template<
typename MT3
9897 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9898 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9900 const size_t M( A.rows() );
9901 const size_t N( B.columns() );
9903 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the triangular structure of B; the alternatives for a
// non-upper / non-lower B are elided in this excerpt.
9905 const size_t jbegin( ( IsUpper_v<MT5> )
9906 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
9908 const size_t jend( ( IsLower_v<MT5> )
9909 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// The columns are processed two at a time: jpos is jbegin plus the column count rounded down
// to an even number (jnum & size_t(-2)).
9913 const size_t jnum( jend - jbegin );
9914 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
9916 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9917 C(i,j ) += A(i,i) * B(i,j ) * scalar;
9918 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the single leftover column at jpos; the guarding condition is elided in this excerpt.
9921 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default add-assign kernel for a column-major target with a diagonal A and a general B.
// Because A is diagonal, only A(i,i) is read: C(i,j) += A(i,i) * B(i,j) * scalar.
9941 template<
typename MT3
9945 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9946 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// The iteration space is traversed in square tiles of BLOCK_SIZE x BLOCK_SIZE, columns outermost.
9948 constexpr
size_t block( BLOCK_SIZE );
9950 const size_t M( A.rows() );
9951 const size_t N( B.columns() );
9953 for(
size_t jj=0UL; jj<N; jj+=block ) {
9954 const size_t jend(
min( N, jj+block ) );
9955 for(
size_t ii=0UL; ii<M; ii+=block ) {
9956 const size_t iend(
min( M, ii+block ) );
9957 for(
size_t j=jj; j<jend; ++j )
// For a lower B the row range of column j starts at the diagonal (one past it if strictly
// lower), but never before the tile start ii. The non-lower alternative is elided here.
9959 const size_t ibegin( ( IsLower_v<MT5> )
9960 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
// For an upper B the row range of column j ends at the diagonal (exclusive if strictly upper),
// but never beyond the tile end iend. The non-upper alternative is elided here.
9962 const size_t ipos( ( IsUpper_v<MT5> )
9963 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
9966 for(
size_t i=ibegin; i<ipos; ++i ) {
9967 C(i,j) += A(i,i) * B(i,j) * scalar;
// Default add-assign kernel for two diagonal operands.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
9989 template<
typename MT3
9993 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9994 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9996 for(
size_t i=0UL; i<A.rows(); ++i ) {
9997 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix add-assign kernel for the scaled product, fallback overload.
// Selected through DisableIf_t when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false;
// the work is delegated to selectDefaultAddAssignKernel() with the same arguments.
10016 template<
typename MT3
10020 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10021 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10023 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix add-assign kernel for the scaled product, row-major target.
// Enabled only when IsRowMajorMatrix_v<MT3> and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>
// both hold. Each tile accumulates products of broadcast elements of A with SIMD loads from B
// and then adds 'accumulator * factor' to the corresponding part of C.
// NOTE: this excerpt omits many lines (the loops over j, most accumulate statements, several
// stores and all closing braces); the comments below describe only what is visible.
10042 template<
typename MT3
10046 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10047 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder pass is required unless both C (MT3) and B (MT5) are padded.
10049 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
10051 const size_t M( A.rows() );
10052 const size_t N( B.columns() );
10053 const size_t K( A.columns() );
// With a remainder pass, jpos is N rounded down to a multiple of SIMDSIZE; otherwise it is N.
10057 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// The scaling factor broadcast into a SIMD vector.
10060 const SIMDType factor(
set( scalar ) );
// Integral element types only: tile of one row by eight SIMD vectors
// (loads at j, j+SIMDSIZE, ..., j+SIMDSIZE*7UL).
10064 if( IsIntegral_v<ElementType> )
10067 for(
size_t i=0UL; i<M; ++i )
// The k-range is narrowed according to the triangular structure of A (MT4) and B (MT5).
10069 const size_t kbegin( ( IsUpper_v<MT4> )
10070 ?( ( IsLower_v<MT5> )
10071 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10072 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10073 :( IsLower_v<MT5> ? j : 0UL ) );
10074 const size_t kend( ( IsLower_v<MT4> )
10075 ?( ( IsUpper_v<MT5> )
10076 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
10077 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
10078 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
10080 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10082 for(
size_t k=kbegin; k<kend; ++k ) {
10083 const SIMDType a1(
set( A(i,k) ) );
10084 xmm1 += a1 * B.load(k,j );
10085 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
10086 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
10087 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
10088 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
10089 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
10090 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
10091 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Add-assign: load C, add the scaled accumulator, store back (remaining stores are elided).
10094 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Tile of two rows by five SIMD vectors (ten accumulators).
10110 for( ; (i+2UL) <= M; i+=2UL )
10112 const size_t kbegin( ( IsUpper_v<MT4> )
10113 ?( ( IsLower_v<MT5> )
10114 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10115 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10116 :( IsLower_v<MT5> ? j : 0UL ) );
10117 const size_t kend( ( IsLower_v<MT4> )
10118 ?( ( IsUpper_v<MT5> )
10119 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
10120 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10121 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
10123 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10125 for(
size_t k=kbegin; k<kend; ++k ) {
10126 const SIMDType a1(
set( A(i ,k) ) );
10127 const SIMDType a2(
set( A(i+1UL,k) ) );
10128 const SIMDType b1( B.load(k,j ) );
10129 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
10130 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
10131 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
10132 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
10145 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10150 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
10152 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm8 * factor );
10153 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm9 * factor );
10154 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) + xmm10 * factor );
// Single leftover row of the five-vector tile.
10159 const size_t kbegin( ( IsUpper_v<MT4> )
10160 ?( ( IsLower_v<MT5> )
10161 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10162 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10163 :( IsLower_v<MT5> ? j : 0UL ) );
10164 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
10166 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10168 for(
size_t k=kbegin; k<kend; ++k ) {
10169 const SIMDType a1(
set( A(i,k) ) );
10170 xmm1 += a1 * B.load(k,j );
10171 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
10172 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
10173 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
10174 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
10177 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Tile of two rows by four SIMD vectors (eight accumulators).
10189 for( ; (i+2UL) <= M; i+=2UL )
10191 const size_t kbegin( ( IsUpper_v<MT4> )
10192 ?( ( IsLower_v<MT5> )
10193 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10194 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10195 :( IsLower_v<MT5> ? j : 0UL ) );
10196 const size_t kend( ( IsLower_v<MT4> )
10197 ?( ( IsUpper_v<MT5> )
10198 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
10199 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10200 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
10202 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10204 for(
size_t k=kbegin; k<kend; ++k ) {
10205 const SIMDType a1(
set( A(i ,k) ) );
10206 const SIMDType a2(
set( A(i+1UL,k) ) );
10207 const SIMDType b1( B.load(k,j ) );
10208 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
10209 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
10210 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
10221 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10225 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
10227 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm7 * factor );
10228 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm8 * factor );
// Single leftover row of the four-vector tile.
10233 const size_t kbegin( ( IsUpper_v<MT4> )
10234 ?( ( IsLower_v<MT5> )
10235 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10236 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10237 :( IsLower_v<MT5> ? j : 0UL ) );
10238 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
10240 SIMDType xmm1, xmm2, xmm3, xmm4;
10242 for(
size_t k=kbegin; k<kend; ++k ) {
10243 const SIMDType a1(
set( A(i,k) ) );
10244 xmm1 += a1 * B.load(k,j );
10245 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
10246 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
10247 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
10250 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Tile of two rows by three SIMD vectors (six accumulators).
10261 for( ; (i+2UL) <= M; i+=2UL )
10263 const size_t kbegin( ( IsUpper_v<MT4> )
10264 ?( ( IsLower_v<MT5> )
10265 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10266 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10267 :( IsLower_v<MT5> ? j : 0UL ) );
10268 const size_t kend( ( IsLower_v<MT4> )
10269 ?( ( IsUpper_v<MT5> )
10270 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
10271 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10272 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
10274 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10276 for(
size_t k=kbegin; k<kend; ++k ) {
10277 const SIMDType a1(
set( A(i ,k) ) );
10278 const SIMDType a2(
set( A(i+1UL,k) ) );
10279 const SIMDType b1( B.load(k,j ) );
10280 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
10281 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
10290 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10293 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
10295 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm6 * factor );
// Single leftover row of the three-vector tile.
10300 const size_t kbegin( ( IsUpper_v<MT4> )
10301 ?( ( IsLower_v<MT5> )
10302 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10303 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10304 :( IsLower_v<MT5> ? j : 0UL ) );
10305 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
10307 SIMDType xmm1, xmm2, xmm3;
10309 for(
size_t k=kbegin; k<kend; ++k ) {
10310 const SIMDType a1(
set( A(i,k) ) );
10311 xmm1 += a1 * B.load(k,j );
10312 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
10313 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
10316 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Two-vector-wide tiles. The row range honours the declared structure of the result:
// with UPP the rows stop at min(j+SIMDSIZE*2UL,M), with LOW they start at row j.
10324 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
10325 size_t i( LOW ? j : 0UL );
// Four rows by two SIMD vectors.
10327 for( ; (i+4UL) <= iend; i+=4UL )
10329 const size_t kbegin( ( IsUpper_v<MT4> )
10330 ?( ( IsLower_v<MT5> )
10331 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10332 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10333 :( IsLower_v<MT5> ? j : 0UL ) );
10334 const size_t kend( ( IsLower_v<MT4> )
10335 ?( ( IsUpper_v<MT5> )
10336 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
10337 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
10338 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
10340 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10342 for(
size_t k=kbegin; k<kend; ++k ) {
10343 const SIMDType a1(
set( A(i ,k) ) );
10344 const SIMDType a2(
set( A(i+1UL,k) ) );
10345 const SIMDType a3(
set( A(i+2UL,k) ) );
10346 const SIMDType a4(
set( A(i+3UL,k) ) );
10347 const SIMDType b1( B.load(k,j ) );
10348 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
10359 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10361 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
10363 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
10365 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
// Three rows by two SIMD vectors.
10369 for( ; (i+3UL) <= iend; i+=3UL )
10371 const size_t kbegin( ( IsUpper_v<MT4> )
10372 ?( ( IsLower_v<MT5> )
10373 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10374 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10375 :( IsLower_v<MT5> ? j : 0UL ) );
10376 const size_t kend( ( IsLower_v<MT4> )
10377 ?( ( IsUpper_v<MT5> )
10378 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
10379 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
10380 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
10382 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10384 for(
size_t k=kbegin; k<kend; ++k ) {
10385 const SIMDType a1(
set( A(i ,k) ) );
10386 const SIMDType a2(
set( A(i+1UL,k) ) );
10387 const SIMDType a3(
set( A(i+2UL,k) ) );
10388 const SIMDType b1( B.load(k,j ) );
10389 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
10398 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10400 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
10402 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
// Two rows by two SIMD vectors; k is additionally unrolled by two, so each output has a pair
// of accumulators that is summed when storing.
10406 for( ; (i+2UL) <= iend; i+=2UL )
10408 const size_t kbegin( ( IsUpper_v<MT4> )
10409 ?( ( IsLower_v<MT5> )
10410 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10411 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10412 :( IsLower_v<MT5> ? j : 0UL ) );
10413 const size_t kend( ( IsLower_v<MT4> )
10414 ?( ( IsUpper_v<MT5> )
10415 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
10416 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10417 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
10419 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10420 size_t k( kbegin );
// Unrolled part: two k-steps per iteration.
10422 for( ; (k+2UL) <= kend; k+=2UL ) {
10423 const SIMDType a1(
set( A(i ,k ) ) );
10424 const SIMDType a2(
set( A(i+1UL,k ) ) );
10425 const SIMDType a3(
set( A(i ,k+1UL) ) );
10426 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
10427 const SIMDType b1( B.load(k ,j ) );
10428 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
10429 const SIMDType b3( B.load(k+1UL,j ) );
10430 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k-step when the k-range has odd length.
10441 for( ; k<kend; ++k ) {
10442 const SIMDType a1(
set( A(i ,k) ) );
10443 const SIMDType a2(
set( A(i+1UL,k) ) );
10444 const SIMDType b1( B.load(k,j ) );
10445 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
10452 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
10454 C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
10455 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) + (xmm4+xmm8) * factor );
// Single leftover row of the two-vector tile, again with k unrolled by two.
10460 const size_t kbegin( ( IsUpper_v<MT4> )
10461 ?( ( IsLower_v<MT5> )
10462 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10463 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10464 :( IsLower_v<MT5> ? j : 0UL ) );
10465 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
10467 SIMDType xmm1, xmm2, xmm3, xmm4;
10468 size_t k( kbegin );
10470 for( ; (k+2UL) <= kend; k+=2UL ) {
10471 const SIMDType a1(
set( A(i,k ) ) );
10472 const SIMDType a2(
set( A(i,k+1UL) ) );
10473 xmm1 += a1 * B.load(k ,j );
10474 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
10475 xmm3 += a2 * B.load(k+1UL,j );
10476 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
10479 for( ; k<kend; ++k ) {
10480 const SIMDType a1(
set( A(i,k) ) );
10481 xmm1 += a1 * B.load(k,j );
10482 xmm2 += a1 * B.load(k,j+
SIMDSIZE);
10485 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
// One-vector-wide tiles. Rows start at j when LOW; the upper clamp min(j+SIMDSIZE,M) is
// applied only when both LOW and UPP hold.
// NOTE(review): the two-vector tile above clamps on UPP alone -- confirm that requiring
// LOW && UPP here is intended.
10492 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
10493 size_t i( LOW ? j : 0UL );
// Four rows by one SIMD vector, k unrolled by two.
10495 for( ; (i+4UL) <= iend; i+=4UL )
10497 const size_t kbegin( ( IsUpper_v<MT4> )
10498 ?( ( IsLower_v<MT5> )
10499 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10500 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10501 :( IsLower_v<MT5> ? j : 0UL ) );
10502 const size_t kend( ( IsLower_v<MT4> )
10503 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
10506 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10507 size_t k( kbegin );
10509 for( ; (k+2UL) <= kend; k+=2UL ) {
10510 const SIMDType b1( B.load(k ,j) );
10511 const SIMDType b2( B.load(k+1UL,j) );
10512 xmm1 +=
set( A(i ,k ) ) * b1;
10513 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10514 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10515 xmm4 +=
set( A(i+3UL,k ) ) * b1;
10516 xmm5 +=
set( A(i ,k+1UL) ) * b2;
10517 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
10518 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
10519 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
10522 for( ; k<kend; ++k ) {
10523 const SIMDType b1( B.load(k,j) );
10524 xmm1 +=
set( A(i ,k) ) * b1;
10525 xmm2 +=
set( A(i+1UL,k) ) * b1;
10526 xmm3 +=
set( A(i+2UL,k) ) * b1;
10527 xmm4 +=
set( A(i+3UL,k) ) * b1;
// The even-k and odd-k accumulators of each row are combined before scaling.
10530 C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
10531 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
10532 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
10533 C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
// Three rows by one SIMD vector, k unrolled by two.
10536 for( ; (i+3UL) <= iend; i+=3UL )
10538 const size_t kbegin( ( IsUpper_v<MT4> )
10539 ?( ( IsLower_v<MT5> )
10540 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10541 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10542 :( IsLower_v<MT5> ? j : 0UL ) );
10543 const size_t kend( ( IsLower_v<MT4> )
10544 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
10547 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10548 size_t k( kbegin );
10550 for( ; (k+2UL) <= kend; k+=2UL ) {
10551 const SIMDType b1( B.load(k ,j) );
10552 const SIMDType b2( B.load(k+1UL,j) );
10553 xmm1 +=
set( A(i ,k ) ) * b1;
10554 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10555 xmm3 +=
set( A(i+2UL,k ) ) * b1;
10556 xmm4 +=
set( A(i ,k+1UL) ) * b2;
10557 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
10558 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
10561 for( ; k<kend; ++k ) {
10562 const SIMDType b1( B.load(k,j) );
10563 xmm1 +=
set( A(i ,k) ) * b1;
10564 xmm2 +=
set( A(i+1UL,k) ) * b1;
10565 xmm3 +=
set( A(i+2UL,k) ) * b1;
10568 C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
10569 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
10570 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
// Two rows by one SIMD vector, k unrolled by two.
10573 for( ; (i+2UL) <= iend; i+=2UL )
10575 const size_t kbegin( ( IsUpper_v<MT4> )
10576 ?( ( IsLower_v<MT5> )
10577 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10578 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10579 :( IsLower_v<MT5> ? j : 0UL ) );
10580 const size_t kend( ( IsLower_v<MT4> )
10581 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10584 SIMDType xmm1, xmm2, xmm3, xmm4;
10585 size_t k( kbegin );
10587 for( ; (k+2UL) <= kend; k+=2UL ) {
10588 const SIMDType b1( B.load(k ,j) );
10589 const SIMDType b2( B.load(k+1UL,j) );
10590 xmm1 +=
set( A(i ,k ) ) * b1;
10591 xmm2 +=
set( A(i+1UL,k ) ) * b1;
10592 xmm3 +=
set( A(i ,k+1UL) ) * b2;
10593 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
10596 for( ; k<kend; ++k ) {
10597 const SIMDType b1( B.load(k,j) );
10598 xmm1 +=
set( A(i ,k) ) * b1;
10599 xmm2 +=
set( A(i+1UL,k) ) * b1;
10602 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
10603 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
// Single leftover row by one SIMD vector; the k loops here run up to K directly.
10608 const size_t kbegin( ( IsUpper_v<MT4> )
10609 ?( ( IsLower_v<MT5> )
10610 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10611 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10612 :( IsLower_v<MT5> ? j : 0UL ) );
10614 SIMDType xmm1, xmm2;
10615 size_t k( kbegin );
10617 for( ; (k+2UL) <= K; k+=2UL ) {
10618 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
10619 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
10622 for( ; k<K; ++k ) {
10623 xmm1 +=
set( A(i,k) ) * B.load(k,j);
10626 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// Scalar remainder: the columns not covered by the SIMD tiles are handled element by element,
// two rows at a time and then a final single row.
10630 for( ; remainder && j<N; ++j )
10632 const size_t iend( UPP ? j+1UL : M );
10633 size_t i( LOW ? j : 0UL );
10635 for( ; (i+2UL) <= iend; i+=2UL )
10637 const size_t kbegin( ( IsUpper_v<MT4> )
10638 ?( ( IsLower_v<MT5> )
10639 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10640 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10641 :( IsLower_v<MT5> ? j : 0UL ) );
10642 const size_t kend( ( IsLower_v<MT4> )
10643 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
10649 for(
size_t k=kbegin; k<kend; ++k ) {
10650 value1 += A(i ,k) * B(k,j);
10651 value2 += A(i+1UL,k) * B(k,j);
10654 C(i ,j) += value1 * scalar;
10655 C(i+1UL,j) += value2 * scalar;
10660 const size_t kbegin( ( IsUpper_v<MT4> )
10661 ?( ( IsLower_v<MT5> )
10662 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10663 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10664 :( IsLower_v<MT5> ? j : 0UL ) );
10668 for(
size_t k=kbegin; k<K; ++k ) {
10669 value += A(i,k) * B(k,j);
10672 C(i,j) += value * scalar;
// Vectorized small-matrix kernel for the scaled addition assignment: every visible
// store has the form C = C + (accumulated A*B) * factor, with factor built from 'scalar'.
// This overload is enabled only for column-major targets for which
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
//
//   C       target matrix, read and written via C.load()/C.store() and C(i,j)
//   A       left-hand side operand, read with SIMD loads A.load(i,k) along the row index i
//   B       right-hand side operand, read element-wise as B(k,j)
//   scalar  scaling factor applied to the accumulated product
//
// NOTE(review): this excerpt omits lines of the original function (braces, the outer loops
// over i, most accumulator updates and several stores). The comments below describe only
// what the visible lines establish.
10693 template<
typename MT3
10697 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10698 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar clean-up pass over the trailing rows is needed unless both C and A are padded.
10700 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
10702 const size_t M( A.rows() );
10703 const size_t N( B.columns() );
10704 const size_t K( A.columns() );
// With a remainder, ipos is M with its low bits masked off (M rounded down to a multiple
// of SIMDSIZE, assuming SIMDSIZE is a power of two - TODO confirm); otherwise ipos is M.
10708 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// SIMD operand built from the scalar (presumably a broadcast of 'scalar' to all lanes).
10711 const SIMDType factor(
set( scalar ) );
// Tile of 8 SIMD vectors along i for a single column j. The visible code enters this
// part under an IsIntegral_v<ElementType> check.
// NOTE(review): the extent of that branch is not visible in this excerpt.
10715 if( IsIntegral_v<ElementType> )
10718 for(
size_t j=0UL; j<N; ++j )
// The k range is narrowed according to the lower/upper (and strictly lower/upper)
// structure of A (MT4) and B (MT5); without such structure it is [0,K).
10720 const size_t kbegin( ( IsLower_v<MT5> )
10721 ?( ( IsUpper_v<MT4> )
10722 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10723 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10724 :( IsUpper_v<MT4> ? i : 0UL ) );
10725 const size_t kend( ( IsUpper_v<MT5> )
10726 ?( ( IsLower_v<MT4> )
10727 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
10728 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
10729 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
10731 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// One element of B is turned into a SIMD operand and multiplied with 8 consecutive
// SIMD loads from column k of A.
10733 for(
size_t k=kbegin; k<kend; ++k ) {
10734 const SIMDType b1(
set( B(k,j) ) );
10735 xmm1 += A.load(i ,k) * b1;
10736 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10737 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10738 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
10739 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
10740 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
10741 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
10742 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
10745 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 5 SIMD vectors along i by 2 columns (j, j+1).
10761 for( ; (j+2UL) <= N; j+=2UL )
10763 const size_t kbegin( ( IsLower_v<MT5> )
10764 ?( ( IsUpper_v<MT4> )
10765 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10766 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10767 :( IsUpper_v<MT4> ? i : 0UL ) );
10768 const size_t kend( ( IsUpper_v<MT5> )
10769 ?( ( IsLower_v<MT4> )
10770 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10771 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10772 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
10774 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
10776 for(
size_t k=kbegin; k<kend; ++k ) {
10777 const SIMDType a1( A.load(i ,k) );
10778 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
10779 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
10780 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
10781 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
10782 const SIMDType b1(
set( B(k,j ) ) );
10783 const SIMDType b2(
set( B(k,j+1UL) ) );
// NOTE(review): the accumulator updates and some of the stores of this tile are not part
// of this excerpt.
10796 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10801 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
10803 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
10804 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
10805 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Tile of 5 SIMD vectors along i for a single column j.
10810 const size_t kbegin( ( IsLower_v<MT5> )
10811 ?( ( IsUpper_v<MT4> )
10812 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10813 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10814 :( IsUpper_v<MT4> ? i : 0UL ) );
10815 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
10817 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
10819 for(
size_t k=kbegin; k<kend; ++k ) {
10820 const SIMDType b1(
set( B(k,j) ) );
10821 xmm1 += A.load(i ,k) * b1;
10822 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10823 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10824 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
10825 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
10828 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 4 SIMD vectors along i by 2 columns (j, j+1).
10840 for( ; (j+2UL) <= N; j+=2UL )
10842 const size_t kbegin( ( IsLower_v<MT5> )
10843 ?( ( IsUpper_v<MT4> )
10844 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10845 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10846 :( IsUpper_v<MT4> ? i : 0UL ) );
10847 const size_t kend( ( IsUpper_v<MT5> )
10848 ?( ( IsLower_v<MT4> )
10849 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10850 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10851 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
10853 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10855 for(
size_t k=kbegin; k<kend; ++k ) {
10856 const SIMDType a1( A.load(i ,k) );
10857 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
10858 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
10859 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
10860 const SIMDType b1(
set( B(k,j ) ) );
10861 const SIMDType b2(
set( B(k,j+1UL) ) );
10872 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10876 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
10878 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
10879 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Tile of 4 SIMD vectors along i for a single column j.
10884 const size_t kbegin( ( IsLower_v<MT5> )
10885 ?( ( IsUpper_v<MT4> )
10886 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10887 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10888 :( IsUpper_v<MT4> ? i : 0UL ) );
10889 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
10891 SIMDType xmm1, xmm2, xmm3, xmm4;
10893 for(
size_t k=kbegin; k<kend; ++k ) {
10894 const SIMDType b1(
set( B(k,j) ) );
10895 xmm1 += A.load(i ,k) * b1;
10896 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10897 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10898 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
10901 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 3 SIMD vectors along i by 2 columns (j, j+1).
10912 for( ; (j+2UL) <= N; j+=2UL )
10914 const size_t kbegin( ( IsLower_v<MT5> )
10915 ?( ( IsUpper_v<MT4> )
10916 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10917 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10918 :( IsUpper_v<MT4> ? i : 0UL ) );
10919 const size_t kend( ( IsUpper_v<MT5> )
10920 ?( ( IsLower_v<MT4> )
10921 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
10922 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
10923 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
10925 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
10927 for(
size_t k=kbegin; k<kend; ++k ) {
10928 const SIMDType a1( A.load(i ,k) );
10929 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
10930 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
10931 const SIMDType b1(
set( B(k,j ) ) );
10932 const SIMDType b2(
set( B(k,j+1UL) ) );
10941 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10944 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
10946 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Tile of 3 SIMD vectors along i for a single column j.
10951 const size_t kbegin( ( IsLower_v<MT5> )
10952 ?( ( IsUpper_v<MT4> )
10953 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10954 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10955 :( IsUpper_v<MT4> ? i : 0UL ) );
10956 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
10958 SIMDType xmm1, xmm2, xmm3;
10960 for(
size_t k=kbegin; k<kend; ++k ) {
10961 const SIMDType b1(
set( B(k,j) ) );
10962 xmm1 += A.load(i ,k) * b1;
10963 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
10964 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
10967 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tiles of 2 SIMD vectors along i. From here on the column range depends on LOW/UPP:
// it ends at min(i+SIMDSIZE*2,N) when LOW is set and starts at i when UPP is set.
10975 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
10976 size_t j( UPP ? i : 0UL );
// 2 SIMD vectors by 4 columns.
10978 for( ; (j+4UL) <= jend; j+=4UL )
10980 const size_t kbegin( ( IsLower_v<MT5> )
10981 ?( ( IsUpper_v<MT4> )
10982 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10983 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10984 :( IsUpper_v<MT4> ? i : 0UL ) );
10985 const size_t kend( ( IsUpper_v<MT5> )
10986 ?( ( IsLower_v<MT4> )
10987 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
10988 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
10989 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
10991 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
10993 for(
size_t k=kbegin; k<kend; ++k ) {
10994 const SIMDType a1( A.load(i ,k) );
10995 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
10996 const SIMDType b1(
set( B(k,j ) ) );
10997 const SIMDType b2(
set( B(k,j+1UL) ) );
10998 const SIMDType b3(
set( B(k,j+2UL) ) );
10999 const SIMDType b4(
set( B(k,j+3UL) ) );
11010 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11012 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
11014 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
11016 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
// 2 SIMD vectors by 3 columns.
11020 for( ; (j+3UL) <= jend; j+=3UL )
11022 const size_t kbegin( ( IsLower_v<MT5> )
11023 ?( ( IsUpper_v<MT4> )
11024 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11025 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11026 :( IsUpper_v<MT4> ? i : 0UL ) );
11027 const size_t kend( ( IsUpper_v<MT5> )
11028 ?( ( IsLower_v<MT4> )
11029 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
11030 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
11031 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
11033 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11035 for(
size_t k=kbegin; k<kend; ++k ) {
11036 const SIMDType a1( A.load(i ,k) );
11037 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
11038 const SIMDType b1(
set( B(k,j ) ) );
11039 const SIMDType b2(
set( B(k,j+1UL) ) );
11040 const SIMDType b3(
set( B(k,j+2UL) ) );
11049 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11051 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
11053 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
// 2 SIMD vectors by 2 columns; the k loop is unrolled by two with a single-step clean-up
// loop, and the two partial accumulators per target are summed in the stores below.
11057 for( ; (j+2UL) <= jend; j+=2UL )
11059 const size_t kbegin( ( IsLower_v<MT5> )
11060 ?( ( IsUpper_v<MT4> )
11061 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11062 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11063 :( IsUpper_v<MT4> ? i : 0UL ) );
11064 const size_t kend( ( IsUpper_v<MT5> )
11065 ?( ( IsLower_v<MT4> )
11066 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11067 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11068 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
11070 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11071 size_t k( kbegin );
11073 for( ; (k+2UL) <= kend; k+=2UL ) {
11074 const SIMDType a1( A.load(i ,k ) );
11075 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
11076 const SIMDType a3( A.load(i ,k+1UL) );
11077 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
11078 const SIMDType b1(
set( B(k ,j ) ) );
11079 const SIMDType b2(
set( B(k ,j+1UL) ) );
11080 const SIMDType b3(
set( B(k+1UL,j ) ) );
11081 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
11092 for( ; k<kend; ++k ) {
11093 const SIMDType a1( A.load(i ,k) );
11094 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
11095 const SIMDType b1(
set( B(k,j ) ) );
11096 const SIMDType b2(
set( B(k,j+1UL) ) );
11103 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
11105 C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
11106 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// 2 SIMD vectors for a single column j, k unrolled by two (xmm1/xmm2 take the even
// steps and the clean-up, xmm3/xmm4 the odd steps).
11111 const size_t kbegin( ( IsLower_v<MT5> )
11112 ?( ( IsUpper_v<MT4> )
11113 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11114 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11115 :( IsUpper_v<MT4> ? i : 0UL ) );
11116 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
11118 SIMDType xmm1, xmm2, xmm3, xmm4;
11119 size_t k( kbegin );
11121 for( ; (k+2UL) <= kend; k+=2UL ) {
11122 const SIMDType b1(
set( B(k ,j) ) );
11123 const SIMDType b2(
set( B(k+1UL,j) ) );
11124 xmm1 += A.load(i ,k ) * b1;
11125 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
11126 xmm3 += A.load(i ,k+1UL) * b2;
11127 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
11130 for( ; k<kend; ++k ) {
11131 const SIMDType b1(
set( B(k,j) ) );
11132 xmm1 += A.load(i ,k) * b1;
11133 xmm2 += A.load(i+
SIMDSIZE,k) * b1;
11136 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
// Tiles of a single SIMD vector along i. The column range is cut to min(i+SIMDSIZE,N)
// only when both LOW and UPP are set, and starts at i when UPP is set.
11143 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
11144 size_t j( UPP ? i : 0UL );
// 1 SIMD vector by 4 columns, k unrolled by two.
11146 for( ; (j+4UL) <= jend; j+=4UL )
11148 const size_t kbegin( ( IsLower_v<MT5> )
11149 ?( ( IsUpper_v<MT4> )
11150 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11151 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11152 :( IsUpper_v<MT4> ? i : 0UL ) );
11153 const size_t kend( ( IsUpper_v<MT5> )
11154 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
11157 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11158 size_t k( kbegin );
11160 for( ; (k+2UL) <= kend; k+=2UL ) {
11161 const SIMDType a1( A.load(i,k ) );
11162 const SIMDType a2( A.load(i,k+1UL) );
11163 xmm1 += a1 *
set( B(k ,j ) );
11164 xmm2 += a1 *
set( B(k ,j+1UL) );
11165 xmm3 += a1 *
set( B(k ,j+2UL) );
11166 xmm4 += a1 *
set( B(k ,j+3UL) );
11167 xmm5 += a2 *
set( B(k+1UL,j ) );
11168 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
11169 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
11170 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
11173 for( ; k<kend; ++k ) {
11174 const SIMDType a1( A.load(i,k) );
11175 xmm1 += a1 *
set( B(k,j ) );
11176 xmm2 += a1 *
set( B(k,j+1UL) );
11177 xmm3 += a1 *
set( B(k,j+2UL) );
11178 xmm4 += a1 *
set( B(k,j+3UL) );
// Combine the even-step and odd-step accumulators of each column before scaling.
11181 C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
11182 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
11183 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
11184 C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
// 1 SIMD vector by 3 columns, k unrolled by two.
11187 for( ; (j+3UL) <= jend; j+=3UL )
11189 const size_t kbegin( ( IsLower_v<MT5> )
11190 ?( ( IsUpper_v<MT4> )
11191 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11192 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11193 :( IsUpper_v<MT4> ? i : 0UL ) );
11194 const size_t kend( ( IsUpper_v<MT5> )
11195 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
11198 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
11199 size_t k( kbegin );
11201 for( ; (k+2UL) <= kend; k+=2UL ) {
11202 const SIMDType a1( A.load(i,k ) );
11203 const SIMDType a2( A.load(i,k+1UL) );
11204 xmm1 += a1 *
set( B(k ,j ) );
11205 xmm2 += a1 *
set( B(k ,j+1UL) );
11206 xmm3 += a1 *
set( B(k ,j+2UL) );
11207 xmm4 += a2 *
set( B(k+1UL,j ) );
11208 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
11209 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
11212 for( ; k<kend; ++k ) {
11213 const SIMDType a1( A.load(i,k) );
11214 xmm1 += a1 *
set( B(k,j ) );
11215 xmm2 += a1 *
set( B(k,j+1UL) );
11216 xmm3 += a1 *
set( B(k,j+2UL) );
11219 C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
11220 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
11221 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
// 1 SIMD vector by 2 columns, k unrolled by two.
11224 for( ; (j+2UL) <= jend; j+=2UL )
11226 const size_t kbegin( ( IsLower_v<MT5> )
11227 ?( ( IsUpper_v<MT4> )
11228 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11229 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11230 :( IsUpper_v<MT4> ? i : 0UL ) );
11231 const size_t kend( ( IsUpper_v<MT5> )
11232 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
11235 SIMDType xmm1, xmm2, xmm3, xmm4;
11236 size_t k( kbegin );
11238 for( ; (k+2UL) <= kend; k+=2UL ) {
11239 const SIMDType a1( A.load(i,k ) );
11240 const SIMDType a2( A.load(i,k+1UL) );
11241 xmm1 += a1 *
set( B(k ,j ) );
11242 xmm2 += a1 *
set( B(k ,j+1UL) );
11243 xmm3 += a2 *
set( B(k+1UL,j ) );
11244 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
11247 for( ; k<kend; ++k ) {
11248 const SIMDType a1( A.load(i,k) );
11249 xmm1 += a1 *
set( B(k,j ) );
11250 xmm2 += a1 *
set( B(k,j+1UL) );
11253 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
11254 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
// 1 SIMD vector for a single column j; this tile iterates k up to K directly.
11259 const size_t kbegin( ( IsLower_v<MT5> )
11260 ?( ( IsUpper_v<MT4> )
11261 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11262 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11263 :( IsUpper_v<MT4> ? i : 0UL ) );
11265 SIMDType xmm1, xmm2;
11266 size_t k( kbegin );
11268 for( ; (k+2UL) <= K; k+=2UL ) {
11269 xmm1 += A.load(i,k ) *
set( B(k ,j) );
11270 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
11273 for( ; k<K; ++k ) {
11274 xmm1 += A.load(i,k) *
set( B(k,j) );
11277 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// Scalar clean-up: when 'remainder' is set, the rows from the current i up to M are
// processed element-wise. Columns end at i+1 when LOW is set and start at i when UPP is set.
11281 for( ; remainder && i<M; ++i )
11283 const size_t jend( LOW ? i+1UL : N );
11284 size_t j( UPP ? i : 0UL );
// Two columns per iteration.
11286 for( ; (j+2UL) <= jend; j+=2UL )
11288 const size_t kbegin( ( IsLower_v<MT5> )
11289 ?( ( IsUpper_v<MT4> )
11290 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11291 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11292 :( IsUpper_v<MT4> ? i : 0UL ) );
11293 const size_t kend( ( IsUpper_v<MT5> )
11294 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
11300 for(
size_t k=kbegin; k<kend; ++k ) {
11301 value1 += A(i,k) * B(k,j );
11302 value2 += A(i,k) * B(k,j+1UL);
11305 C(i,j ) += value1 * scalar;
11306 C(i,j+1UL) += value2 * scalar;
// Single trailing column.
11311 const size_t kbegin( ( IsLower_v<MT5> )
11312 ?( ( IsUpper_v<MT4> )
11313 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11314 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11315 :( IsUpper_v<MT4> ? i : 0UL ) );
11319 for(
size_t k=kbegin; k<K; ++k ) {
11320 value += A(i,k) * B(k,j);
11323 C(i,j) += value * scalar;
// Fallback overload of the large-matrix kernel for the scaled addition assignment.
// It participates only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t) and forwards all four arguments unchanged to selectDefaultAddAssignKernel().
11343 template<
typename MT3
11347 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11348 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11350 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized overload of the large-matrix kernel for the scaled addition assignment,
// enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// It delegates to one of lmmm(), ummm() or mmm(), passing 'scalar' and ST2(1) as the two
// trailing arguments (presumably the factors for A*B and for the existing C - TODO confirm
// against the declarations of these functions).
// NOTE(review): the conditions selecting between the three calls are not part of this
// excerpt; the names suggest lower, upper and general variants - verify.
11369 template<
typename MT3
11373 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11374 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11377 lmmm( C, A, B, scalar, ST2(1) );
11379 ummm( C, A, B, scalar, ST2(1) );
11381 mmm( C, A, B, scalar, ST2(1) );
// Fallback overload of the BLAS kernel for the scaled addition assignment.
// It participates only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and
// forwards all four arguments unchanged to selectLargeAddAssignKernel().
11399 template<
typename MT3
11403 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11404 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11406 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based kernel for the scaled addition assignment. It is compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero and is
// enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
//
// Three paths are visible:
//  - A (MT4) triangular: a temporary of type ResultType_t<MT3> is built from serial(B),
//    passed to trmm() with A on the left (CblasLeft, lower/upper chosen by IsLower_v<MT4>)
//    and ET(scalar), then added to C via addAssign().
//  - B (MT5) triangular: the same with a temporary built from serial(A) and B on the right
//    (CblasRight, lower/upper chosen by IsLower_v<MT5>).
//  - otherwise: gemm( C, A, B, ET(scalar), ET(1) ).
// NOTE(review): the matching #endif and the braces/else of the last branch are not part of
// this excerpt.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 11425 template<
typename MT3
11429 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11430 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target before it is passed on.
11432 using ET = ElementType_t<MT3>;
11434 if( IsTriangular_v<MT4> ) {
11435 ResultType_t<MT3> tmp(
serial( B ) );
11436 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
11437 addAssign( C, tmp );
11439 else if( IsTriangular_v<MT5> ) {
11440 ResultType_t<MT3> tmp(
serial( A ) );
11441 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
11442 addAssign( C, tmp );
11445 gemm( C, A, B,
ET(scalar),
ET(1) );
// Subtraction assignment of a scaled matrix product (DMatScalarMultExpr) to a dense matrix.
//
//   lhs  target dense matrix
//   rhs  scaled multiplication expression; its operands are obtained via
//        rhs.matrix_.leftOperand() / rhs.matrix_.rightOperand(), its factor is rhs.scalar_
//
// The work is handed to DMatScalarMultExpr::selectSubAssignKernel().
// NOTE(review): the definitions of A and B and the body of the empty-size check are not
// part of this excerpt; presumably the check returns early and A/B are the (possibly
// evaluated) operands - confirm against the full source.
11467 template<
typename MT
11469 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11476 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
11477 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate case: the target has no rows or no columns, or the inner dimension is zero.
11479 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
11493 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the scaled subtraction assignment.
// selectSmallSubAssignKernel() is chosen if any of the following holds:
//  - both A (MT4) and B (MT5) are diagonal,
//  - outside of debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//  - outside of debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// selectBlasSubAssignKernel() is the other visible call.
// NOTE(review): the 'else' between the two calls is not part of this excerpt.
11508 template<
typename MT3
11512 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11514 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
11515 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
11516 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
11517 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
11518 selectSmallSubAssignKernel( C, A, B, scalar );
11520 selectBlasSubAssignKernel( C, A, B, scalar );
// Default kernel for the scaled subtraction assignment when neither A (MT4) nor B (MT5)
// is diagonal. The visible statement subtracts a temporary 'tmp' from C via subAssign().
// NOTE(review): the definition of 'tmp' is not part of this excerpt; presumably it holds
// the evaluated scaled product of A and B - confirm against the full source.
11538 template<
typename MT3
11542 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11543 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11546 subAssign( C, tmp );
// Default kernel for the scaled subtraction assignment with a row-major target, a
// non-diagonal A (MT4) and a diagonal B (MT5). Since only B(j,j) is read, each element
// update is C(i,j) -= A(i,j) * B(j,j) * scalar.
// The index space is traversed in square tiles of BLOCK_SIZE x BLOCK_SIZE (rows ii..iend,
// columns jj..jend).
// NOTE(review): the fallback values of the jbegin/jpos conditionals (for non-triangular A)
// are not part of this excerpt.
11564 template<
typename MT3
11568 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11569 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11571 constexpr
size_t block( BLOCK_SIZE );
11573 const size_t M( A.rows() );
11574 const size_t N( B.columns() );
11576 for(
size_t ii=0UL; ii<M; ii+=block ) {
11577 const size_t iend(
min( M, ii+block ) );
11578 for(
size_t jj=0UL; jj<N; jj+=block ) {
11579 const size_t jend(
min( N, jj+block ) );
11580 for(
size_t i=ii; i<iend; ++i )
// For an upper A the columns of row i start at the diagonal (one past it if strictly
// upper), clamped to the start of the current tile.
11582 const size_t jbegin( ( IsUpper_v<MT4> )
11583 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
// For a lower A the columns of row i end after the diagonal (before it if strictly
// lower), clamped to the end of the current tile.
11585 const size_t jpos( ( IsLower_v<MT4> )
11586 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
11589 for(
size_t j=jbegin; j<jpos; ++j ) {
11590 C(i,j) -= A(i,j) * B(j,j) * scalar;
// Default kernel for the scaled subtraction assignment with a column-major target, a
// non-diagonal A (MT4) and a diagonal B (MT5). Each column j is updated as
// C(i,j) -= A(i,j) * B(j,j) * scalar, two rows per iteration plus one optional trailing row.
// NOTE(review): the fallback values of the ibegin/iend conditionals (for non-triangular A)
// are not part of this excerpt.
11612 template<
typename MT3
11616 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11617 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11619 const size_t M( A.rows() );
11620 const size_t N( B.columns() );
11622 for(
size_t j=0UL; j<N; ++j )
// Row range of column j restricted by the lower/upper structure of A.
11624 const size_t ibegin( ( IsLower_v<MT4> )
11625 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
11627 const size_t iend( ( IsUpper_v<MT4> )
11628 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part that can be processed in pairs: the row count with its
// lowest bit cleared (rounded down to an even number), offset by ibegin.
11632 const size_t inum( iend - ibegin );
11633 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
11635 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
11636 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
11637 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Odd row count: handle the single remaining row.
11639 if( ipos < iend ) {
11640 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default kernel for the scaled subtraction assignment with a row-major target, a
// diagonal A (MT4) and a non-diagonal B (MT5). Each row i is updated as
// C(i,j) -= A(i,i) * B(i,j) * scalar, two columns per iteration plus one optional
// trailing column.
// NOTE(review): the fallback values of the jbegin/jend conditionals (for non-triangular B)
// are not part of this excerpt.
11660 template<
typename MT3
11664 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11665 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11667 const size_t M( A.rows() );
11668 const size_t N( B.columns() );
11670 for(
size_t i=0UL; i<M; ++i )
// Column range of row i restricted by the upper/lower structure of B.
11672 const size_t jbegin( ( IsUpper_v<MT5> )
11673 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
11675 const size_t jend( ( IsLower_v<MT5> )
11676 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part that can be processed in pairs: the column count with
// its lowest bit cleared (rounded down to an even number), offset by jbegin.
11680 const size_t jnum( jend - jbegin );
11681 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
11683 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
11684 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
11685 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Odd column count: handle the single remaining column.
11687 if( jpos < jend ) {
11688 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default kernel for the scaled subtraction assignment with a column-major target, a
// diagonal A (MT4) and a non-diagonal B (MT5). Since only A(i,i) is read, each element
// update is C(i,j) -= A(i,i) * B(i,j) * scalar.
// The index space is traversed in square tiles of BLOCK_SIZE x BLOCK_SIZE (columns
// jj..jend, rows ii..iend).
// NOTE(review): the fallback values of the ibegin/ipos conditionals (for non-triangular B)
// are not part of this excerpt.
11708 template<
typename MT3
11712 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11713 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11715 constexpr
size_t block( BLOCK_SIZE );
11717 const size_t M( A.rows() );
11718 const size_t N( B.columns() );
11720 for(
size_t jj=0UL; jj<N; jj+=block ) {
11721 const size_t jend(
min( N, jj+block ) );
11722 for(
size_t ii=0UL; ii<M; ii+=block ) {
11723 const size_t iend(
min( M, ii+block ) );
11724 for(
size_t j=jj; j<jend; ++j )
// For a lower B the rows of column j start at the diagonal (one past it if strictly
// lower), clamped to the start of the current tile.
11726 const size_t ibegin( ( IsLower_v<MT5> )
11727 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
// For an upper B the rows of column j end after the diagonal (before it if strictly
// upper), clamped to the end of the current tile.
11729 const size_t ipos( ( IsUpper_v<MT5> )
11730 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
11733 for(
size_t i=ibegin; i<ipos; ++i ) {
11734 C(i,j) -= A(i,i) * B(i,j) * scalar;
// Default kernel for the scaled subtraction assignment when both A (MT4) and B (MT5) are
// diagonal: only the diagonal of C is touched, C(i,i) -= A(i,i) * B(i,i) * scalar for
// every row index i of A.
11756 template<
typename MT3
11760 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11761 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11763 for(
size_t i=0UL; i<A.rows(); ++i ) {
11764 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Fallback overload of the small-matrix kernel for the scaled subtraction assignment.
// It participates only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t) and forwards all four arguments unchanged to selectDefaultSubAssignKernel().
11783 template<
typename MT3
11787 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11788 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11790 selectDefaultSubAssignKernel( C, A, B, scalar );
11809 template<
typename MT3
11813 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11814 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11816 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
11818 const size_t M( A.rows() );
11819 const size_t N( B.columns() );
11820 const size_t K( A.columns() );
11824 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
11827 const SIMDType factor(
set( scalar ) );
11831 if( IsIntegral_v<ElementType> )
11834 for(
size_t i=0UL; i<M; ++i )
11836 const size_t kbegin( ( IsUpper_v<MT4> )
11837 ?( ( IsLower_v<MT5> )
11838 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11839 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11840 :( IsLower_v<MT5> ? j : 0UL ) );
11841 const size_t kend( ( IsLower_v<MT4> )
11842 ?( ( IsUpper_v<MT5> )
11843 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
11844 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
11845 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
11847 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11849 for(
size_t k=kbegin; k<kend; ++k ) {
11850 const SIMDType a1(
set( A(i,k) ) );
11851 xmm1 += a1 * B.load(k,j );
11852 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11853 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11854 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
11855 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
11856 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
11857 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
11858 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
11861 C.store( i, j , C.load(i,j ) - xmm1 * factor );
11877 for( ; (i+2UL) <= M; i+=2UL )
11879 const size_t kbegin( ( IsUpper_v<MT4> )
11880 ?( ( IsLower_v<MT5> )
11881 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11882 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11883 :( IsLower_v<MT5> ? j : 0UL ) );
11884 const size_t kend( ( IsLower_v<MT4> )
11885 ?( ( IsUpper_v<MT5> )
11886 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
11887 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11888 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
11890 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
11892 for(
size_t k=kbegin; k<kend; ++k ) {
11893 const SIMDType a1(
set( A(i ,k) ) );
11894 const SIMDType a2(
set( A(i+1UL,k) ) );
11895 const SIMDType b1( B.load(k,j ) );
11896 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
11897 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
11898 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
11899 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
11912 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11917 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
11919 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm8 * factor );
11920 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm9 * factor );
11921 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) - xmm10 * factor );
11926 const size_t kbegin( ( IsUpper_v<MT4> )
11927 ?( ( IsLower_v<MT5> )
11928 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11929 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11930 :( IsLower_v<MT5> ? j : 0UL ) );
11931 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
11933 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
11935 for(
size_t k=kbegin; k<kend; ++k ) {
11936 const SIMDType a1(
set( A(i,k) ) );
11937 xmm1 += a1 * B.load(k,j );
11938 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11939 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11940 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
11941 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
11944 C.store( i, j , C.load(i,j ) - xmm1 * factor );
11956 for( ; (i+2UL) <= M; i+=2UL )
11958 const size_t kbegin( ( IsUpper_v<MT4> )
11959 ?( ( IsLower_v<MT5> )
11960 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11961 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11962 :( IsLower_v<MT5> ? j : 0UL ) );
11963 const size_t kend( ( IsLower_v<MT4> )
11964 ?( ( IsUpper_v<MT5> )
11965 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
11966 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11967 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
11969 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
11971 for(
size_t k=kbegin; k<kend; ++k ) {
11972 const SIMDType a1(
set( A(i ,k) ) );
11973 const SIMDType a2(
set( A(i+1UL,k) ) );
11974 const SIMDType b1( B.load(k,j ) );
11975 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
11976 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
11977 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
11988 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
11992 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
11994 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm7 * factor );
11995 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm8 * factor );
12000 const size_t kbegin( ( IsUpper_v<MT4> )
12001 ?( ( IsLower_v<MT5> )
12002 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12003 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12004 :( IsLower_v<MT5> ? j : 0UL ) );
12005 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
12007 SIMDType xmm1, xmm2, xmm3, xmm4;
12009 for(
size_t k=kbegin; k<kend; ++k ) {
12010 const SIMDType a1(
set( A(i,k) ) );
12011 xmm1 += a1 * B.load(k,j );
12012 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
12013 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
12014 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
12017 C.store( i, j , C.load(i,j ) - xmm1 * factor );
12028 for( ; (i+2UL) <= M; i+=2UL )
12030 const size_t kbegin( ( IsUpper_v<MT4> )
12031 ?( ( IsLower_v<MT5> )
12032 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12033 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12034 :( IsLower_v<MT5> ? j : 0UL ) );
12035 const size_t kend( ( IsLower_v<MT4> )
12036 ?( ( IsUpper_v<MT5> )
12037 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
12038 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12039 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
12041 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12043 for(
size_t k=kbegin; k<kend; ++k ) {
12044 const SIMDType a1(
set( A(i ,k) ) );
12045 const SIMDType a2(
set( A(i+1UL,k) ) );
12046 const SIMDType b1( B.load(k,j ) );
12047 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
12048 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
12057 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12060 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
12062 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm6 * factor );
12067 const size_t kbegin( ( IsUpper_v<MT4> )
12068 ?( ( IsLower_v<MT5> )
12069 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12070 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12071 :( IsLower_v<MT5> ? j : 0UL ) );
12072 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
12074 SIMDType xmm1, xmm2, xmm3;
12076 for(
size_t k=kbegin; k<kend; ++k ) {
12077 const SIMDType a1(
set( A(i,k) ) );
12078 xmm1 += a1 * B.load(k,j );
12079 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
12080 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
12083 C.store( i, j , C.load(i,j ) - xmm1 * factor );
12091 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
12092 size_t i( LOW ? j : 0UL );
12094 for( ; (i+4UL) <= iend; i+=4UL )
12096 const size_t kbegin( ( IsUpper_v<MT4> )
12097 ?( ( IsLower_v<MT5> )
12098 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12099 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12100 :( IsLower_v<MT5> ? j : 0UL ) );
12101 const size_t kend( ( IsLower_v<MT4> )
12102 ?( ( IsUpper_v<MT5> )
12103 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
12104 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
12105 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
12107 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12109 for(
size_t k=kbegin; k<kend; ++k ) {
12110 const SIMDType a1(
set( A(i ,k) ) );
12111 const SIMDType a2(
set( A(i+1UL,k) ) );
12112 const SIMDType a3(
set( A(i+2UL,k) ) );
12113 const SIMDType a4(
set( A(i+3UL,k) ) );
12114 const SIMDType b1( B.load(k,j ) );
12115 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
12126 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12128 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
12130 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
12132 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
12136 for( ; (i+3UL) <= iend; i+=3UL )
12138 const size_t kbegin( ( IsUpper_v<MT4> )
12139 ?( ( IsLower_v<MT5> )
12140 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12141 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12142 :( IsLower_v<MT5> ? j : 0UL ) );
12143 const size_t kend( ( IsLower_v<MT4> )
12144 ?( ( IsUpper_v<MT5> )
12145 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
12146 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
12147 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
12149 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12151 for(
size_t k=kbegin; k<kend; ++k ) {
12152 const SIMDType a1(
set( A(i ,k) ) );
12153 const SIMDType a2(
set( A(i+1UL,k) ) );
12154 const SIMDType a3(
set( A(i+2UL,k) ) );
12155 const SIMDType b1( B.load(k,j ) );
12156 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
12165 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12167 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
12169 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
12173 for( ; (i+2UL) <= iend; i+=2UL )
12175 const size_t kbegin( ( IsUpper_v<MT4> )
12176 ?( ( IsLower_v<MT5> )
12177 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12178 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12179 :( IsLower_v<MT5> ? j : 0UL ) );
12180 const size_t kend( ( IsLower_v<MT4> )
12181 ?( ( IsUpper_v<MT5> )
12182 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
12183 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12184 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
12186 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12187 size_t k( kbegin );
12189 for( ; (k+2UL) <= kend; k+=2UL ) {
12190 const SIMDType a1(
set( A(i ,k ) ) );
12191 const SIMDType a2(
set( A(i+1UL,k ) ) );
12192 const SIMDType a3(
set( A(i ,k+1UL) ) );
12193 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
12194 const SIMDType b1( B.load(k ,j ) );
12195 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
12196 const SIMDType b3( B.load(k+1UL,j ) );
12197 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
12208 for( ; k<kend; ++k ) {
12209 const SIMDType a1(
set( A(i ,k) ) );
12210 const SIMDType a2(
set( A(i+1UL,k) ) );
12211 const SIMDType b1( B.load(k,j ) );
12212 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
12219 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
12221 C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
12222 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) - (xmm4+xmm8) * factor );
12227 const size_t kbegin( ( IsUpper_v<MT4> )
12228 ?( ( IsLower_v<MT5> )
12229 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12230 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12231 :( IsLower_v<MT5> ? j : 0UL ) );
12232 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
12234 SIMDType xmm1, xmm2, xmm3, xmm4;
12235 size_t k( kbegin );
12237 for( ; (k+2UL) <= kend; k+=2UL ) {
12238 const SIMDType a1(
set( A(i,k ) ) );
12239 const SIMDType a2(
set( A(i,k+1UL) ) );
12240 xmm1 += a1 * B.load(k ,j );
12241 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
12242 xmm3 += a2 * B.load(k+1UL,j );
12243 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
12246 for( ; k<kend; ++k ) {
12247 const SIMDType a1(
set( A(i,k) ) );
12248 xmm1 += a1 * B.load(k,j );
12249 xmm2 += a1 * B.load(k,j+
SIMDSIZE);
12252 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
12259 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
12260 size_t i( LOW ? j : 0UL );
12262 for( ; (i+4UL) <= iend; i+=4UL )
12264 const size_t kbegin( ( IsUpper_v<MT4> )
12265 ?( ( IsLower_v<MT5> )
12266 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12267 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12268 :( IsLower_v<MT5> ? j : 0UL ) );
12269 const size_t kend( ( IsLower_v<MT4> )
12270 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
12273 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12274 size_t k( kbegin );
12276 for( ; (k+2UL) <= kend; k+=2UL ) {
12277 const SIMDType b1( B.load(k ,j) );
12278 const SIMDType b2( B.load(k+1UL,j) );
12279 xmm1 +=
set( A(i ,k ) ) * b1;
12280 xmm2 +=
set( A(i+1UL,k ) ) * b1;
12281 xmm3 +=
set( A(i+2UL,k ) ) * b1;
12282 xmm4 +=
set( A(i+3UL,k ) ) * b1;
12283 xmm5 +=
set( A(i ,k+1UL) ) * b2;
12284 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
12285 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
12286 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
12289 for( ; k<kend; ++k ) {
12290 const SIMDType b1( B.load(k,j) );
12291 xmm1 +=
set( A(i ,k) ) * b1;
12292 xmm2 +=
set( A(i+1UL,k) ) * b1;
12293 xmm3 +=
set( A(i+2UL,k) ) * b1;
12294 xmm4 +=
set( A(i+3UL,k) ) * b1;
12297 C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
12298 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
12299 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
12300 C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
12303 for( ; (i+3UL) <= iend; i+=3UL )
12305 const size_t kbegin( ( IsUpper_v<MT4> )
12306 ?( ( IsLower_v<MT5> )
12307 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12308 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12309 :( IsLower_v<MT5> ? j : 0UL ) );
12310 const size_t kend( ( IsLower_v<MT4> )
12311 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
12314 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12315 size_t k( kbegin );
12317 for( ; (k+2UL) <= kend; k+=2UL ) {
12318 const SIMDType b1( B.load(k ,j) );
12319 const SIMDType b2( B.load(k+1UL,j) );
12320 xmm1 +=
set( A(i ,k ) ) * b1;
12321 xmm2 +=
set( A(i+1UL,k ) ) * b1;
12322 xmm3 +=
set( A(i+2UL,k ) ) * b1;
12323 xmm4 +=
set( A(i ,k+1UL) ) * b2;
12324 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
12325 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
12328 for( ; k<kend; ++k ) {
12329 const SIMDType b1( B.load(k,j) );
12330 xmm1 +=
set( A(i ,k) ) * b1;
12331 xmm2 +=
set( A(i+1UL,k) ) * b1;
12332 xmm3 +=
set( A(i+2UL,k) ) * b1;
12335 C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
12336 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
12337 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
12340 for( ; (i+2UL) <= iend; i+=2UL )
12342 const size_t kbegin( ( IsUpper_v<MT4> )
12343 ?( ( IsLower_v<MT5> )
12344 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12345 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12346 :( IsLower_v<MT5> ? j : 0UL ) );
12347 const size_t kend( ( IsLower_v<MT4> )
12348 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
12351 SIMDType xmm1, xmm2, xmm3, xmm4;
12352 size_t k( kbegin );
12354 for( ; (k+2UL) <= kend; k+=2UL ) {
12355 const SIMDType b1( B.load(k ,j) );
12356 const SIMDType b2( B.load(k+1UL,j) );
12357 xmm1 +=
set( A(i ,k ) ) * b1;
12358 xmm2 +=
set( A(i+1UL,k ) ) * b1;
12359 xmm3 +=
set( A(i ,k+1UL) ) * b2;
12360 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
12363 for( ; k<kend; ++k ) {
12364 const SIMDType b1( B.load(k,j) );
12365 xmm1 +=
set( A(i ,k) ) * b1;
12366 xmm2 +=
set( A(i+1UL,k) ) * b1;
12369 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
12370 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
12375 const size_t kbegin( ( IsUpper_v<MT4> )
12376 ?( ( IsLower_v<MT5> )
12377 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12378 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12379 :( IsLower_v<MT5> ? j : 0UL ) );
12381 SIMDType xmm1, xmm2;
12382 size_t k( kbegin );
12384 for( ; (k+2UL) <= K; k+=2UL ) {
12385 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
12386 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
12389 for( ; k<K; ++k ) {
12390 xmm1 +=
set( A(i,k) ) * B.load(k,j);
12393 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
12397 for( ; remainder && j<N; ++j )
12399 const size_t iend( UPP ? j+1UL : M );
12400 size_t i( LOW ? j : 0UL );
12402 for( ; (i+2UL) <= iend; i+=2UL )
12404 const size_t kbegin( ( IsUpper_v<MT4> )
12405 ?( ( IsLower_v<MT5> )
12406 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12407 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12408 :( IsLower_v<MT5> ? j : 0UL ) );
12409 const size_t kend( ( IsLower_v<MT4> )
12410 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
12416 for(
size_t k=kbegin; k<kend; ++k ) {
12417 value1 += A(i ,k) * B(k,j);
12418 value2 += A(i+1UL,k) * B(k,j);
12421 C(i ,j) -= value1 * scalar;
12422 C(i+1UL,j) -= value2 * scalar;
12427 const size_t kbegin( ( IsUpper_v<MT4> )
12428 ?( ( IsLower_v<MT5> )
12429 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12430 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12431 :( IsLower_v<MT5> ? j : 0UL ) );
12435 for(
size_t k=kbegin; k<K; ++k ) {
12436 value += A(i,k) * B(k,j);
12439 C(i,j) -= value * scalar;
// Vectorized small-matrix subtraction-assignment kernel, column-major target.
// Selected when MT3 is a column-major matrix and UseVectorizedDefaultKernel_v holds.
// Every visible tile accumulates products A.load(i+offset,k) * set(B(k,j)) over k and
// then updates C in place as C.load(i,j) - accumulator * factor (scalar tail: C -= value * scalar).
// \param C      target matrix, read and written through load()/store() and operator().
// \param A      left-hand side operand (MT4).
// \param B      right-hand side operand (MT5).
// \param scalar scaling factor applied to the accumulated product.
// NOTE(review): the enclosing loops over the row index i, several accumulator updates and
// store statements are not visible in this listing; the comments below cover only what is shown.
12460 template<
typename MT3
12464 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12465 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A remainder pass is needed unless both C's type (MT3) and A's type (MT4) are padded.
12467 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
12469 const size_t M( A.rows() );
12470 const size_t N( B.columns() );
12471 const size_t K( A.columns() );
// With a remainder, ipos is M with the low bits cleared (M & -SIMDSIZE); otherwise M itself.
// presumably SIMDSIZE is a power of two, making ipos a multiple of SIMDSIZE - TODO confirm
12475 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// SIMD value built from the scalar; multiplied into every accumulator before the subtraction.
12478 const SIMDType factor(
set( scalar ) );
// Integral element types only: tile of 8 SIMD vectors (rows i .. i+SIMDSIZE*7) by one column j.
12482 if( IsIntegral_v<ElementType> )
12485 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend narrow the k range using the lower/upper (strictly or not) traits of MT4 and MT5.
12487 const size_t kbegin( ( IsLower_v<MT5> )
12488 ?( ( IsUpper_v<MT4> )
12489 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12490 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12491 :( IsUpper_v<MT4> ? i : 0UL ) );
12492 const size_t kend( ( IsUpper_v<MT5> )
12493 ?( ( IsLower_v<MT4> )
12494 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
12495 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
12496 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
12498 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12500 for(
size_t k=kbegin; k<kend; ++k ) {
12501 const SIMDType b1(
set( B(k,j) ) );
12502 xmm1 += A.load(i ,k) * b1;
12503 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12504 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12505 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
12506 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
12507 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
12508 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
12509 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
12512 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Tile of 5 SIMD vectors by two columns (j, j+1); accumulators xmm1..xmm10.
12528 for( ; (j+2UL) <= N; j+=2UL )
12530 const size_t kbegin( ( IsLower_v<MT5> )
12531 ?( ( IsUpper_v<MT4> )
12532 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12533 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12534 :( IsUpper_v<MT4> ? i : 0UL ) );
12535 const size_t kend( ( IsUpper_v<MT5> )
12536 ?( ( IsLower_v<MT4> )
12537 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12538 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12539 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
12541 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
12543 for(
size_t k=kbegin; k<kend; ++k ) {
12544 const SIMDType a1( A.load(i ,k) );
12545 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
12546 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
12547 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
12548 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
12549 const SIMDType b1(
set( B(k,j ) ) );
12550 const SIMDType b2(
set( B(k,j+1UL) ) );
12563 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12568 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
12570 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
12571 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
12572 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
// 5 SIMD vectors by a single column; presumably the column left over by the pair loop - TODO confirm.
12577 const size_t kbegin( ( IsLower_v<MT5> )
12578 ?( ( IsUpper_v<MT4> )
12579 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12580 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12581 :( IsUpper_v<MT4> ? i : 0UL ) );
12582 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
12584 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
12586 for(
size_t k=kbegin; k<kend; ++k ) {
12587 const SIMDType b1(
set( B(k,j) ) );
12588 xmm1 += A.load(i ,k) * b1;
12589 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12590 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12591 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
12592 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
12595 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Tile of 4 SIMD vectors by two columns; accumulators xmm1..xmm8.
12607 for( ; (j+2UL) <= N; j+=2UL )
12609 const size_t kbegin( ( IsLower_v<MT5> )
12610 ?( ( IsUpper_v<MT4> )
12611 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12612 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12613 :( IsUpper_v<MT4> ? i : 0UL ) );
12614 const size_t kend( ( IsUpper_v<MT5> )
12615 ?( ( IsLower_v<MT4> )
12616 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12617 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12618 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
12620 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12622 for(
size_t k=kbegin; k<kend; ++k ) {
12623 const SIMDType a1( A.load(i ,k) );
12624 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
12625 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
12626 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
12627 const SIMDType b1(
set( B(k,j ) ) );
12628 const SIMDType b2(
set( B(k,j+1UL) ) );
12639 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12643 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
12645 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
12646 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// 4 SIMD vectors by a single column.
12651 const size_t kbegin( ( IsLower_v<MT5> )
12652 ?( ( IsUpper_v<MT4> )
12653 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12654 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12655 :( IsUpper_v<MT4> ? i : 0UL ) );
12656 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
12658 SIMDType xmm1, xmm2, xmm3, xmm4;
12660 for(
size_t k=kbegin; k<kend; ++k ) {
12661 const SIMDType b1(
set( B(k,j) ) );
12662 xmm1 += A.load(i ,k) * b1;
12663 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12664 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12665 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
12668 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Tile of 3 SIMD vectors by two columns; accumulators xmm1..xmm6.
12679 for( ; (j+2UL) <= N; j+=2UL )
12681 const size_t kbegin( ( IsLower_v<MT5> )
12682 ?( ( IsUpper_v<MT4> )
12683 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12684 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12685 :( IsUpper_v<MT4> ? i : 0UL ) );
12686 const size_t kend( ( IsUpper_v<MT5> )
12687 ?( ( IsLower_v<MT4> )
12688 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12689 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12690 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
12692 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12694 for(
size_t k=kbegin; k<kend; ++k ) {
12695 const SIMDType a1( A.load(i ,k) );
12696 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
12697 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
12698 const SIMDType b1(
set( B(k,j ) ) );
12699 const SIMDType b2(
set( B(k,j+1UL) ) );
12708 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12711 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
12713 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
// 3 SIMD vectors by a single column.
12718 const size_t kbegin( ( IsLower_v<MT5> )
12719 ?( ( IsUpper_v<MT4> )
12720 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12721 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12722 :( IsUpper_v<MT4> ? i : 0UL ) );
12723 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
12725 SIMDType xmm1, xmm2, xmm3;
12727 for(
size_t k=kbegin; k<kend; ++k ) {
12728 const SIMDType b1(
set( B(k,j) ) );
12729 xmm1 += A.load(i ,k) * b1;
12730 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
12731 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
12734 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Tiles of 2 SIMD vectors (rows i and i+SIMDSIZE). From here on the column range is clipped
// by the LOW/UPP flags: it starts at i when UPP and ends at min(i+SIMDSIZE*2,N) when LOW.
12742 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
12743 size_t j( UPP ? i : 0UL );
// 2 SIMD vectors by four columns.
12745 for( ; (j+4UL) <= jend; j+=4UL )
12747 const size_t kbegin( ( IsLower_v<MT5> )
12748 ?( ( IsUpper_v<MT4> )
12749 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12750 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12751 :( IsUpper_v<MT4> ? i : 0UL ) );
12752 const size_t kend( ( IsUpper_v<MT5> )
12753 ?( ( IsLower_v<MT4> )
12754 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
12755 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
12756 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12758 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12760 for(
size_t k=kbegin; k<kend; ++k ) {
12761 const SIMDType a1( A.load(i ,k) );
12762 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
12763 const SIMDType b1(
set( B(k,j ) ) );
12764 const SIMDType b2(
set( B(k,j+1UL) ) );
12765 const SIMDType b3(
set( B(k,j+2UL) ) );
12766 const SIMDType b4(
set( B(k,j+3UL) ) );
12777 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12779 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12781 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
12783 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
// 2 SIMD vectors by three columns.
12787 for( ; (j+3UL) <= jend; j+=3UL )
12789 const size_t kbegin( ( IsLower_v<MT5> )
12790 ?( ( IsUpper_v<MT4> )
12791 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12792 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12793 :( IsUpper_v<MT4> ? i : 0UL ) );
12794 const size_t kend( ( IsUpper_v<MT5> )
12795 ?( ( IsLower_v<MT4> )
12796 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
12797 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
12798 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12800 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12802 for(
size_t k=kbegin; k<kend; ++k ) {
12803 const SIMDType a1( A.load(i ,k) );
12804 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
12805 const SIMDType b1(
set( B(k,j ) ) );
12806 const SIMDType b2(
set( B(k,j+1UL) ) );
12807 const SIMDType b3(
set( B(k,j+2UL) ) );
12816 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12818 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
12820 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
// 2 SIMD vectors by two columns, with the k loop unrolled by two: xmm1..xmm4 collect the
// even step, xmm5..xmm8 the odd step, and the pairs are summed just before the store.
12824 for( ; (j+2UL) <= jend; j+=2UL )
12826 const size_t kbegin( ( IsLower_v<MT5> )
12827 ?( ( IsUpper_v<MT4> )
12828 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12829 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12830 :( IsUpper_v<MT4> ? i : 0UL ) );
12831 const size_t kend( ( IsUpper_v<MT5> )
12832 ?( ( IsLower_v<MT4> )
12833 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12834 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12835 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12837 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12838 size_t k( kbegin );
12840 for( ; (k+2UL) <= kend; k+=2UL ) {
12841 const SIMDType a1( A.load(i ,k ) );
12842 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
12843 const SIMDType a3( A.load(i ,k+1UL) );
12844 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
12845 const SIMDType b1(
set( B(k ,j ) ) );
12846 const SIMDType b2(
set( B(k ,j+1UL) ) );
12847 const SIMDType b3(
set( B(k+1UL,j ) ) );
12848 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Tail of the unrolled k loop: handles a possibly remaining single k step.
12859 for( ; k<kend; ++k ) {
12860 const SIMDType a1( A.load(i ,k) );
12861 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
12862 const SIMDType b1(
set( B(k,j ) ) );
12863 const SIMDType b2(
set( B(k,j+1UL) ) );
12870 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
12872 C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
12873 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
// 2 SIMD vectors by a single column, k unrolled by two.
12878 const size_t kbegin( ( IsLower_v<MT5> )
12879 ?( ( IsUpper_v<MT4> )
12880 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12881 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12882 :( IsUpper_v<MT4> ? i : 0UL ) );
12883 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
12885 SIMDType xmm1, xmm2, xmm3, xmm4;
12886 size_t k( kbegin );
12888 for( ; (k+2UL) <= kend; k+=2UL ) {
12889 const SIMDType b1(
set( B(k ,j) ) );
12890 const SIMDType b2(
set( B(k+1UL,j) ) );
12891 xmm1 += A.load(i ,k ) * b1;
12892 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
12893 xmm3 += A.load(i ,k+1UL) * b2;
12894 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
12897 for( ; k<kend; ++k ) {
12898 const SIMDType b1(
set( B(k,j) ) );
12899 xmm1 += A.load(i ,k) * b1;
12900 xmm2 += A.load(i+
SIMDSIZE,k) * b1;
12903 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
// Tiles of a single SIMD vector. The column range is clipped to min(i+SIMDSIZE,N) only when
// both LOW and UPP are set; it starts at i when UPP.
12910 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
12911 size_t j( UPP ? i : 0UL );
// 1 SIMD vector by four columns, k unrolled by two (xmm1..4 even step, xmm5..8 odd step).
12913 for( ; (j+4UL) <= jend; j+=4UL )
12915 const size_t kbegin( ( IsLower_v<MT5> )
12916 ?( ( IsUpper_v<MT4> )
12917 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12918 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12919 :( IsUpper_v<MT4> ? i : 0UL ) );
12920 const size_t kend( ( IsUpper_v<MT5> )
12921 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
12924 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
12925 size_t k( kbegin );
12927 for( ; (k+2UL) <= kend; k+=2UL ) {
12928 const SIMDType a1( A.load(i,k ) );
12929 const SIMDType a2( A.load(i,k+1UL) );
12930 xmm1 += a1 *
set( B(k ,j ) );
12931 xmm2 += a1 *
set( B(k ,j+1UL) );
12932 xmm3 += a1 *
set( B(k ,j+2UL) );
12933 xmm4 += a1 *
set( B(k ,j+3UL) );
12934 xmm5 += a2 *
set( B(k+1UL,j ) );
12935 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
12936 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
12937 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
12940 for( ; k<kend; ++k ) {
12941 const SIMDType a1( A.load(i,k) );
12942 xmm1 += a1 *
set( B(k,j ) );
12943 xmm2 += a1 *
set( B(k,j+1UL) );
12944 xmm3 += a1 *
set( B(k,j+2UL) );
12945 xmm4 += a1 *
set( B(k,j+3UL) );
12948 C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
12949 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
12950 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
12951 C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
// 1 SIMD vector by three columns, k unrolled by two.
12954 for( ; (j+3UL) <= jend; j+=3UL )
12956 const size_t kbegin( ( IsLower_v<MT5> )
12957 ?( ( IsUpper_v<MT4> )
12958 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12959 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12960 :( IsUpper_v<MT4> ? i : 0UL ) );
12961 const size_t kend( ( IsUpper_v<MT5> )
12962 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
12965 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
12966 size_t k( kbegin );
12968 for( ; (k+2UL) <= kend; k+=2UL ) {
12969 const SIMDType a1( A.load(i,k ) );
12970 const SIMDType a2( A.load(i,k+1UL) );
12971 xmm1 += a1 *
set( B(k ,j ) );
12972 xmm2 += a1 *
set( B(k ,j+1UL) );
12973 xmm3 += a1 *
set( B(k ,j+2UL) );
12974 xmm4 += a2 *
set( B(k+1UL,j ) );
12975 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
12976 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
12979 for( ; k<kend; ++k ) {
12980 const SIMDType a1( A.load(i,k) );
12981 xmm1 += a1 *
set( B(k,j ) );
12982 xmm2 += a1 *
set( B(k,j+1UL) );
12983 xmm3 += a1 *
set( B(k,j+2UL) );
12986 C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
12987 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
12988 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
// 1 SIMD vector by two columns, k unrolled by two.
12991 for( ; (j+2UL) <= jend; j+=2UL )
12993 const size_t kbegin( ( IsLower_v<MT5> )
12994 ?( ( IsUpper_v<MT4> )
12995 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12996 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12997 :( IsUpper_v<MT4> ? i : 0UL ) );
12998 const size_t kend( ( IsUpper_v<MT5> )
12999 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
13002 SIMDType xmm1, xmm2, xmm3, xmm4;
13003 size_t k( kbegin );
13005 for( ; (k+2UL) <= kend; k+=2UL ) {
13006 const SIMDType a1( A.load(i,k ) );
13007 const SIMDType a2( A.load(i,k+1UL) );
13008 xmm1 += a1 *
set( B(k ,j ) );
13009 xmm2 += a1 *
set( B(k ,j+1UL) );
13010 xmm3 += a2 *
set( B(k+1UL,j ) );
13011 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
13014 for( ; k<kend; ++k ) {
13015 const SIMDType a1( A.load(i,k) );
13016 xmm1 += a1 *
set( B(k,j ) );
13017 xmm2 += a1 *
set( B(k,j+1UL) );
13020 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
13021 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
// 1 SIMD vector by a single column; no kend is computed here, the k loops run up to K.
13026 const size_t kbegin( ( IsLower_v<MT5> )
13027 ?( ( IsUpper_v<MT4> )
13028 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13029 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13030 :( IsUpper_v<MT4> ? i : 0UL ) );
13032 SIMDType xmm1, xmm2;
13033 size_t k( kbegin );
13035 for( ; (k+2UL) <= K; k+=2UL ) {
13036 xmm1 += A.load(i,k ) *
set( B(k ,j) );
13037 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
13040 for( ; k<K; ++k ) {
13041 xmm1 += A.load(i,k) *
set( B(k,j) );
13044 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
// Scalar remainder pass: runs only when `remainder` is true and handles the rows up to M
// element by element (two columns per step, then the single-column code below).
13048 for( ; remainder && i<M; ++i )
13050 const size_t jend( LOW ? i+1UL : N );
13051 size_t j( UPP ? i : 0UL );
13053 for( ; (j+2UL) <= jend; j+=2UL )
13055 const size_t kbegin( ( IsLower_v<MT5> )
13056 ?( ( IsUpper_v<MT4> )
13057 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13058 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13059 :( IsUpper_v<MT4> ? i : 0UL ) );
13060 const size_t kend( ( IsUpper_v<MT5> )
13061 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
13067 for(
size_t k=kbegin; k<kend; ++k ) {
13068 value1 += A(i,k) * B(k,j );
13069 value2 += A(i,k) * B(k,j+1UL);
13072 C(i,j ) -= value1 * scalar;
13073 C(i,j+1UL) -= value2 * scalar;
// Scalar single-column case; the k loop runs from kbegin to K.
13078 const size_t kbegin( ( IsLower_v<MT5> )
13079 ?( ( IsUpper_v<MT4> )
13080 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13081 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13082 :( IsUpper_v<MT4> ? i : 0UL ) );
13086 for(
size_t k=kbegin; k<K; ++k ) {
13087 value += A(i,k) * B(k,j);
13090 C(i,j) -= value * scalar;
// Large-matrix subtraction-assignment kernel, fallback overload.
// Participates in overload resolution only when UseVectorizedDefaultKernel_v is false for
// the given types (DisableIf_t); it forwards all arguments unchanged to
// selectDefaultSubAssignKernel.
// \param C      target matrix.
// \param A      left-hand side operand.
// \param B      right-hand side operand.
// \param scalar scaling factor.
13110 template<
typename MT3
13114 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
13115 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
13117 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel, vectorized overload.
// Enabled when UseVectorizedDefaultKernel_v is true for the given types. Dispatches to one
// of lmmm / ummm / mmm, always passing the negated scalar and ST2(1) as the trailing arguments.
// NOTE(review): the conditions that choose between the three calls are not visible here;
// presumably lmmm/ummm are the lower/upper variants - TODO confirm.
// NOTE(review): the two trailing arguments look like the multipliers for A*B and for the
// existing C (i.e. C = -scalar*A*B + 1*C) - confirm against the mmm declaration.
13136 template<
typename MT3
13140 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
13141 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
13144 lmmm( C, A, B, -scalar, ST2(1) );
13146 ummm( C, A, B, -scalar, ST2(1) );
13148 mmm( C, A, B, -scalar, ST2(1) );
// BLAS subtraction-assignment kernel, fallback overload.
// Participates in overload resolution only when UseBlasKernel_v is false for the given
// types (DisableIf_t); it forwards all arguments unchanged to selectLargeSubAssignKernel.
// \param C      target matrix.
// \param A      left-hand side operand.
// \param B      right-hand side operand.
// \param scalar scaling factor.
13166 template<
typename MT3
13170 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
13171 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
13173 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-based subtraction-assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero and enabled when UseBlasKernel_v
// is true for the given types.
// \param C      target matrix.
// \param A      left-hand side operand.
// \param B      right-hand side operand.
// \param scalar scaling factor, converted to the element type ET of MT3 before the BLAS call.
13178 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 13192 template<
typename MT3
13196 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
13197 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
13199 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, run trmm on it with A on the left side
// (lower or upper according to IsLower_v<MT4>) and the positive scalar, then subtract
// the temporary from C.
13201 if( IsTriangular_v<MT4> ) {
13202 ResultType_t<MT3> tmp(
serial( B ) );
13203 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
13204 subAssign( C, tmp );
// Triangular B: same scheme with a temporary copy of A and B applied from the right side.
13206 else if( IsTriangular_v<MT5> ) {
13207 ResultType_t<MT3> tmp(
serial( A ) );
13208 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
13209 subAssign( C, tmp );
// General case: a single gemm call with the negated scalar and ET(1).
// presumably these are alpha and beta, giving C = -scalar*A*B + C - TODO confirm
13212 gemm( C, A, B,
ET(-scalar),
ET(1) );
// Schur-product assignment of a DMatScalarMultExpr (rhs) to a dense matrix (lhs).
// The only visible statement applies schurAssign to the unwrapped target (~lhs) and `tmp`.
// NOTE(review): the construction of `tmp` is not visible in this listing; presumably it
// holds the evaluated right-hand side - TODO confirm.
13234 template<
typename MT
13236 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
13248 schurAssign( ~lhs, tmp );
// NOTE(review): the function names of the following overloads are not visible in this
// listing. The first four visible return types are enabled only when
// IsEvaluationRequired_v<MT,MT1,MT2> holds.
// First overload: fetches both operands of rhs.matrix_, tests for an empty target
// (zero rows or columns) and, separately, for a zero inner dimension (left.columns() == 0),
// and otherwise calls smpAssign on the unwrapped target with A * B * rhs.scalar_.
13279 template<
typename MT
13282 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13289 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13290 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13292 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
13295 else if( left.columns() == 0UL ) {
13310 smpAssign( ~lhs, A * B * rhs.scalar_ );
// Second overload: evaluates rhs into a temporary whose type is ResultType when SO is
// true and OppositeType otherwise. How `fwd` and `tmp` are used is not visible here.
13329 template<
typename MT
13332 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13336 using TmpType = If_t< SO, ResultType, OppositeType >;
13348 const ForwardFunctor fwd;
13350 const TmpType tmp( rhs );
// Third and fourth overloads: a single test covers an empty target and a zero inner dimension.
13370 template<
typename MT
13373 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13380 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13381 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13383 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
13420 template<
typename MT
13423 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13430 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13431 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13433 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
13467 template<
typename MT
// Free function template returning ReturnType constructed from the unwrapped operands.
// NOTE(review): looks like the multiplication operator that creates the expression - confirm.
13550 template<
typename MT1
13552 inline decltype(
auto)
13562 return ReturnType( ~lhs, ~rhs );
// Re-creates the given multiplication expression with the symmetric flag (SF) forced to
// true; the HF, LF and UF flags and both operands are carried over unchanged.
// \param dm the multiplication expression to re-declare.
// \return a const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF> built from dm's operands.
13600 template<
typename MT1
13606 inline decltype(
auto)
declsym( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13614 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
13615 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Re-creates the given multiplication expression with the Hermitian flag (HF) forced to
// true; the SF, LF and UF flags and both operands are carried over unchanged.
// \param dm the multiplication expression to re-declare.
// \return a const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF> built from dm's operands.
13646 template<
typename MT1
13652 inline decltype(
auto)
declherm( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13660 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
13661 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Re-creates the given multiplication expression with the lower flag (LF) forced to
// true; the SF, HF and UF flags and both operands are carried over unchanged.
// \param dm the multiplication expression to re-declare.
// \return a const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF> built from dm's operands.
13692 template<
typename MT1
13698 inline decltype(
auto)
decllow( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13706 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
13707 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Re-creates the given multiplication expression with the upper flag (UF) forced to
// true; the SF, HF and LF flags and both operands are carried over unchanged.
// \param dm the multiplication expression to re-declare.
// \return a const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true> built from dm's operands.
13738 template<
typename MT1
13744 inline decltype(
auto)
declupp( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13752 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
13753 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Re-creates the given multiplication expression with both the lower (LF) and the upper
// (UF) flag forced to true; the SF and HF flags and both operands are carried over unchanged.
// \param dm the multiplication expression to re-declare.
// \return a const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true> built from dm's operands.
13784 template<
typename MT1
13790 inline decltype(
auto)
decldiag( const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
13798 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
13799 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for dimension 0 of the multiplication expression: inherits the
// dimension-0 Size of the left-hand side operand type MT1.
13815 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13816 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13817 :
public Size<MT1,0UL>
// Size specialization for dimension 1 of the multiplication expression: inherits the
// dimension-1 Size of the right-hand side operand type MT2.
13820 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13821 struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13822 :
public Size<MT2,1UL>
// IsAligned specialization for the multiplication expression: the expression counts as
// aligned only if both operand types MT1 and MT2 are aligned.
13838 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13839 struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13840 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:166
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:272
Header file for basic type definitions.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:161
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:477
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:265
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
constexpr bool IsSIMDCombinable_v
Auxiliary variable template for the IsSIMDCombinable type trait. The IsSIMDCombinable_v variable templ...
Definition: IsSIMDCombinable.h:137
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:478
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:270
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:269
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
constexpr bool HasSIMDAdd_v
Auxiliary variable template for the HasSIMDAdd type trait. The HasSIMDAdd_v variable template provides...
Definition: HasSIMDAdd.h:187
Constraints on the storage order of matrix types.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:411
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:421
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:173
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:172
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:171
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:401
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:267
Header file for the IsComplexDouble type trait.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
Expression object for transpose dense matrix-dense matrix multiplications. The TDMatDMatMultExpr class...
Definition: Forward.h:167
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDMatMultExpr.h:289
Header file for the If class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:433
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
Header file for the decllow trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:271
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:391
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
Header file for the exception macros of the math module.
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:465
Constraint on the data type.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:445
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:375
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:284
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDMatMultExpr.h:296
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:311
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:170
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:455
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:326
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
constexpr bool HasSIMDMult_v
Auxiliary variable template for the HasSIMDMult type trait. The HasSIMDMult_v variable template provid...
Definition: HasSIMDMult.h:188
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:268
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:278
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDMatMultExpr.h:302
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:281
Header file for the DeclSym functor.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:275
Header file for the IsExpression type trait class.
Header file for the function trace functionality.