35#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDMATDMATMULTEXPR_H_
139template<
typename MT1
146 :
public MatMatMultExpr< DenseMatrix< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
// True when the left-hand operand type MT1 is a computation or requires evaluation.
161 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// True when the right-hand operand type MT2 is a computation or requires evaluation.
166 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the SF/HF/LF/UF flags.
// SYM holds only if SF is set and none of HF, LF, UF is set.
170 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
// HERM holds only if HF is set and neither LF nor UF is set.
171 static constexpr bool HERM = ( HF && !( LF || UF ) );
// LOW holds if LF is set, or if UF is combined with SF or HF.
172 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP holds if UF is set, or if LF is combined with SF or HF.
173 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Template parameter list taking three types.
// NOTE(review): the declaration this list introduces is not visible in this excerpt.
182 template<
typename T1,
typename T2,
typename T3 >
// Compile-time predicate over three types T1, T2, T3.
// The visible terms are all joined with &&; leading and trailing terms of the
// expression are not shown in this excerpt.
192 template<
typename T1,
typename T2,
typename T3 >
193 static constexpr bool UseBlasKernel_v =
// T1 must be contiguous with mutable data access; T2 and T3 contiguous with const data access.
196 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
197 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
198 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
// Neither T2 nor T3 may be diagonal.
199 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
// All three types must report simdEnabled.
200 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// The element types of all three must be BLAS compatible.
201 IsBLASCompatible_v< ElementType_t<T1> > &&
202 IsBLASCompatible_v< ElementType_t<T2> > &&
203 IsBLASCompatible_v< ElementType_t<T3> > &&
// Compile-time predicate over three types T1, T2, T3.
// The tail of the expression is not shown in this excerpt.
214 template<
typename T1,
typename T2,
typename T3 >
215 static constexpr bool UseVectorizedDefaultKernel_v =
// Requires the useOptimizedKernels switch.
216 ( useOptimizedKernels &&
// Neither T2 nor T3 may be diagonal.
217 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
// All three types must report simdEnabled.
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// A SIMD-combinability check on the element types follows; only its first argument is visible.
219 IsSIMDCombinable_v< ElementType_t<T1>
// Compile-time condition on the operand types MT1/MT2 and element types ET1/ET2.
// NOTE(review): the name of the constant this expression initializes is on a line
// not shown in this excerpt.
// Excludes the case where both operands are diagonal.
290 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
// Both operand types must report simdEnabled.
291 MT1::simdEnabled && MT2::simdEnabled &&
// SIMD addition and multiplication must exist for the element type pair.
292 HasSIMDAdd_v<ET1,ET2> &&
293 HasSIMDMult_v<ET1,ET2> );
// Structure-dependent branch on the operand types, using indices i and j.
// NOTE(review): the enclosing function signature and the branch bodies are not
// visible in this excerpt.
330 if( IsDiagonal_v<MT1> ) {
333 else if( IsDiagonal_v<MT2> ) {
336 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
// begin: lower bound of an index range.
// An upper MT1 contributes i (i+1 if strictly upper); a lower MT2 contributes
// j (j+1 if strictly lower); when both apply the larger value is taken.
337 const size_t begin( ( IsUpper_v<MT1> )
338 ?( ( IsLower_v<MT2> )
339 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
340 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
341 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
342 :( ( IsLower_v<MT2> )
343 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
// end: upper bound of the index range.
// A lower MT1 contributes i+1 (i if strictly lower); an upper MT2 contributes
// j+1 (j if strictly upper); when both apply the smaller value is taken;
// otherwise the column count of lhs_ is used.
345 const size_t end( ( IsLower_v<MT1> )
346 ?( ( IsUpper_v<MT2> )
347 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
348 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
349 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
350 :( ( IsUpper_v<MT2> )
351 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
352 :(
lhs_.columns() ) ) );
// Range checks: i against the row count of lhs_, j against the column count of rhs_.
// NOTE(review): the enclosing function and the action taken on failure are not
// visible in this excerpt.
376 if( i >=
lhs_.rows() ) {
379 if( j >=
rhs_.columns() ) {
// Size accessor taking no arguments, declared const noexcept and returning size_t.
391 inline size_t rows() const noexcept {
// NOTE(review): the body of rows() is not shown; the return statement below sits
// eleven original source lines later and presumably belongs to a separate
// accessor — confirm against the full file.
402 return rhs_.columns();
// Alias query for an arbitrary pointer type T.
// Returns true if either operand (lhs_ or rhs_) reports being aliased with the
// given address; note that it forwards to isAliased() on both operands.
432 template<
typename T >
433 inline bool canAlias(
const T* alias )
const noexcept {
434 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Alias query for an arbitrary pointer type T.
// Returns true if either operand (lhs_ or rhs_) reports being aliased with the
// given address.
444 template<
typename T >
445 inline bool isAliased(
const T* alias )
const noexcept {
446 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Returns true only if both operands report being aligned.
// NOTE(review): the enclosing function signature is not visible in this excerpt.
456 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of a boolean condition.
// NOTE(review): the enclosing function and the leading terms of the first
// parenthesized group are not visible in this excerpt.
// First group (as far as visible): the BLAS matrix/matrix multiplication switch
// is off, or rows()*columns() is below TDMATDMATMULT_THRESHOLD.
467 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
469 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
// In addition, rows()*columns() must reach SMP_TDMATDMATMULT_THRESHOLD.
470 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD ) &&
// Neither operand type may be diagonal.
471 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
// Fragment of an assignment routine taking a target `lhs` and an expression `rhs`.
// NOTE(review): the signature and most of the body are not visible in this excerpt.
494 template<
typename MT
// Special case: the target has no rows or no columns.
503 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
// Special case: the left operand of the expression has no columns.
506 else if( rhs.
lhs_.columns() == 0UL ) {
// Otherwise hand the target and the operands A and B to the kernel selection.
521 TDMatDMatMultExpr::selectAssignKernel( *lhs, A, B );
// Chooses between the small kernel and the BLAS kernel for C = A * B.
// C is the target; A and B are the left and right operands.
537 template<
typename MT3
540 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is chosen if any of the following holds:
//  - both operand types are diagonal;
//  - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns;
//  - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows;
//  - the element count of C is below TDMATDMATMULT_THRESHOLD.
542 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
543 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
544 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
545 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
546 selectSmallAssignKernel( C, A, B );
// NOTE(review): the `else` introducing the BLAS path is not visible in this excerpt.
548 selectBlasAssignKernel( C, A, B );
// Default kernel for C = A * B, enabled when C is row-major and neither operand
// type is diagonal. Works row by row on C.
// NOTE(review): several lines of the body (braces, else-branches, some bounds)
// are not visible in this excerpt.
567 template<
typename MT3
570 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
571 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N.
573 const size_t M( A.rows() );
574 const size_t N( B.columns() );
575 const size_t K( A.columns() );
579 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed when A is (strictly) upper or lower.
581 const size_t kbegin( ( IsUpper_v<MT4> )
582 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
584 const size_t kend( ( IsLower_v<MT4> )
585 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty k range: the whole row is visited.
589 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
590 for(
size_t j=0UL; j<N; ++j ) {
// Column range for the first k (= kbegin), narrowed by the structure of B and by
// the UPP/LOW flags of the result.
597 const size_t jbegin( ( IsUpper_v<MT5> )
598 ?( ( IsStrictlyUpper_v<MT5> )
599 ?(
UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
600 :(
UPP ?
max(i,kbegin) : kbegin ) )
601 :(
UPP ? i : 0UL ) );
602 const size_t jend( ( IsLower_v<MT5> )
603 ?( ( IsStrictlyLower_v<MT5> )
604 ?(
LOW ?
min(i+1UL,kbegin) : kbegin )
605 :(
LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
606 :(
LOW ? i+1UL : N ) );
// Columns before jbegin are handled separately from the product range.
608 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
609 for(
size_t j=0UL; j<jbegin; ++j ) {
613 else if( IsStrictlyUpper_v<MT5> ) {
// First product term is assigned (not accumulated) into row i.
616 for(
size_t j=jbegin; j<jend; ++j ) {
617 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onward are handled separately from the product range.
619 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
620 for(
size_t j=jend; j<N; ++j ) {
624 else if( IsStrictlyLower_v<MT5> ) {
// Remaining k values are accumulated on top of the first term.
629 for(
size_t k=kbegin+1UL; k<kend; ++k )
631 const size_t jbegin( ( IsUpper_v<MT5> )
632 ?( ( IsStrictlyUpper_v<MT5> )
636 const size_t jend( ( IsLower_v<MT5> )
637 ?( ( IsStrictlyLower_v<MT5> )
638 ?(
LOW ?
min(i+1UL,k-1UL) : k-1UL )
639 :(
LOW ?
min(i+1UL,k) : k ) )
640 :(
LOW ? i+1UL : N ) );
// With any structure flag set the column range can be inverted; skip this k then.
642 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
645 for(
size_t j=jbegin; j<jend; ++j ) {
646 C(i,j) += A(i,k) * B(k,j);
// For lower B the element at column jend is assigned rather than accumulated.
648 if( IsLower_v<MT5> ) {
649 C(i,jend) = A(i,k) * B(k,jend);
// Copy pass: each element below the diagonal is taken from its transposed
// position, conjugated when HERM is set.
// NOTE(review): the condition guarding this pass is not visible in this excerpt.
655 for(
size_t i=1UL; i<M; ++i ) {
656 for(
size_t j=0UL; j<i; ++j ) {
657 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for C = A * B, enabled when C is column-major and neither
// operand type is diagonal. Works column by column on C.
// NOTE(review): several lines of the body (braces, else-branches, some bounds)
// are not visible in this excerpt.
679 template<
typename MT3
682 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
683 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N.
685 const size_t M( A.rows() );
686 const size_t N( B.columns() );
687 const size_t K( A.columns() );
691 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed when B is (strictly) lower or upper.
693 const size_t kbegin( ( IsLower_v<MT5> )
694 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
696 const size_t kend( ( IsUpper_v<MT5> )
697 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Strictly triangular B with an empty k range: the whole column is visited.
701 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
702 for(
size_t i=0UL; i<M; ++i ) {
// Row range for the first k (= kbegin), narrowed by the structure of A and by
// the LOW/UPP flags of the result.
709 const size_t ibegin( ( IsLower_v<MT4> )
710 ?( ( IsStrictlyLower_v<MT4> )
711 ?(
LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
712 :(
LOW ?
max(j,kbegin) : kbegin ) )
713 :(
LOW ? j : 0UL ) );
714 const size_t iend( ( IsUpper_v<MT4> )
715 ?( ( IsStrictlyUpper_v<MT4> )
716 ?(
UPP ?
min(j+1UL,kbegin) : kbegin )
717 :(
UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
718 :(
UPP ? j+1UL : M ) );
// Rows before ibegin are handled separately from the product range.
720 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
721 for(
size_t i=0UL; i<ibegin; ++i ) {
725 else if( IsStrictlyLower_v<MT4> ) {
// First product term is assigned (not accumulated) into column j.
728 for(
size_t i=ibegin; i<iend; ++i ) {
729 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend onward are handled separately from the product range.
731 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
732 for(
size_t i=iend; i<M; ++i ) {
736 else if( IsStrictlyUpper_v<MT4> ) {
// Remaining k values are accumulated on top of the first term.
741 for(
size_t k=kbegin+1UL; k<kend; ++k )
743 const size_t ibegin( ( IsLower_v<MT4> )
744 ?( ( IsStrictlyLower_v<MT4> )
748 const size_t iend( ( IsUpper_v<MT4> )
749 ?( ( IsStrictlyUpper_v<MT4> )
750 ?(
UPP ?
min(j+1UL,k-1UL) : k-1UL )
751 :(
UPP ?
min(j+1UL,k) : k ) )
752 :(
UPP ? j+1UL : M ) );
// With any structure flag set the row range can be inverted; skip this k then.
754 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
757 for(
size_t i=ibegin; i<iend; ++i ) {
758 C(i,j) += A(i,k) * B(k,j);
// For upper A the element at row iend is assigned rather than accumulated.
760 if( IsUpper_v<MT4> ) {
761 C(iend,j) = A(iend,k) * B(k,j);
// Copy pass: each element above the diagonal is taken from its transposed
// position, conjugated when HERM is set.
// NOTE(review): the condition guarding this pass is not visible in this excerpt.
767 for(
size_t j=1UL; j<N; ++j ) {
768 for(
size_t i=0UL; i<j; ++i ) {
769 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for C = A * B, enabled when C is row-major, A is not diagonal
// and B is diagonal. Each result element is a single product A(i,j) * B(j,j).
// NOTE(review): braces and the bodies of the fill loops are not visible here.
791 template<
typename MT3
794 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
795 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// The traversal is tiled in blocks of BLOCK_SIZE rows and columns.
797 constexpr size_t block( BLOCK_SIZE );
799 const size_t M( A.rows() );
800 const size_t N( B.columns() );
802 for(
size_t ii=0UL; ii<M; ii+=block ) {
803 const size_t iend(
min( M, ii+block ) );
804 for(
size_t jj=0UL; jj<N; jj+=block ) {
805 const size_t jend(
min( N, jj+block ) );
806 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the current tile, clipped by the structure of A.
808 const size_t jbegin( ( IsUpper_v<MT4> )
809 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
811 const size_t jpos( ( IsLower_v<MT4> )
812 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
// Upper A: columns of the tile before jbegin are handled separately.
815 if( IsUpper_v<MT4> ) {
816 for(
size_t j=jj; j<jbegin; ++j ) {
// Scale column j of A by the j-th diagonal element of B.
820 for(
size_t j=jbegin; j<jpos; ++j ) {
821 C(i,j) = A(i,j) * B(j,j);
// Lower A: columns of the tile from jpos onward are handled separately.
823 if( IsLower_v<MT4> ) {
824 for(
size_t j=jpos; j<jend; ++j ) {
// Default kernel for C = A * B, enabled when C is column-major, A is not
// diagonal and B is diagonal. Each result element is a single product
// A(i,j) * B(j,j).
// NOTE(review): braces and the bodies of the fill loops are not visible here.
849 template<
typename MT3
852 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
853 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
855 const size_t M( A.rows() );
856 const size_t N( B.columns() );
858 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, clipped by the structure of A.
860 const size_t ibegin( ( IsLower_v<MT4> )
861 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
863 const size_t iend( ( IsUpper_v<MT4> )
864 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Lower A: rows before ibegin are handled separately.
868 if( IsLower_v<MT4> ) {
869 for(
size_t i=0UL; i<ibegin; ++i ) {
// Scale column j of A by the j-th diagonal element of B.
873 for(
size_t i=ibegin; i<iend; ++i ) {
874 C(i,j) = A(i,j) * B(j,j);
// Upper A: rows from iend onward are handled separately.
876 if( IsUpper_v<MT4> ) {
877 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for C = A * B, enabled when C is row-major, A is diagonal and
// B is not diagonal. Each result element is a single product A(i,i) * B(i,j).
// NOTE(review): braces and the bodies of the fill loops are not visible here.
900 template<
typename MT3
903 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
904 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
906 const size_t M( A.rows() );
907 const size_t N( B.columns() );
909 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, clipped by the structure of B.
911 const size_t jbegin( ( IsUpper_v<MT5> )
912 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
914 const size_t jend( ( IsLower_v<MT5> )
915 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Upper B: columns before jbegin are handled separately.
919 if( IsUpper_v<MT5> ) {
920 for(
size_t j=0UL; j<jbegin; ++j ) {
// Scale row i of B by the i-th diagonal element of A.
924 for(
size_t j=jbegin; j<jend; ++j ) {
925 C(i,j) = A(i,i) * B(i,j);
// Lower B: columns from jend onward are handled separately.
927 if( IsLower_v<MT5> ) {
928 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = A * B, enabled when C is column-major, A is diagonal
// and B is not diagonal. Each result element is a single product
// A(i,i) * B(i,j).
// NOTE(review): braces and the bodies of the fill loops are not visible here.
951 template<
typename MT3
954 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
955 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// The traversal is tiled in blocks of BLOCK_SIZE columns and rows.
957 constexpr size_t block( BLOCK_SIZE );
959 const size_t M( A.rows() );
960 const size_t N( B.columns() );
962 for(
size_t jj=0UL; jj<N; jj+=block ) {
963 const size_t jend(
min( N, jj+block ) );
964 for(
size_t ii=0UL; ii<M; ii+=block ) {
965 const size_t iend(
min( M, ii+block ) );
966 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the current tile, clipped by the structure of B.
968 const size_t ibegin( ( IsLower_v<MT5> )
969 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
971 const size_t ipos( ( IsUpper_v<MT5> )
972 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
// Lower B: rows of the tile before ibegin are handled separately.
975 if( IsLower_v<MT5> ) {
976 for(
size_t i=ii; i<ibegin; ++i ) {
// Scale row i of B by the i-th diagonal element of A.
980 for(
size_t i=ibegin; i<ipos; ++i ) {
981 C(i,j) = A(i,i) * B(i,j);
// Upper B: rows of the tile from ipos onward are handled separately.
983 if( IsUpper_v<MT5> ) {
984 for(
size_t i=ipos; i<iend; ++i ) {
// Default kernel for C = A * B, enabled when both operand types are diagonal.
// NOTE(review): any statements before the loop are not visible in this excerpt.
1009 template<
typename MT3
1012 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1013 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Only the diagonal is written: each entry is the product of the two diagonal entries.
1017 for(
size_t i=0UL; i<A.rows(); ++i ) {
1018 C(i,i) = A(i,i) * B(i,i);
// Small-matrix kernel overload used when the vectorized default kernel is not
// applicable to the three types; it forwards to the default kernel.
1038 template<
typename MT3
1041 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1042 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1044 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix kernel for C = A * B, enabled when C is row-major and
// the vectorized default kernel applies to the three types.
// The visible code processes C in column panels of decreasing width
// (8, 5, 4, 3, 2 and 1 SIMD vectors), followed by a scalar pass. Inside a panel
// it handles several rows at once where possible. B is read with SIMD loads,
// elements of A are broadcast with set(), and results are written with SIMD stores.
// NOTE(review): a large part of the body (panel loops over j, accumulator
// declarations, braces, conditions) is not visible in this excerpt; the
// comments below describe only what the visible lines show.
1064 template<
typename MT3
1067 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1068 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass is needed unless both C and B are padded.
1070 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
// M x K times K x N.
1072 const size_t M( A.rows() );
1073 const size_t N( B.columns() );
1074 const size_t K( A.columns() );
// Integral element types take a dedicated branch (body not visible here).
1083 if( IsIntegral_v<ElementType> )
// ---- Panel of 8 SIMD vectors, one row at a time ----
1086 for(
size_t i=0UL; i<M; ++i )
// k range for row i: narrowed by upper/lower structure of A and B; for upper B
// it is capped at the end of the panel (j+SIMDSIZE*8) and at K.
1088 const size_t kbegin( ( IsUpper_v<MT4> )
1089 ?( ( IsLower_v<MT5> )
1090 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1091 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1092 :( IsLower_v<MT5> ? j : 0UL ) );
1093 const size_t kend( ( IsLower_v<MT4> )
1094 ?( ( IsUpper_v<MT5> )
1095 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
1096 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1097 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// The first k initializes the accumulators; the loop adds the remaining k.
1104 SIMDType xmm1( a1 * B.load(k,j ) );
1113 for( ++k; k<kend; ++k ) {
1115 xmm1 += a1 * B.load(k,j );
1116 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1117 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1118 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1119 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1120 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
1121 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
1122 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write the accumulated panel back to row i of C.
1125 C.store( i, j , xmm1 );
1127 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1128 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1129 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
1130 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
1131 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
1132 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Alternative path: the panel is filled with `zero` instead.
1137 C.store( i, j ,
zero );
// ---- Panel of 5 SIMD vectors, two rows at a time ----
1154 for( ; (i+2UL) <= M; i+=2UL )
1156 const size_t kbegin( ( IsUpper_v<MT4> )
1157 ?( ( IsLower_v<MT5> )
1158 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1159 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1160 :( IsLower_v<MT5> ? j : 0UL ) );
1161 const size_t kend( ( IsLower_v<MT4> )
1162 ?( ( IsUpper_v<MT5> )
1163 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
1164 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1165 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// Broadcast the two A elements of the current k.
1189 for( ++k; k<kend; ++k ) {
1190 a1 =
set( A(i ,k) );
1191 a2 =
set( A(i+1UL,k) );
// xmm1..xmm5 hold row i, xmm6..xmm10 hold row i+1.
1209 C.store( i , j , xmm1 );
1211 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1212 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1213 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
1214 C.store( i+1UL, j , xmm6 );
1215 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
1216 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
1217 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
1218 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Alternative path: both rows of the panel are filled with `zero`.
1223 C.store( i , j ,
zero );
1228 C.store( i+1UL, j ,
zero );
// ---- Panel of 5 SIMD vectors, single row ----
1238 const size_t kbegin( ( IsUpper_v<MT4> )
1239 ?( ( IsLower_v<MT5> )
1240 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1241 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1242 :( IsLower_v<MT5> ? j : 0UL ) );
1243 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
1250 SIMDType xmm1( a1 * B.load(k,j ) );
1256 for( ++k; k<kend; ++k ) {
1258 xmm1 += a1 * B.load(k,j );
1259 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1260 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1261 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1262 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1265 C.store( i, j , xmm1 );
1267 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1268 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1269 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
1274 C.store( i, j ,
zero );
// ---- Panel of 4 SIMD vectors ----
// Part of the panel is taken from the transposed position of C (conjugated when
// HERM is set) instead of being computed.
1291 for(
size_t jj=j; jj<jjend; ++jj ) {
1292 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
// NOTE(review): the body of the following element-wise loop is not visible.
1299 for(
size_t jj=j; jj<jjend; ++jj ) {
// Two rows at a time.
1305 for( ; (i+2UL) <= iend; i+=2UL )
1307 const size_t kbegin( ( IsUpper_v<MT4> )
1308 ?( ( IsLower_v<MT5> )
1309 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1310 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1311 :( IsLower_v<MT5> ? j : 0UL ) );
1312 const size_t kend( ( IsLower_v<MT4> )
1313 ?( ( IsUpper_v<MT5> )
1314 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
1315 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1316 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
1337 for( ++k; k<kend; ++k ) {
1338 a1 =
set( A(i ,k) );
1339 a2 =
set( A(i+1UL,k) );
// xmm1..xmm4 hold row i, xmm5..xmm8 hold row i+1.
1354 C.store( i , j , xmm1 );
1356 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1357 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1358 C.store( i+1UL, j , xmm5 );
1359 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
1360 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
1361 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
1366 C.store( i , j ,
zero );
1370 C.store( i+1UL, j ,
zero );
// Single remaining row of the 4-vector panel.
1379 const size_t kbegin( ( IsUpper_v<MT4> )
1380 ?( ( IsLower_v<MT5> )
1381 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1382 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1383 :( IsLower_v<MT5> ? j : 0UL ) );
1384 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
1391 SIMDType xmm1( a1 * B.load(k,j ) );
1396 for( ++k; k<kend; ++k ) {
1398 xmm1 += a1 * B.load(k,j );
1399 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1400 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1401 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1404 C.store( i, j , xmm1 );
1406 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1407 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1412 C.store( i, j ,
zero );
// NOTE(review): the body of the following element-wise loop is not visible.
1424 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Panel of 3 SIMD vectors ----
// Same layout as the 4-vector panel: transposed copy, then two rows at a time,
// then a single row.
1439 for(
size_t jj=j; jj<jjend; ++jj ) {
1440 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1447 for(
size_t jj=j; jj<jjend; ++jj ) {
1453 for( ; (i+2UL) <= iend; i+=2UL )
1455 const size_t kbegin( ( IsUpper_v<MT4> )
1456 ?( ( IsLower_v<MT5> )
1457 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1458 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1459 :( IsLower_v<MT5> ? j : 0UL ) );
1460 const size_t kend( ( IsLower_v<MT4> )
1461 ?( ( IsUpper_v<MT5> )
1462 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
1463 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1464 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
1482 for( ++k; k<kend; ++k ) {
1483 a1 =
set( A(i ,k) );
1484 a2 =
set( A(i+1UL,k) );
// xmm1..xmm3 hold row i, xmm4..xmm6 hold row i+1.
1496 C.store( i , j , xmm1 );
1498 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1499 C.store( i+1UL, j , xmm4 );
1500 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
1501 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
1506 C.store( i , j ,
zero );
1509 C.store( i+1UL, j ,
zero );
// Single remaining row of the 3-vector panel.
1517 const size_t kbegin( ( IsUpper_v<MT4> )
1518 ?( ( IsLower_v<MT5> )
1519 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1520 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1521 :( IsLower_v<MT5> ? j : 0UL ) );
1522 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
1529 SIMDType xmm1( a1 * B.load(k,j ) );
1533 for( ++k; k<kend; ++k ) {
1535 xmm1 += a1 * B.load(k,j );
1536 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1537 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1540 C.store( i, j , xmm1 );
1542 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1547 C.store( i, j ,
zero );
// NOTE(review): the body of the following element-wise loop is not visible.
1558 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Panel of 2 SIMD vectors ----
// Transposed copy first, then rows in groups of four, three, two and one.
1573 for(
size_t jj=j; jj<jjend; ++jj ) {
1574 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1581 for(
size_t jj=j; jj<jjend; ++jj ) {
// Four rows at a time.
1587 for( ; (i+4UL) <= iend; i+=4UL )
1589 const size_t kbegin( ( IsUpper_v<MT4> )
1590 ?( ( IsLower_v<MT5> )
1591 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1592 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1593 :( IsLower_v<MT5> ? j : 0UL ) );
1594 const size_t kend( ( IsLower_v<MT4> )
1595 ?( ( IsUpper_v<MT5> )
1596 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
1597 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1598 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1619 for( ++k; k<kend; ++k ) {
1620 a1 =
set( A(i ,k) );
1621 a2 =
set( A(i+1UL,k) );
1622 a3 =
set( A(i+2UL,k) );
1623 a4 =
set( A(i+3UL,k) );
// Two accumulators per row: xmm1/2 row i, xmm3/4 row i+1, xmm5/6 row i+2, xmm7/8 row i+3.
1636 C.store( i , j , xmm1 );
1638 C.store( i+1UL, j , xmm3 );
1639 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1640 C.store( i+2UL, j , xmm5 );
1641 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1642 C.store( i+3UL, j , xmm7 );
1643 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
1648 C.store( i , j ,
zero );
1650 C.store( i+1UL, j ,
zero );
1652 C.store( i+2UL, j ,
zero );
1654 C.store( i+3UL, j ,
zero );
// Three rows at a time.
1659 for( ; (i+3UL) <= iend; i+=3UL )
1661 const size_t kbegin( ( IsUpper_v<MT4> )
1662 ?( ( IsLower_v<MT5> )
1663 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1664 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1665 :( IsLower_v<MT5> ? j : 0UL ) );
1666 const size_t kend( ( IsLower_v<MT4> )
1667 ?( ( IsUpper_v<MT5> )
1668 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
1669 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1670 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1688 for( ++k; k<kend; ++k ) {
1689 a1 =
set( A(i ,k) );
1690 a2 =
set( A(i+1UL,k) );
1691 a3 =
set( A(i+2UL,k) );
1702 C.store( i , j , xmm1 );
1704 C.store( i+1UL, j , xmm3 );
1705 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1706 C.store( i+2UL, j , xmm5 );
1707 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1712 C.store( i , j ,
zero );
1714 C.store( i+1UL, j ,
zero );
1716 C.store( i+2UL, j ,
zero );
// Two rows at a time.
1721 for( ; (i+2UL) <= iend; i+=2UL )
1723 const size_t kbegin( ( IsUpper_v<MT4> )
1724 ?( ( IsLower_v<MT5> )
1725 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1726 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1727 :( IsLower_v<MT5> ? j : 0UL ) );
1728 const size_t kend( ( IsLower_v<MT4> )
1729 ?( ( IsUpper_v<MT5> )
1730 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
1731 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1732 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1747 for( ++k; k<kend; ++k ) {
1748 a1 =
set( A(i ,k) );
1749 a2 =
set( A(i+1UL,k) );
1758 C.store( i , j , xmm1 );
1760 C.store( i+1UL, j , xmm3 );
1761 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1766 C.store( i , j ,
zero );
1768 C.store( i+1UL, j ,
zero );
// Single remaining row of the 2-vector panel.
1775 const size_t kbegin( ( IsUpper_v<MT4> )
1776 ?( ( IsLower_v<MT5> )
1777 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1778 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1779 :( IsLower_v<MT5> ? j : 0UL ) );
1780 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
1787 SIMDType xmm1( a1 * B.load(k,j ) );
1790 for( ++k; k<kend; ++k ) {
1792 xmm1 += a1 * B.load(k,j );
1796 C.store( i, j , xmm1 );
1802 C.store( i, j ,
zero );
// NOTE(review): the body of the following element-wise loop is not visible.
1812 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Panel of 1 SIMD vector ----
// Transposed copy first, then rows in groups of four, three, two and one.
1827 for(
size_t jj=j; jj<jjend; ++jj ) {
1828 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1835 for(
size_t jj=j; jj<jjend; ++jj ) {
// Four rows at a time; the B vector b1 is shared by all four rows.
1841 for( ; (i+4UL) <= iend; i+=4UL )
1843 const size_t kbegin( ( IsUpper_v<MT4> )
1844 ?( ( IsLower_v<MT5> )
1845 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1846 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1847 :( IsLower_v<MT5> ? j : 0UL ) );
1848 const size_t kend( ( IsLower_v<MT4> )
1849 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1862 for( ++k; k<kend; ++k ) {
1864 xmm1 +=
set( A(i ,k) ) * b1;
1865 xmm2 +=
set( A(i+1UL,k) ) * b1;
1866 xmm3 +=
set( A(i+2UL,k) ) * b1;
1867 xmm4 +=
set( A(i+3UL,k) ) * b1;
1870 C.store( i , j, xmm1 );
1871 C.store( i+1UL, j, xmm2 );
1872 C.store( i+2UL, j, xmm3 );
1873 C.store( i+3UL, j, xmm4 );
1878 C.store( i , j,
zero );
1879 C.store( i+1UL, j,
zero );
1880 C.store( i+2UL, j,
zero );
1881 C.store( i+3UL, j,
zero );
// Three rows at a time.
1885 for( ; (i+3UL) <= iend; i+=3UL )
1887 const size_t kbegin( ( IsUpper_v<MT4> )
1888 ?( ( IsLower_v<MT5> )
1889 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1890 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1891 :( IsLower_v<MT5> ? j : 0UL ) );
1892 const size_t kend( ( IsLower_v<MT4> )
1893 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1905 for( ++k; k<kend; ++k ) {
1907 xmm1 +=
set( A(i ,k) ) * b1;
1908 xmm2 +=
set( A(i+1UL,k) ) * b1;
1909 xmm3 +=
set( A(i+2UL,k) ) * b1;
1912 C.store( i , j, xmm1 );
1913 C.store( i+1UL, j, xmm2 );
1914 C.store( i+2UL, j, xmm3 );
// Two rows at a time.
1924 for( ; (i+2UL) <= iend; i+=2UL )
1926 const size_t kbegin( ( IsUpper_v<MT4> )
1927 ?( ( IsLower_v<MT5> )
1928 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1929 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1930 :( IsLower_v<MT5> ? j : 0UL ) );
1931 const size_t kend( ( IsLower_v<MT4> )
1932 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1943 for( ++k; k<kend; ++k ) {
1945 xmm1 +=
set( A(i ,k) ) * b1;
1946 xmm2 +=
set( A(i+1UL,k) ) * b1;
1949 C.store( i , j, xmm1 );
1950 C.store( i+1UL, j, xmm2 );
1955 C.store( i , j,
zero );
1956 C.store( i+1UL, j,
zero );
// Single remaining row; here the k loop runs up to K.
1962 const size_t kbegin( ( IsUpper_v<MT4> )
1963 ?( ( IsLower_v<MT5> )
1964 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1965 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1966 :( IsLower_v<MT5> ? j : 0UL ) );
1974 for( ++k; k<K; ++k ) {
1975 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1978 C.store( i, j, xmm1 );
1983 C.store( i, j,
zero );
// NOTE(review): the body of the following element-wise loop is not visible.
1992 for(
size_t jj=j; jj<jjend; ++jj ) {
// ---- Scalar pass over the remaining columns ----
// Runs only when `remainder` is true, i.e. when C or B is not padded.
1999 for( ; remainder && j<N; ++j )
// Element taken from the transposed position of C, conjugated when HERM is set.
2005 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Two rows at a time with scalar accumulators value1/value2.
2014 for( ; (i+2UL) <= M; i+=2UL )
2016 const size_t kbegin( ( IsUpper_v<MT4> )
2017 ?( ( IsLower_v<MT5> )
2018 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2019 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2020 :( IsLower_v<MT5> ? j : 0UL ) );
2021 const size_t kend( ( IsLower_v<MT4> )
2022 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2032 for( ++k; k<kend; ++k ) {
2033 value1 += A(i ,k) * B(k,j);
2034 value2 += A(i+1UL,k) * B(k,j);
2038 C(i+1UL,j) = value2;
// Alternative path: the element is reset instead of computed.
2043 reset( C(i+1UL,j) );
// Single remaining row with one scalar accumulator; the k loop runs up to K.
2049 const size_t kbegin( ( IsUpper_v<MT4> )
2050 ?( ( IsLower_v<MT5> )
2051 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2052 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2053 :( IsLower_v<MT5> ? j : 0UL ) );
2061 for( ++k; k<K; ++k ) {
2062 value += A(i,k) * B(k,j);
2092 template<
typename MT3
2095 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2096 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2098 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2100 const size_t M( A.rows() );
2101 const size_t N( B.columns() );
2102 const size_t K( A.columns() );
2111 if( IsIntegral_v<ElementType> )
2114 for(
size_t j=0UL; j<N; ++j )
2116 const size_t kbegin( ( IsLower_v<MT5> )
2117 ?( ( IsUpper_v<MT4> )
2118 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2119 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2120 :( IsUpper_v<MT4> ? i : 0UL ) );
2121 const size_t kend( ( IsUpper_v<MT5> )
2122 ?( ( IsLower_v<MT4> )
2123 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2124 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2125 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
2132 SIMDType xmm1( A.load(i ,k) * b1 );
2141 for( ++k; k<kend; ++k ) {
2143 xmm1 += A.load(i ,k) * b1;
2144 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2145 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2146 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2147 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2148 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
2149 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
2150 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
2153 C.store( i , j, xmm1 );
2155 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2156 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2157 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
2158 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
2159 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
2160 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
2165 C.store( i , j,
zero );
2182 for( ; (j+2UL) <= N; j+=2UL )
2184 const size_t kbegin( ( IsLower_v<MT5> )
2185 ?( ( IsUpper_v<MT4> )
2186 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2187 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2188 :( IsUpper_v<MT4> ? i : 0UL ) );
2189 const size_t kend( ( IsUpper_v<MT5> )
2190 ?( ( IsLower_v<MT4> )
2191 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2192 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2193 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
2217 for( ++k; k<kend; ++k ) {
2223 b1 =
set( B(k,j ) );
2224 b2 =
set( B(k,j+1UL) );
2237 C.store( i , j , xmm1 );
2239 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2240 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2241 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
2242 C.store( i , j+1UL, xmm6 );
2243 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
2244 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
2245 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
2246 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
2251 C.store( i , j ,
zero );
2256 C.store( i , j+1UL,
zero );
2266 const size_t kbegin( ( IsLower_v<MT5> )
2267 ?( ( IsUpper_v<MT4> )
2268 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2269 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2270 :( IsUpper_v<MT4> ? i : 0UL ) );
2271 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
2278 SIMDType xmm1( A.load(i ,k) * b1 );
2284 for( ++k; k<kend; ++k ) {
2286 xmm1 += A.load(i ,k) * b1;
2287 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2288 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2289 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2290 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2293 C.store( i , j, xmm1 );
2295 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2296 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2297 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
2302 C.store( i , j,
zero );
2319 for(
size_t ii=i; ii<iiend; ++ii ) {
2320 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2327 for(
size_t ii=i; ii<iiend; ++ii ) {
2333 for( ; (j+2UL) <= jend; j+=2UL )
2335 const size_t kbegin( ( IsLower_v<MT5> )
2336 ?( ( IsUpper_v<MT4> )
2337 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2338 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2339 :( IsUpper_v<MT4> ? i : 0UL ) );
2340 const size_t kend( ( IsUpper_v<MT5> )
2341 ?( ( IsLower_v<MT4> )
2342 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2343 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2344 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
2365 for( ++k; k<kend; ++k ) {
2370 b1 =
set( B(k,j ) );
2371 b2 =
set( B(k,j+1UL) );
2382 C.store( i , j , xmm1 );
2384 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2385 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2386 C.store( i , j+1UL, xmm5 );
2387 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
2388 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
2389 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
2394 C.store( i , j ,
zero );
2398 C.store( i , j+1UL,
zero );
2407 const size_t kbegin( ( IsLower_v<MT5> )
2408 ?( ( IsUpper_v<MT4> )
2409 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2410 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2411 :( IsUpper_v<MT4> ? i : 0UL ) );
2412 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
2419 SIMDType xmm1( A.load(i ,k) * b1 );
2424 for( ++k; k<kend; ++k ) {
2426 xmm1 += A.load(i ,k) * b1;
2427 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2428 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2429 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2432 C.store( i , j, xmm1 );
2434 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2435 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2440 C.store( i , j,
zero );
2452 for(
size_t ii=i; ii<iiend; ++ii ) {
2467 for(
size_t ii=i; ii<iiend; ++ii ) {
2468 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2475 for(
size_t ii=i; ii<iiend; ++ii ) {
2481 for( ; (j+2UL) <= jend; j+=2UL )
2483 const size_t kbegin( ( IsLower_v<MT5> )
2484 ?( ( IsUpper_v<MT4> )
2485 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2486 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2487 :( IsUpper_v<MT4> ? i : 0UL ) );
2488 const size_t kend( ( IsUpper_v<MT5> )
2489 ?( ( IsLower_v<MT4> )
2490 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2491 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2492 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
2510 for( ++k; k<kend; ++k ) {
2514 b1 =
set( B(k,j ) );
2515 b2 =
set( B(k,j+1UL) );
2524 C.store( i , j , xmm1 );
2526 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2527 C.store( i , j+1UL, xmm4 );
2528 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
2529 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
2534 C.store( i , j ,
zero );
2537 C.store( i , j+1UL,
zero );
2545 const size_t kbegin( ( IsLower_v<MT5> )
2546 ?( ( IsUpper_v<MT4> )
2547 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2548 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2549 :( IsUpper_v<MT4> ? i : 0UL ) );
2550 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
2557 SIMDType xmm1( A.load(i ,k) * b1 );
2561 for( ++k; k<kend; ++k ) {
2563 xmm1 += A.load(i ,k) * b1;
2564 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2565 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2568 C.store( i , j, xmm1 );
2570 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2575 C.store( i , j,
zero );
2586 for(
size_t ii=i; ii<iiend; ++ii ) {
2601 for(
size_t ii=i; ii<iiend; ++ii ) {
2602 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2609 for(
size_t ii=i; ii<iiend; ++ii ) {
2615 for( ; (j+4UL) <= jend; j+=4UL )
2617 const size_t kbegin( ( IsLower_v<MT5> )
2618 ?( ( IsUpper_v<MT4> )
2619 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2620 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2621 :( IsUpper_v<MT4> ? i : 0UL ) );
2622 const size_t kend( ( IsUpper_v<MT5> )
2623 ?( ( IsLower_v<MT4> )
2624 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2625 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2626 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2647 for( ++k; k<kend; ++k ) {
2650 b1 =
set( B(k,j ) );
2651 b2 =
set( B(k,j+1UL) );
2652 b3 =
set( B(k,j+2UL) );
2653 b4 =
set( B(k,j+3UL) );
2664 C.store( i , j , xmm1 );
2666 C.store( i , j+1UL, xmm3 );
2667 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2668 C.store( i , j+2UL, xmm5 );
2669 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2670 C.store( i , j+3UL, xmm7 );
2671 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
2676 C.store( i , j ,
zero );
2678 C.store( i , j+1UL,
zero );
2680 C.store( i , j+2UL,
zero );
2682 C.store( i , j+3UL,
zero );
2687 for( ; (j+3UL) <= jend; j+=3UL )
2689 const size_t kbegin( ( IsLower_v<MT5> )
2690 ?( ( IsUpper_v<MT4> )
2691 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2692 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2693 :( IsUpper_v<MT4> ? i : 0UL ) );
2694 const size_t kend( ( IsUpper_v<MT5> )
2695 ?( ( IsLower_v<MT4> )
2696 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2697 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2698 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2716 for( ++k; k<kend; ++k ) {
2719 b1 =
set( B(k,j ) );
2720 b2 =
set( B(k,j+1UL) );
2721 b3 =
set( B(k,j+2UL) );
2730 C.store( i , j , xmm1 );
2732 C.store( i , j+1UL, xmm3 );
2733 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2734 C.store( i , j+2UL, xmm5 );
2735 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2740 C.store( i , j ,
zero );
2742 C.store( i , j+1UL,
zero );
2744 C.store( i , j+2UL,
zero );
2749 for( ; (j+2UL) <= jend; j+=2UL )
2751 const size_t kbegin( ( IsLower_v<MT5> )
2752 ?( ( IsUpper_v<MT4> )
2753 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2754 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2755 :( IsUpper_v<MT4> ? i : 0UL ) );
2756 const size_t kend( ( IsUpper_v<MT5> )
2757 ?( ( IsLower_v<MT4> )
2758 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2759 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2760 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2775 for( ++k; k<kend; ++k ) {
2778 b1 =
set( B(k,j ) );
2779 b2 =
set( B(k,j+1UL) );
2786 C.store( i , j , xmm1 );
2788 C.store( i , j+1UL, xmm3 );
2789 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2794 C.store( i , j ,
zero );
2796 C.store( i , j+1UL,
zero );
2803 const size_t kbegin( ( IsLower_v<MT5> )
2804 ?( ( IsUpper_v<MT4> )
2805 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2806 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2807 :( IsUpper_v<MT4> ? i : 0UL ) );
2808 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
2815 SIMDType xmm1( A.load(i ,k) * b1 );
2818 for( ++k; k<kend; ++k ) {
2820 xmm1 += A.load(i ,k) * b1;
2824 C.store( i , j, xmm1 );
2830 C.store( i , j,
zero );
2840 for(
size_t ii=i; ii<iiend; ++ii ) {
2855 for(
size_t ii=i; ii<iiend; ++ii ) {
2856 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
2863 for(
size_t ii=i; ii<iiend; ++ii ) {
2869 for( ; (j+4UL) <= jend; j+=4UL )
2871 const size_t kbegin( ( IsLower_v<MT5> )
2872 ?( ( IsUpper_v<MT4> )
2873 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2874 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2875 :( IsUpper_v<MT4> ? i : 0UL ) );
2876 const size_t kend( ( IsUpper_v<MT5> )
2877 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2890 for( ++k; k<kend; ++k ) {
2892 xmm1 += a1 *
set( B(k,j ) );
2893 xmm2 += a1 *
set( B(k,j+1UL) );
2894 xmm3 += a1 *
set( B(k,j+2UL) );
2895 xmm4 += a1 *
set( B(k,j+3UL) );
2898 C.store( i, j , xmm1 );
2899 C.store( i, j+1UL, xmm2 );
2900 C.store( i, j+2UL, xmm3 );
2901 C.store( i, j+3UL, xmm4 );
2906 C.store( i, j ,
zero );
2907 C.store( i, j+1UL,
zero );
2908 C.store( i, j+2UL,
zero );
2909 C.store( i, j+3UL,
zero );
2913 for( ; (j+3UL) <= jend; j+=3UL )
2915 const size_t kbegin( ( IsLower_v<MT5> )
2916 ?( ( IsUpper_v<MT4> )
2917 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2918 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2919 :( IsUpper_v<MT4> ? i : 0UL ) );
2920 const size_t kend( ( IsUpper_v<MT5> )
2921 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2933 for( ++k; k<kend; ++k ) {
2935 xmm1 += a1 *
set( B(k,j ) );
2936 xmm2 += a1 *
set( B(k,j+1UL) );
2937 xmm3 += a1 *
set( B(k,j+2UL) );
2940 C.store( i, j , xmm1 );
2941 C.store( i, j+1UL, xmm2 );
2942 C.store( i, j+2UL, xmm3 );
2947 C.store( i, j ,
zero );
2948 C.store( i, j+1UL,
zero );
2949 C.store( i, j+2UL,
zero );
2953 for( ; (j+2UL) <= jend; j+=2UL )
2955 const size_t kbegin( ( IsLower_v<MT5> )
2956 ?( ( IsUpper_v<MT4> )
2957 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2958 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2959 :( IsUpper_v<MT4> ? i : 0UL ) );
2960 const size_t kend( ( IsUpper_v<MT5> )
2961 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2972 for( ++k; k<kend; ++k ) {
2974 xmm1 += a1 *
set( B(k,j ) );
2975 xmm2 += a1 *
set( B(k,j+1UL) );
2978 C.store( i, j , xmm1 );
2979 C.store( i, j+1UL, xmm2 );
2984 C.store( i, j ,
zero );
2985 C.store( i, j+1UL,
zero );
2991 const size_t kbegin( ( IsLower_v<MT5> )
2992 ?( ( IsUpper_v<MT4> )
2993 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2994 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2995 :( IsUpper_v<MT4> ? i : 0UL ) );
3003 for( ++k; k<K; ++k ) {
3004 xmm1 += A.load(i,k) *
set( B(k,j) );
3007 C.store( i, j, xmm1 );
3012 C.store( i, j,
zero );
3021 for(
size_t ii=i; ii<iiend; ++ii ) {
3028 for( ; remainder && i<M; ++i )
3034 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
3043 for( ; (j+2UL) <= N; j+=2UL )
3045 const size_t kbegin( ( IsLower_v<MT5> )
3046 ?( ( IsUpper_v<MT4> )
3047 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3048 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3049 :( IsUpper_v<MT4> ? i : 0UL ) );
3050 const size_t kend( ( IsUpper_v<MT5> )
3051 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3061 for( ++k; k<kend; ++k ) {
3062 value1 += A(i,k) * B(k,j );
3063 value2 += A(i,k) * B(k,j+1UL);
3067 C(i,j+1UL) = value2;
3072 reset( C(i,j+1UL) );
3078 const size_t kbegin( ( IsLower_v<MT5> )
3079 ?( ( IsUpper_v<MT4> )
3080 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3081 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3082 :( IsUpper_v<MT4> ? i : 0UL ) );
3090 for( ++k; k<K; ++k ) {
3091 value += A(i,k) * B(k,j);
// Large-matrix assignment kernel, fallback overload.
// Selected (DisableIf_t) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
//   C : target matrix (written), A : left operand, B : right operand.
// Does no work of its own: forwards all three arguments to selectDefaultAssignKernel().
3120 template<
typename MT3
3123 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3124 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3126 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel, vectorized overload.
// Takes part in overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5>
// is true (EnableIf_t); same name and parameter list as the DisableIf_t overload.
//   C : target matrix, A : left operand, B : right operand.
// NOTE(review): the function body is not among the visible lines; confirm what it dispatches to.
3146 template<
typename MT3
3149 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3150 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback overload.
// Selected (DisableIf_t) when UseBlasKernel_v<MT3,MT4,MT5> is false, i.e. when the
// operand types do not qualify for the BLAS path.
//   C : target matrix, A : left operand, B : right operand.
// Forwards the call unchanged to selectLargeAssignKernel().
3180 template<
typename MT3
3183 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3184 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3186 selectLargeAssignKernel( C, A, B );
// The BLAS-backed overload below is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to non-zero.
3192#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS assignment kernel, enabled when UseBlasKernel_v<MT3,MT4,MT5> is true.
//   C : target matrix, A : left operand, B : right operand.
3206 template<
typename MT3
3209 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3210 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// The scalar factors handed to the BLAS wrappers are built from the target's element type.
3212 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm() is called on C with A applied from the left (CblasLeft)
// and factor ET(1); the lower/upper flag follows IsLower_v<MT4>.
// NOTE(review): trmm() works on C in place, so C presumably receives B on a line not shown here.
3214 if( IsTriangular_v<MT4> ) {
3216 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular right operand: the mirrored call, B applied from the right (CblasRight).
3218 else if( IsTriangular_v<MT5> ) {
3220 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// General product through gemm() with the scalar arguments ET(1) and ET(0).
// NOTE(review): presumably the final else-branch; the enclosing line is not visible.
3223 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the matrix product expression to a sparse matrix.
//   lhs : target sparse matrix, rhs : the product expression to be assigned.
// The product is not written into the sparse target directly; it is first evaluated
// into a temporary and that temporary is then assigned.
3243 template<
typename MT
3245 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Type of the temporary: ResultType or OppositeType, selected by the storage-order flag SO
// of the target (If_t picks the first alternative when SO is true).
3249 using TmpType = If_t< SO, ResultType, OppositeType >;
// Functor applied to the temporary before the final assignment.
3261 const ForwardFunctor fwd;
// serial( rhs ) is used to build the temporary
// (NOTE(review): presumably to force a non-parallel evaluation — confirm against serial()).
3263 const TmpType tmp(
serial( rhs ) );
3264 assign( *lhs, fwd( tmp ) );
// Addition assignment of the matrix product expression to a dense matrix.
//   lhs : target dense matrix, rhs : the product expression to be added.
3282 template<
typename MT
3284 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Degenerate sizes: the target has no rows or no columns, or the left operand has
// no columns (empty inner dimension).
// NOTE(review): the body of this branch is not visible; presumably an early return.
3291 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hands the operands A and B (defined on lines not shown here) to the kernel selection.
3305 TDMatDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
// Kernel selection for the addition assignment C += A * B.
//   C : target matrix, A : left operand, B : right operand.
// The small kernel is chosen when any of the following holds:
//   - both operands are diagonal matrices,
//   - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// selectBlasAddAssignKernel() is the alternative
// (NOTE(review): the 'else' line between the two calls is not visible).
3321 template<
typename MT3
3324 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3326 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
3327 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
3328 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
3329 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
3330 selectSmallAddAssignKernel( C, A, B );
3332 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition assignment kernel C += A * B for a row-major target where
// neither operand is diagonal.
//   C : target matrix, A : left operand, B : right operand.
// Loop order is i (row of C), k (inner index), j (column of C); every index range is
// narrowed using the compile-time structure flags of A and B and the LOW/UPP flags.
3351 template<
typename MT3
3354 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3355 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N: sizes taken from the operands.
3357 const size_t M( A.rows() );
3358 const size_t N( B.columns() );
3359 const size_t K( A.columns() );
3363 for(
size_t i=0UL; i<M; ++i )
// Inner-index range for row i: for an upper A it starts at i (i+1 if strictly upper),
// for a lower A it ends after i (at i if strictly lower).
// NOTE(review): the alternatives for non-triangular A are on lines not shown here.
3365 const size_t kbegin( ( IsUpper_v<MT4> )
3366 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3368 const size_t kend( ( IsLower_v<MT4> )
3369 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3373 for(
size_t k=kbegin; k<kend; ++k )
// Column range for this (i,k): the lower bound comes from an upper B (k, or k+1 if strict)
// and, when UPP is set, is additionally at least i; the upper bound comes from a lower B
// and, when LOW is set, does not exceed i+1.
3375 const size_t jbegin( ( IsUpper_v<MT5> )
3376 ?( ( IsStrictlyUpper_v<MT5> )
3377 ?(
UPP ?
max(i,k+1UL) : k+1UL )
3378 :(
UPP ?
max(i,k) : k ) )
3379 :(
UPP ? i : 0UL ) );
3380 const size_t jend( ( IsLower_v<MT5> )
3381 ?( ( IsStrictlyLower_v<MT5> )
3382 ?(
LOW ?
min(i+1UL,k) : k )
3383 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
3384 :(
LOW ? i+1UL : N ) );
// With LOW or UPP the computed range can be empty; skipping it here also keeps
// jend - jbegin below from wrapping around.
3386 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos marks the end of the part of [jbegin,jend) handled two columns at a time.
3389 const size_t jnum( jend - jbegin );
3390 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
3393 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3394 C(i,j ) += A(i,k) * B(k,j );
3395 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Leftover single column at jpos
// (NOTE(review): presumably guarded by a jpos < jend check on a line not shown).
3398 C(i,jpos) += A(i,k) * B(k,jpos);
// Default (scalar) addition assignment kernel C += A * B for a column-major target where
// neither operand is diagonal.
//   C : target matrix, A : left operand, B : right operand.
// Loop order is j (column of C), k (inner index), i (row of C); every index range is
// narrowed using the compile-time structure flags of A and B and the LOW/UPP flags.
3420 template<
typename MT3
3423 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3424 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N: sizes taken from the operands.
3426 const size_t M( A.rows() );
3427 const size_t N( B.columns() );
3428 const size_t K( A.columns() );
3432 for(
size_t j=0UL; j<N; ++j )
// Inner-index range for column j: for a lower B it starts at j (j+1 if strictly lower),
// for an upper B it ends after j (at j if strictly upper).
// NOTE(review): the alternatives for non-triangular B are on lines not shown here.
3434 const size_t kbegin( ( IsLower_v<MT5> )
3435 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3437 const size_t kend( ( IsUpper_v<MT5> )
3438 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3442 for(
size_t k=kbegin; k<kend; ++k )
// Row range for this (j,k): the lower bound comes from a lower A (k, or k+1 if strict)
// and, when LOW is set, is additionally at least j; the upper bound comes from an upper A
// and, when UPP is set, does not exceed j+1.
3444 const size_t ibegin( ( IsLower_v<MT4> )
3445 ?( ( IsStrictlyLower_v<MT4> )
3446 ?(
LOW ?
max(j,k+1UL) : k+1UL )
3447 :(
LOW ?
max(j,k) : k ) )
3448 :(
LOW ? j : 0UL ) );
3449 const size_t iend( ( IsUpper_v<MT4> )
3450 ?( ( IsStrictlyUpper_v<MT4> )
3451 ?(
UPP ?
min(j+1UL,k) : k )
3452 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
3453 :(
UPP ? j+1UL : M ) );
// With LOW or UPP the computed range can be empty; skipping it here also keeps
// iend - ibegin below from wrapping around.
3455 if( (
LOW ||
UPP ) && ibegin >= iend )
continue;
// ipos marks the end of the part of [ibegin,iend) handled two rows at a time.
3458 const size_t inum( iend - ibegin );
3459 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
3462 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3463 C(i ,j) += A(i ,k) * B(k,j);
3464 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Leftover single row at ipos
// (NOTE(review): presumably guarded by an ipos < iend check on a line not shown).
3467 C(ipos,j) += A(ipos,k) * B(k,j);
// Default addition assignment kernel C += A * B for a row-major target, a non-diagonal
// left operand A and a diagonal right operand B.
//   C : target matrix, A : left operand, B : right operand (only B(j,j) is read).
// Since B is diagonal, each element reduces to C(i,j) += A(i,j) * B(j,j). The index space
// is walked in square tiles of edge length BLOCK_SIZE.
3489 template<
typename MT3
3492 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3493 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3495 constexpr size_t block( BLOCK_SIZE );
3497 const size_t M( A.rows() );
3498 const size_t N( B.columns() );
// Tile loops: [ii,iend) x [jj,jend), both clipped to the matrix bounds.
3500 for(
size_t ii=0UL; ii<M; ii+=block ) {
3501 const size_t iend(
min( M, ii+block ) );
3502 for(
size_t jj=0UL; jj<N; jj+=block ) {
3503 const size_t jend(
min( N, jj+block ) );
3504 for(
size_t i=ii; i<iend; ++i )
// Column range inside the tile, narrowed by the triangular structure of A:
// an upper A starts at i (i+1 if strict), a lower A ends after i (at i if strict).
// NOTE(review): the alternatives for non-triangular A are on lines not shown here.
3506 const size_t jbegin( ( IsUpper_v<MT4> )
3507 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
3509 const size_t jpos( ( IsLower_v<MT4> )
3510 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
3513 for(
size_t j=jbegin; j<jpos; ++j ) {
3514 C(i,j) += A(i,j) * B(j,j);
// Default addition assignment kernel C += A * B for a column-major target, a non-diagonal
// left operand A and a diagonal right operand B.
//   C : target matrix, A : left operand, B : right operand (only B(j,j) is read).
// Column j of C is updated with column j of A scaled by B(j,j), two rows per iteration.
3537 template<
typename MT3
3540 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3541 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3543 const size_t M( A.rows() );
3544 const size_t N( B.columns() );
3546 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the triangular structure of A:
// a lower A starts at j (j+1 if strict), an upper A ends after j (at j if strict).
// NOTE(review): the alternatives for non-triangular A are on lines not shown here.
3548 const size_t ibegin( ( IsLower_v<MT4> )
3549 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3551 const size_t iend( ( IsUpper_v<MT4> )
3552 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of [ibegin,iend) handled two rows at a time.
3556 const size_t inum( iend - ibegin );
3557 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
3560 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3561 C(i ,j) += A(i ,j) * B(j,j);
3562 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
// Leftover single row at ipos
// (NOTE(review): presumably guarded by an ipos < iend check on a line not shown).
3565 C(ipos,j) += A(ipos,j) * B(j,j);
// Default addition assignment kernel C += A * B for a row-major target, a diagonal
// left operand A and a non-diagonal right operand B.
//   C : target matrix, A : left operand (only A(i,i) is read), B : right operand.
// Row i of C is updated with row i of B scaled by A(i,i), two columns per iteration.
3586 template<
typename MT3
3589 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3590 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3592 const size_t M( A.rows() );
3593 const size_t N( B.columns() );
3595 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed by the triangular structure of B:
// an upper B starts at i (i+1 if strict), a lower B ends after i (at i if strict).
// NOTE(review): the alternatives for non-triangular B are on lines not shown here.
3597 const size_t jbegin( ( IsUpper_v<MT5> )
3598 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3600 const size_t jend( ( IsLower_v<MT5> )
3601 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part of [jbegin,jend) handled two columns at a time.
3605 const size_t jnum( jend - jbegin );
3606 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
3609 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3610 C(i,j ) += A(i,i) * B(i,j );
3611 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Leftover single column at jpos
// (NOTE(review): presumably guarded by a jpos < jend check on a line not shown).
3614 C(i,jpos) += A(i,i) * B(i,jpos);
// Default addition assignment kernel C += A * B for a column-major target, a diagonal
// left operand A and a non-diagonal right operand B.
//   C : target matrix, A : left operand (only A(i,i) is read), B : right operand.
// Since A is diagonal, each element reduces to C(i,j) += A(i,i) * B(i,j). The index space
// is walked in square tiles of edge length BLOCK_SIZE, columns in the outer tile loop.
3635 template<
typename MT3
3638 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3639 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3641 constexpr size_t block( BLOCK_SIZE );
3643 const size_t M( A.rows() );
3644 const size_t N( B.columns() );
// Tile loops: [jj,jend) x [ii,iend), both clipped to the matrix bounds.
3646 for(
size_t jj=0UL; jj<N; jj+=block ) {
3647 const size_t jend(
min( N, jj+block ) );
3648 for(
size_t ii=0UL; ii<M; ii+=block ) {
3649 const size_t iend(
min( M, ii+block ) );
3650 for(
size_t j=jj; j<jend; ++j )
// Row range inside the tile, narrowed by the triangular structure of B:
// a lower B starts at j (j+1 if strict), an upper B ends after j (at j if strict).
// NOTE(review): the alternatives for non-triangular B are on lines not shown here.
3652 const size_t ibegin( ( IsLower_v<MT5> )
3653 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
3655 const size_t ipos( ( IsUpper_v<MT5> )
3656 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
3659 for(
size_t i=ibegin; i<ipos; ++i ) {
3660 C(i,j) += A(i,i) * B(i,j);
// Default addition assignment kernel C += A * B for two diagonal operands.
//   C : target matrix, A : left operand, B : right operand.
// Only the diagonal is touched: C(i,i) += A(i,i) * B(i,i) for every row index of A.
// This overload does not constrain the storage order of C.
3683 template<
typename MT3
3686 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3687 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3689 for(
size_t i=0UL; i<A.rows(); ++i ) {
3690 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel, fallback overload.
// Selected (DisableIf_t) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
//   C : target matrix, A : left operand, B : right operand.
// Does no work of its own: forwards all three arguments to selectDefaultAddAssignKernel().
3710 template<
typename MT3
3713 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3714 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3716 selectDefaultAddAssignKernel( C, A, B );
// Small-matrix addition assignment kernel C += A * B, vectorized overload for a row-major
// target (enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true).
//   C : target matrix, A : left operand, B : right operand.
// The columns of C are processed in panels of decreasing width (8, 5, 4, 3, 2 and 1 SIMD
// vectors, then single scalar columns). Inside a panel, rows are processed in groups;
// for each group the inner index range [kbegin,kend) is narrowed using the triangular
// structure of A and B, SIMD accumulators are updated with products of an element of A
// and vectors loaded from B, and the accumulators are stored back into C.
// NOTE(review): the panel loop headers, the accumulator initialisations and most of the
// multi-row update statements are on lines not visible here.
3736 template<
typename MT3
3739 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3740 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Scalar clean-up of trailing columns is only needed unless both C and B are padded;
// this flag gates the final column loop of this kernel.
3742 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3744 const size_t M( A.rows() );
3745 const size_t N( B.columns() );
3746 const size_t K( A.columns() );
// ---- Panel of 8 SIMD vectors, one row at a time -------------------------------------
// NOTE(review): this panel appears to be restricted to integral element types by the
// check below; the extent of the guarded region is not visible.
3755 if( IsIntegral_v<ElementType> )
3758 for(
size_t i=0UL; i<M; ++i )
// Inner range: lower bound from an upper A (i, or i+1 if strict) and/or a lower B (j);
// upper bound from a lower A and/or an upper B (clipped to the panel end j+SIMDSIZE*8 and K).
3760 const size_t kbegin( ( IsUpper_v<MT4> )
3761 ?( ( IsLower_v<MT5> )
3762 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3763 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3764 :( IsLower_v<MT5> ? j : 0UL ) );
3765 const size_t kend( ( IsLower_v<MT4> )
3766 ?( ( IsUpper_v<MT5> )
3767 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
3768 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3769 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// Accumulate a1 times eight consecutive SIMD vectors of row k of B.
3780 for(
size_t k=kbegin; k<kend; ++k ) {
3782 xmm1 += a1 * B.load(k,j );
3783 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3784 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3785 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3786 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
3787 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
3788 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
3789 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write the accumulators back to row i of C.
3792 C.store( i, j , xmm1 );
3794 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3795 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3796 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3797 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
3798 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
3799 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// ---- Panel of 5 SIMD vectors, two rows at a time ------------------------------------
3808 for( ; (i+2UL) <= M; i+=2UL )
// Inner range for the row pair (i, i+1); the upper bound covers both rows
// (i+2, or i+1 if A is strictly lower) and is clipped to j+SIMDSIZE*5 for an upper B.
3810 const size_t kbegin( ( IsUpper_v<MT4> )
3811 ?( ( IsLower_v<MT5> )
3812 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3813 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3814 :( IsLower_v<MT5> ? j : 0UL ) );
3815 const size_t kend( ( IsLower_v<MT4> )
3816 ?( ( IsUpper_v<MT5> )
3817 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
3818 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3819 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// Accumulators start from the current content of C (add-assign semantics);
// xmm6 holds the first vector of row i+1.
3826 SIMDType xmm6 ( C.load(i+1UL,j ) );
3832 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm5 belong to row i, xmm6..xmm10 to row i+1.
3852 C.store( i , j , xmm1 );
3854 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3855 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3856 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
3857 C.store( i+1UL, j , xmm6 );
3858 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
3859 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
3860 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
3861 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Single leftover row of the 5-vector panel.
3866 const size_t kbegin( ( IsUpper_v<MT4> )
3867 ?( ( IsLower_v<MT5> )
3868 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3869 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3870 :( IsLower_v<MT5> ? j : 0UL ) );
3871 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
3879 for(
size_t k=kbegin; k<kend; ++k ) {
3881 xmm1 += a1 * B.load(k,j );
3882 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3883 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3884 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3885 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
3888 C.store( i, j , xmm1 );
3890 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3891 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3892 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// ---- Panel of 4 SIMD vectors, two rows at a time ------------------------------------
3900 for( ; (i+2UL) <= M; i+=2UL )
3902 const size_t kbegin( ( IsUpper_v<MT4> )
3903 ?( ( IsLower_v<MT5> )
3904 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3905 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3906 :( IsLower_v<MT5> ? j : 0UL ) );
3907 const size_t kend( ( IsLower_v<MT4> )
3908 ?( ( IsUpper_v<MT5> )
3909 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
3910 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3911 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
3922 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1.
3939 C.store( i , j , xmm1 );
3941 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3942 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3943 C.store( i+1UL, j , xmm5 );
3944 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
3945 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
3946 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single leftover row of the 4-vector panel.
3951 const size_t kbegin( ( IsUpper_v<MT4> )
3952 ?( ( IsLower_v<MT5> )
3953 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3954 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3955 :( IsLower_v<MT5> ? j : 0UL ) );
3956 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
3963 for(
size_t k=kbegin; k<kend; ++k ) {
3965 xmm1 += a1 * B.load(k,j );
3966 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
3967 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
3968 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
3971 C.store( i, j , xmm1 );
3973 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3974 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// ---- Panel of 3 SIMD vectors, two rows at a time ------------------------------------
3982 for( ; (i+2UL) <= M; i+=2UL )
3984 const size_t kbegin( ( IsUpper_v<MT4> )
3985 ?( ( IsLower_v<MT5> )
3986 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3987 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3988 :( IsLower_v<MT5> ? j : 0UL ) );
3989 const size_t kend( ( IsLower_v<MT4> )
3990 ?( ( IsUpper_v<MT5> )
3991 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
3992 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3993 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
4002 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm3 belong to row i, xmm4..xmm6 to row i+1.
4016 C.store( i , j , xmm1 );
4018 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
4019 C.store( i+1UL, j , xmm4 );
4020 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
4021 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single leftover row of the 3-vector panel.
4026 const size_t kbegin( ( IsUpper_v<MT4> )
4027 ?( ( IsLower_v<MT5> )
4028 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4029 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4030 :( IsLower_v<MT5> ? j : 0UL ) );
4031 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
4037 for(
size_t k=kbegin; k<kend; ++k ) {
4039 xmm1 += a1 * B.load(k,j );
4040 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
4041 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
4044 C.store( i, j , xmm1 );
4046 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// ---- Panel of 2 SIMD vectors; rows in groups of 4, 3, 2, then 1 ---------------------
// The row range is restricted by the result structure: it starts at j when LOW is set.
// NOTE(review): the definition of iend for this panel is not visible here.
4053 size_t i(
LOW ? j : 0UL );
// Four rows at a time.
4055 for( ; (i+4UL) <= iend; i+=4UL )
4057 const size_t kbegin( ( IsUpper_v<MT4> )
4058 ?( ( IsLower_v<MT5> )
4059 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4060 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4061 :( IsLower_v<MT5> ? j : 0UL ) );
4062 const size_t kend( ( IsLower_v<MT4> )
4063 ?( ( IsUpper_v<MT5> )
4064 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
4065 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
4066 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
4077 for(
size_t k=kbegin; k<kend; ++k ) {
// Two accumulators per row: (xmm1,xmm2) row i ... (xmm7,xmm8) row i+3.
4094 C.store( i , j , xmm1 );
4096 C.store( i+1UL, j , xmm3 );
4097 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
4098 C.store( i+2UL, j , xmm5 );
4099 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
4100 C.store( i+3UL, j , xmm7 );
4101 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Three rows at a time.
4104 for( ; (i+3UL) <= iend; i+=3UL )
4106 const size_t kbegin( ( IsUpper_v<MT4> )
4107 ?( ( IsLower_v<MT5> )
4108 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4109 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4110 :( IsLower_v<MT5> ? j : 0UL ) );
4111 const size_t kend( ( IsLower_v<MT4> )
4112 ?( ( IsUpper_v<MT5> )
4113 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
4114 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
4115 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
4124 for(
size_t k=kbegin; k<kend; ++k ) {
4138 C.store( i , j , xmm1 );
4140 C.store( i+1UL, j , xmm3 );
4141 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
4142 C.store( i+2UL, j , xmm5 );
4143 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two rows at a time.
4146 for( ; (i+2UL) <= iend; i+=2UL )
4148 const size_t kbegin( ( IsUpper_v<MT4> )
4149 ?( ( IsLower_v<MT5> )
4150 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4151 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4152 :( IsLower_v<MT5> ? j : 0UL ) );
4153 const size_t kend( ( IsLower_v<MT4> )
4154 ?( ( IsUpper_v<MT5> )
4155 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
4156 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
4157 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
4164 for(
size_t k=kbegin; k<kend; ++k ) {
4175 C.store( i , j , xmm1 );
4177 C.store( i+1UL, j , xmm3 );
4178 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
// Single leftover row of the 2-vector panel.
4183 const size_t kbegin( ( IsUpper_v<MT4> )
4184 ?( ( IsLower_v<MT5> )
4185 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4186 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4187 :( IsLower_v<MT5> ? j : 0UL ) );
4188 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
4193 for(
size_t k=kbegin; k<kend; ++k ) {
4195 xmm1 += a1 * B.load(k,j );
4199 C.store( i, j , xmm1 );
// ---- Panel of 1 SIMD vector; rows in groups of 4, 3, 2, then 1 ----------------------
4207 size_t i(
LOW ? j : 0UL );
// Four rows at a time: b1 (one vector of B, loaded on a line not shown) is shared by
// all four rows and multiplied by the broadcast element of A of each row.
4209 for( ; (i+4UL) <= iend; i+=4UL )
4211 const size_t kbegin( ( IsUpper_v<MT4> )
4212 ?( ( IsLower_v<MT5> )
4213 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4214 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4215 :( IsLower_v<MT5> ? j : 0UL ) );
4216 const size_t kend( ( IsLower_v<MT4> )
4217 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
4225 for(
size_t k=kbegin; k<kend; ++k ) {
4227 xmm1 +=
set( A(i ,k) ) * b1;
4228 xmm2 +=
set( A(i+1UL,k) ) * b1;
4229 xmm3 +=
set( A(i+2UL,k) ) * b1;
4230 xmm4 +=
set( A(i+3UL,k) ) * b1;
4233 C.store( i , j, xmm1 );
4234 C.store( i+1UL, j, xmm2 );
4235 C.store( i+2UL, j, xmm3 );
4236 C.store( i+3UL, j, xmm4 );
// Three rows at a time.
4239 for( ; (i+3UL) <= iend; i+=3UL )
4241 const size_t kbegin( ( IsUpper_v<MT4> )
4242 ?( ( IsLower_v<MT5> )
4243 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4244 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4245 :( IsLower_v<MT5> ? j : 0UL ) );
4246 const size_t kend( ( IsLower_v<MT4> )
4247 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
4254 for(
size_t k=kbegin; k<kend; ++k ) {
4256 xmm1 +=
set( A(i ,k) ) * b1;
4257 xmm2 +=
set( A(i+1UL,k) ) * b1;
4258 xmm3 +=
set( A(i+2UL,k) ) * b1;
4261 C.store( i , j, xmm1 );
4262 C.store( i+1UL, j, xmm2 );
4263 C.store( i+2UL, j, xmm3 );
// Two rows at a time.
4266 for( ; (i+2UL) <= iend; i+=2UL )
4268 const size_t kbegin( ( IsUpper_v<MT4> )
4269 ?( ( IsLower_v<MT5> )
4270 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4271 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4272 :( IsLower_v<MT5> ? j : 0UL ) );
4273 const size_t kend( ( IsLower_v<MT4> )
4274 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4280 for(
size_t k=kbegin; k<kend; ++k ) {
4282 xmm1 +=
set( A(i ,k) ) * b1;
4283 xmm2 +=
set( A(i+1UL,k) ) * b1;
4286 C.store( i , j, xmm1 );
4287 C.store( i+1UL, j, xmm2 );
// Single leftover row of the 1-vector panel; the inner loop runs up to K.
4292 const size_t kbegin( ( IsUpper_v<MT4> )
4293 ?( ( IsLower_v<MT5> )
4294 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4295 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4296 :( IsLower_v<MT5> ? j : 0UL ) );
4300 for(
size_t k=kbegin; k<K; ++k ) {
4301 xmm1 +=
set( A(i,k) ) * B.load(k,j);
4304 C.store( i, j, xmm1 );
// ---- Scalar clean-up of the remaining columns (only when 'remainder' is true) -------
4308 for( ; remainder && j<N; ++j )
// Row range restricted by the result structure: up to j+1 when UPP, from j when LOW.
4310 const size_t iend(
UPP ? j+1UL : M );
4311 size_t i(
LOW ? j : 0UL );
// Two rows at a time with plain scalar accumulation.
4313 for( ; (i+2UL) <= iend; i+=2UL )
4315 const size_t kbegin( ( IsUpper_v<MT4> )
4316 ?( ( IsLower_v<MT5> )
4317 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4318 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4319 :( IsLower_v<MT5> ? j : 0UL ) );
4320 const size_t kend( ( IsLower_v<MT4> )
4321 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4327 for(
size_t k=kbegin; k<kend; ++k ) {
4328 value1 += A(i ,k) * B(k,j);
4329 value2 += A(i+1UL,k) * B(k,j);
4333 C(i+1UL,j) = value2;
// Single leftover row of the scalar clean-up.
4338 const size_t kbegin( ( IsUpper_v<MT4> )
4339 ?( ( IsLower_v<MT5> )
4340 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4341 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4342 :( IsLower_v<MT5> ? j : 0UL ) );
4346 for(
size_t k=kbegin; k<K; ++k ) {
4347 value += A(i,k) * B(k,j);
// Vectorized small-matrix kernel that adds the product of A and B to C.
// It participates in overload resolution only when MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): this listing is incomplete -- the loops over the row offset i of the SIMD
// panels, most accumulator declarations and all closing braces are not visible here.
4372 template<
typename MT3
4375 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4376 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// The scalar clean-up loop near the end of this kernel only runs when C or A is not padded.
4378 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
// M and K are taken from A, N from B; k runs over the shared dimension K.
4380 const size_t M( A.rows() );
4381 const size_t N( B.columns() );
4382 const size_t K( A.columns() );
// NOTE(review): the panel that follows (8 SIMD vectors of rows per column j) appears to be
// guarded by this integral-element-type check; the braces that would confirm it are elided.
4391 if( IsIntegral_v<ElementType> )
4394 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend narrow the k range when A (MT4) or B (MT5) is lower/upper triangular;
// for operands without such structure they evaluate to 0 and K respectively.
4396 const size_t kbegin( ( IsLower_v<MT5> )
4397 ?( ( IsUpper_v<MT4> )
4398 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4399 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4400 :( IsUpper_v<MT4> ? i : 0UL ) );
4401 const size_t kend( ( IsUpper_v<MT5> )
4402 ?( ( IsLower_v<MT4> )
4403 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4404 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
4405 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Accumulator xmmN gathers the products for row offset i+(N-1)*SIMDSIZE of column j.
// b1 is defined on an elided line -- presumably a broadcast of B(k,j); TODO confirm.
4416 for(
size_t k=kbegin; k<kend; ++k ) {
4418 xmm1 += A.load(i ,k) * b1;
4419 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4420 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4421 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
4422 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
4423 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
4424 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
4425 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the accumulated panel back to column j of C.
4428 C.store( i , j, xmm1 );
4430 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
4431 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
4432 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
4433 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
4434 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
4435 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// 5-vector panel, two columns per iteration: the stores below put xmm1..xmm5 into column j
// and xmm6..xmm10 into column j+1.
4444 for( ; (j+2UL) <= N; j+=2UL )
4446 const size_t kbegin( ( IsLower_v<MT5> )
4447 ?( ( IsUpper_v<MT4> )
4448 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4449 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4450 :( IsUpper_v<MT4> ? i : 0UL ) );
4451 const size_t kend( ( IsUpper_v<MT5> )
4452 ?( ( IsLower_v<MT4> )
4453 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4454 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4455 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// Accumulators start from the current contents of C, so the products are added to C.
4462 SIMDType xmm6 ( C.load(i ,j+1UL) );
4468 for(
size_t k=kbegin; k<kend; ++k ) {
4488 C.store( i , j , xmm1 );
4490 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4491 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
4492 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
4493 C.store( i , j+1UL, xmm6 );
4494 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
4495 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
4496 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
4497 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Single-column variant of the 5-vector panel; presumably handles the column left over by
// the paired loop above (its guarding condition is not visible here).
4502 const size_t kbegin( ( IsLower_v<MT5> )
4503 ?( ( IsUpper_v<MT4> )
4504 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4505 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4506 :( IsUpper_v<MT4> ? i : 0UL ) );
4507 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
4515 for(
size_t k=kbegin; k<kend; ++k ) {
4517 xmm1 += A.load(i ,k) * b1;
4518 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4519 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4520 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
4521 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
4524 C.store( i , j, xmm1 );
4526 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
4527 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
4528 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// 4-vector panel, two columns per iteration (xmm1..xmm4 -> column j, xmm5..xmm8 -> column j+1).
4536 for( ; (j+2UL) <= N; j+=2UL )
4538 const size_t kbegin( ( IsLower_v<MT5> )
4539 ?( ( IsUpper_v<MT4> )
4540 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4541 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4542 :( IsUpper_v<MT4> ? i : 0UL ) );
4543 const size_t kend( ( IsUpper_v<MT5> )
4544 ?( ( IsLower_v<MT4> )
4545 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4546 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4547 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
4558 for(
size_t k=kbegin; k<kend; ++k ) {
4575 C.store( i , j , xmm1 );
4577 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4578 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
4579 C.store( i , j+1UL, xmm5 );
4580 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
4581 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
4582 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Single-column variant of the 4-vector panel.
4587 const size_t kbegin( ( IsLower_v<MT5> )
4588 ?( ( IsUpper_v<MT4> )
4589 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4590 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4591 :( IsUpper_v<MT4> ? i : 0UL ) );
4592 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
4599 for(
size_t k=kbegin; k<kend; ++k ) {
4601 xmm1 += A.load(i ,k) * b1;
4602 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4603 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4604 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
4607 C.store( i , j, xmm1 );
4609 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
4610 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// 3-vector panel, two columns per iteration (xmm1..xmm3 -> column j, xmm4..xmm6 -> column j+1).
4618 for( ; (j+2UL) <= N; j+=2UL )
4620 const size_t kbegin( ( IsLower_v<MT5> )
4621 ?( ( IsUpper_v<MT4> )
4622 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4623 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4624 :( IsUpper_v<MT4> ? i : 0UL ) );
4625 const size_t kend( ( IsUpper_v<MT5> )
4626 ?( ( IsLower_v<MT4> )
4627 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4628 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4629 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
4638 for(
size_t k=kbegin; k<kend; ++k ) {
4652 C.store( i , j , xmm1 );
4654 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
4655 C.store( i , j+1UL, xmm4 );
4656 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
4657 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Single-column variant of the 3-vector panel.
4662 const size_t kbegin( ( IsLower_v<MT5> )
4663 ?( ( IsUpper_v<MT4> )
4664 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4665 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4666 :( IsUpper_v<MT4> ? i : 0UL ) );
4667 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
4673 for(
size_t k=kbegin; k<kend; ++k ) {
4675 xmm1 += A.load(i ,k) * b1;
4676 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
4677 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
4680 C.store( i , j, xmm1 );
4682 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// 2-vector panel: the column index starts at i when UPP is set, otherwise at 0 (jend is
// defined on an elided line). Columns are consumed 4, then 3, then 2, then 1 at a time.
4689 size_t j(
UPP ? i : 0UL );
4691 for( ; (j+4UL) <= jend; j+=4UL )
4693 const size_t kbegin( ( IsLower_v<MT5> )
4694 ?( ( IsUpper_v<MT4> )
4695 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4696 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4697 :( IsUpper_v<MT4> ? i : 0UL ) );
4698 const size_t kend( ( IsUpper_v<MT5> )
4699 ?( ( IsLower_v<MT4> )
4700 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
4701 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
4702 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4713 for(
size_t k=kbegin; k<kend; ++k ) {
4730 C.store( i , j , xmm1 );
4732 C.store( i , j+1UL, xmm3 );
4733 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4734 C.store( i , j+2UL, xmm5 );
4735 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
4736 C.store( i , j+3UL, xmm7 );
4737 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// 2-vector panel, three columns at a time.
4740 for( ; (j+3UL) <= jend; j+=3UL )
4742 const size_t kbegin( ( IsLower_v<MT5> )
4743 ?( ( IsUpper_v<MT4> )
4744 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4745 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4746 :( IsUpper_v<MT4> ? i : 0UL ) );
4747 const size_t kend( ( IsUpper_v<MT5> )
4748 ?( ( IsLower_v<MT4> )
4749 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
4750 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
4751 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4760 for(
size_t k=kbegin; k<kend; ++k ) {
4774 C.store( i , j , xmm1 );
4776 C.store( i , j+1UL, xmm3 );
4777 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4778 C.store( i , j+2UL, xmm5 );
4779 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// 2-vector panel, two columns at a time.
4782 for( ; (j+2UL) <= jend; j+=2UL )
4784 const size_t kbegin( ( IsLower_v<MT5> )
4785 ?( ( IsUpper_v<MT4> )
4786 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4787 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4788 :( IsUpper_v<MT4> ? i : 0UL ) );
4789 const size_t kend( ( IsUpper_v<MT5> )
4790 ?( ( IsLower_v<MT4> )
4791 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4792 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4793 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4800 for(
size_t k=kbegin; k<kend; ++k ) {
4811 C.store( i , j , xmm1 );
4813 C.store( i , j+1UL, xmm3 );
4814 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
// Single-column variant of the 2-vector panel.
4819 const size_t kbegin( ( IsLower_v<MT5> )
4820 ?( ( IsUpper_v<MT4> )
4821 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4822 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4823 :( IsUpper_v<MT4> ? i : 0UL ) );
4824 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
4829 for(
size_t k=kbegin; k<kend; ++k ) {
4831 xmm1 += A.load(i ,k) * b1;
4835 C.store( i , j, xmm1 );
// 1-vector panel: a1 is defined on an elided line (presumably A.load(i,k) -- TODO confirm)
// and is multiplied by the broadcast elements B(k,j..j+3). Columns go 4, 3, 2, then 1 at a time.
4843 size_t j(
UPP ? i : 0UL );
4845 for( ; (j+4UL) <= jend; j+=4UL )
4847 const size_t kbegin( ( IsLower_v<MT5> )
4848 ?( ( IsUpper_v<MT4> )
4849 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4850 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4851 :( IsUpper_v<MT4> ? i : 0UL ) );
4852 const size_t kend( ( IsUpper_v<MT5> )
4853 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4861 for(
size_t k=kbegin; k<kend; ++k ) {
4863 xmm1 += a1 *
set( B(k,j ) );
4864 xmm2 += a1 *
set( B(k,j+1UL) );
4865 xmm3 += a1 *
set( B(k,j+2UL) );
4866 xmm4 += a1 *
set( B(k,j+3UL) );
4869 C.store( i, j , xmm1 );
4870 C.store( i, j+1UL, xmm2 );
4871 C.store( i, j+2UL, xmm3 );
4872 C.store( i, j+3UL, xmm4 );
// 1-vector panel, three columns at a time.
4875 for( ; (j+3UL) <= jend; j+=3UL )
4877 const size_t kbegin( ( IsLower_v<MT5> )
4878 ?( ( IsUpper_v<MT4> )
4879 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4880 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4881 :( IsUpper_v<MT4> ? i : 0UL ) );
4882 const size_t kend( ( IsUpper_v<MT5> )
4883 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4890 for(
size_t k=kbegin; k<kend; ++k ) {
4892 xmm1 += a1 *
set( B(k,j ) );
4893 xmm2 += a1 *
set( B(k,j+1UL) );
4894 xmm3 += a1 *
set( B(k,j+2UL) );
4897 C.store( i, j , xmm1 );
4898 C.store( i, j+1UL, xmm2 );
4899 C.store( i, j+2UL, xmm3 );
// 1-vector panel, two columns at a time.
4902 for( ; (j+2UL) <= jend; j+=2UL )
4904 const size_t kbegin( ( IsLower_v<MT5> )
4905 ?( ( IsUpper_v<MT4> )
4906 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4907 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4908 :( IsUpper_v<MT4> ? i : 0UL ) );
4909 const size_t kend( ( IsUpper_v<MT5> )
4910 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4916 for(
size_t k=kbegin; k<kend; ++k ) {
4918 xmm1 += a1 *
set( B(k,j ) );
4919 xmm2 += a1 *
set( B(k,j+1UL) );
4922 C.store( i, j , xmm1 );
4923 C.store( i, j+1UL, xmm2 );
// Single vector, single column: here the k loop runs up to K without an upper restriction.
4928 const size_t kbegin( ( IsLower_v<MT5> )
4929 ?( ( IsUpper_v<MT4> )
4930 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4931 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4932 :( IsUpper_v<MT4> ? i : 0UL ) );
4936 for(
size_t k=kbegin; k<K; ++k ) {
4937 xmm1 += A.load(i,k) *
set( B(k,j) );
4940 C.store( i, j, xmm1 );
// Scalar clean-up over the remaining rows; compiled out when both C and A are padded
// (remainder == false). The column range follows the LOW/UPP flags of the expression.
4944 for( ; remainder && i<M; ++i )
4946 const size_t jend(
LOW ? i+1UL : N );
4947 size_t j(
UPP ? i : 0UL );
// Two columns per iteration, accumulated in the scalars value1/value2 (declared on elided lines).
4949 for( ; (j+2UL) <= jend; j+=2UL )
4951 const size_t kbegin( ( IsLower_v<MT5> )
4952 ?( ( IsUpper_v<MT4> )
4953 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4954 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4955 :( IsUpper_v<MT4> ? i : 0UL ) );
4956 const size_t kend( ( IsUpper_v<MT5> )
4957 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4963 for(
size_t k=kbegin; k<kend; ++k ) {
4964 value1 += A(i,k) * B(k,j );
4965 value2 += A(i,k) * B(k,j+1UL);
4969 C(i,j+1UL) = value2;
// Scalar variant for a single column.
4974 const size_t kbegin( ( IsLower_v<MT5> )
4975 ?( ( IsUpper_v<MT4> )
4976 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4977 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4978 :( IsUpper_v<MT4> ? i : 0UL ) );
4982 for(
size_t k=kbegin; k<K; ++k ) {
4983 value += A(i,k) * B(k,j);
// Large-matrix add-assign kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
5007 template<
typename MT3
5010 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5011 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// No vectorized path for these types: forward to the default add-assign kernel.
5013 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix add-assign kernel, vectorized overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not visible in this excerpt.
5033 template<
typename MT3
5036 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5037 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS add-assign kernel, fallback overload: selected when UseBlasKernel_v<MT3,MT4,MT5>
// is false.
5063 template<
typename MT3
5066 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5067 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// BLAS cannot be used for these types: forward to the large-matrix kernel instead.
5069 selectLargeAddAssignKernel( C, A, B );
5075#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS add-assign kernel: selected when UseBlasKernel_v<MT3,MT4,MT5> is true. It is only
// compiled when BLAS matrix/matrix multiplication is enabled (see the surrounding #if).
5089 template<
typename MT3
5092 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5093 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Element type of the target; used to spell the scalar factor 1 passed to the BLAS wrappers.
5095 using ET = ElementType_t<MT3>;
// A is triangular: copy B into a temporary, apply the triangular multiply with A on the
// left, then add the temporary to C.
// NOTE(review): trmm presumably overwrites tmp in place with the product -- confirm.
5097 if( IsTriangular_v<MT4> ) {
5098 ResultType_t<MT3> tmp(
serial( B ) );
5099 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
5100 addAssign( C, tmp );
// B is triangular: the same scheme with the roles swapped (copy of A, B applied on the right).
5102 else if( IsTriangular_v<MT5> ) {
5103 ResultType_t<MT3> tmp(
serial( A ) );
5104 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
5105 addAssign( C, tmp );
// General case: a single gemm call with both scalar factors equal to 1.
// NOTE(review): presumably the two factors are alpha and beta, giving C = A*B + C -- confirm
// against the gemm wrapper's parameter order.
5108 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of a transpose dense matrix / dense matrix product to a dense
// matrix (lhs -= rhs).
// NOTE(review): several lines are elided here, including the definitions of the operands
// A and B that are passed to the kernel selection below.
5132 template<
typename MT
5134 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
// Degenerate sizes: the target has no rows or no columns, or the left operand has no columns
// (the action taken inside this branch is elided in this excerpt).
5141 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hand the operands to the kernel selection.
5155 TDMatDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
// Chooses between the small-matrix and the BLAS subtraction assignment kernels for C -= A*B.
5171 template<
typename MT3
5174 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is used if any of the following holds:
//  - both operands are diagonal,
//  - outside debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//  - outside debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
5176 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
5177 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
5178 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
5179 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
5180 selectSmallSubAssignKernel( C, A, B );
// NOTE(review): the line between these two calls (presumably `else`) is not visible here.
5182 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction assignment kernel C -= A*B for a row-major target when
// neither operand is diagonal.
// NOTE(review): braces and some ternary branches are elided in this excerpt.
5201 template<
typename MT3
5204 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5205 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5207 const size_t M( A.rows() );
5208 const size_t N( B.columns() );
5209 const size_t K( A.columns() );
// Row by row over the target.
5213 for(
size_t i=0UL; i<M; ++i )
// k range for row i, narrowed when A is (strictly) upper or lower; the alternative
// branches of both ternaries are on elided lines.
5215 const size_t kbegin( ( IsUpper_v<MT4> )
5216 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5218 const size_t kend( ( IsLower_v<MT4> )
5219 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5223 for(
size_t k=kbegin; k<kend; ++k )
// Column range for this (i,k) pair: derived from the structure of B and from the
// compile-time LOW/UPP flags of the expression.
5225 const size_t jbegin( ( IsUpper_v<MT5> )
5226 ?( ( IsStrictlyUpper_v<MT5> )
5227 ?(
UPP ?
max(i,k+1UL) : k+1UL )
5228 :(
UPP ?
max(i,k) : k ) )
5229 :(
UPP ? i : 0UL ) );
5230 const size_t jend( ( IsLower_v<MT5> )
5231 ?( ( IsStrictlyLower_v<MT5> )
5232 ?(
LOW ?
min(i+1UL,k) : k )
5233 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
5234 :(
LOW ? i+1UL : N ) );
// Only checked when LOW or UPP is set: skip an empty column range.
5236 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// Split the range into a part processed two columns at a time and a tail at jpos.
// NOTE(review): prevMultiple(jnum, 2) presumably rounds jnum down to an even count -- confirm.
5239 const size_t jnum( jend - jbegin );
5240 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
5243 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5244 C(i,j ) -= A(i,k) * B(k,j );
5245 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Trailing single column; the condition guarding this update is not visible in this excerpt.
5248 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default (scalar) subtraction assignment kernel C -= A*B for a column-major target when
// neither operand is diagonal.
// NOTE(review): braces and some ternary branches are elided in this excerpt.
5270 template<
typename MT3
5273 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5274 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5276 const size_t M( A.rows() );
5277 const size_t N( B.columns() );
5278 const size_t K( A.columns() );
// Column by column over the target.
5282 for(
size_t j=0UL; j<N; ++j )
// k range for column j, narrowed when B is (strictly) lower or upper; the alternative
// branches of both ternaries are on elided lines.
5284 const size_t kbegin( ( IsLower_v<MT5> )
5285 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5287 const size_t kend( ( IsUpper_v<MT5> )
5288 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5292 for(
size_t k=kbegin; k<kend; ++k )
// Row range for this (k,j) pair: derived from the structure of A and from the
// compile-time LOW/UPP flags of the expression.
5294 const size_t ibegin( ( IsLower_v<MT4> )
5295 ?( ( IsStrictlyLower_v<MT4> )
5296 ?(
LOW ?
max(j,k+1UL) : k+1UL )
5297 :(
LOW ?
max(j,k) : k ) )
5298 :(
LOW ? j : 0UL ) );
5299 const size_t iend( ( IsUpper_v<MT4> )
5300 ?( ( IsStrictlyUpper_v<MT4> )
5301 ?(
UPP ?
min(j+1UL,k) : k )
5302 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
5303 :(
UPP ? j+1UL : M ) );
// Only checked when LOW or UPP is set: skip an empty row range.
5305 if( (
LOW ||
UPP ) && ( ibegin >= iend ) )
continue;
// Split the range into a part processed two rows at a time and a tail at ipos.
// NOTE(review): prevMultiple(inum, 2) presumably rounds inum down to an even count -- confirm.
5308 const size_t inum( iend - ibegin );
5309 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
5312 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5313 C(i ,j) -= A(i ,k) * B(k,j);
5314 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Trailing single row; the condition guarding this update is not visible in this excerpt.
5317 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction assignment kernel for a row-major target when B is diagonal and A is
// not: only B(j,j) is read, so each element update is C(i,j) -= A(i,j) * B(j,j).
// NOTE(review): braces and the alternative branches of the ternaries are elided here.
5339 template<
typename MT3
5342 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5343 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Edge length of the square tiles the iteration space is cut into.
5345 constexpr size_t block( BLOCK_SIZE );
5347 const size_t M( A.rows() );
5348 const size_t N( B.columns() );
// Walk the target tile by tile: rows [ii,iend) x columns [jj,jend).
5350 for(
size_t ii=0UL; ii<M; ii+=block ) {
5351 const size_t iend(
min( M, ii+block ) );
5352 for(
size_t jj=0UL; jj<N; jj+=block ) {
5353 const size_t jend(
min( N, jj+block ) );
5354 for(
size_t i=ii; i<iend; ++i )
// Column range of row i inside the tile, clipped further when A is (strictly) upper or lower.
5356 const size_t jbegin( ( IsUpper_v<MT4> )
5357 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
5359 const size_t jpos( ( IsLower_v<MT4> )
5360 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
5363 for(
size_t j=jbegin; j<jpos; ++j ) {
5364 C(i,j) -= A(i,j) * B(j,j);
// Default subtraction assignment kernel for a column-major target when B is diagonal and A
// is not: column j of C is updated with column j of A scaled by B(j,j).
// NOTE(review): braces and the alternative branches of the ternaries are elided here.
5387 template<
typename MT3
5390 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5391 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5393 const size_t M( A.rows() );
5394 const size_t N( B.columns() );
5396 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed when A is (strictly) lower or upper.
5398 const size_t ibegin( ( IsLower_v<MT4> )
5399 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5401 const size_t iend( ( IsUpper_v<MT4> )
5402 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Split the range into a part processed two rows at a time and a tail at ipos.
// NOTE(review): prevMultiple(inum, 2) presumably rounds inum down to an even count -- confirm.
5406 const size_t inum( iend - ibegin );
5407 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
5410 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5411 C(i ,j) -= A(i ,j) * B(j,j);
5412 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Trailing single row; the condition guarding this update is not visible in this excerpt.
5415 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction assignment kernel for a row-major target when A is diagonal and B is
// not: row i of C is updated with row i of B scaled by A(i,i).
// NOTE(review): braces and the alternative branches of the ternaries are elided here.
5436 template<
typename MT3
5439 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5440 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5442 const size_t M( A.rows() );
5443 const size_t N( B.columns() );
5445 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when B is (strictly) upper or lower.
5447 const size_t jbegin( ( IsUpper_v<MT5> )
5448 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5450 const size_t jend( ( IsLower_v<MT5> )
5451 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Split the range into a part processed two columns at a time and a tail at jpos.
// NOTE(review): prevMultiple(jnum, 2) presumably rounds jnum down to an even count -- confirm.
5455 const size_t jnum( jend - jbegin );
5456 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
5459 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
5460 C(i,j ) -= A(i,i) * B(i,j );
5461 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Trailing single column; the condition guarding this update is not visible in this excerpt.
5464 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction assignment kernel for a column-major target when A is diagonal and B
// is not: only A(i,i) is read, so each element update is C(i,j) -= A(i,i) * B(i,j).
// NOTE(review): braces and the alternative branches of the ternaries are elided here.
5485 template<
typename MT3
5488 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5489 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Edge length of the square tiles the iteration space is cut into.
5491 constexpr size_t block( BLOCK_SIZE );
5493 const size_t M( A.rows() );
5494 const size_t N( B.columns() );
// Walk the target tile by tile: columns [jj,jend) x rows [ii,iend).
5496 for(
size_t jj=0UL; jj<N; jj+=block ) {
5497 const size_t jend(
min( N, jj+block ) );
5498 for(
size_t ii=0UL; ii<M; ii+=block ) {
5499 const size_t iend(
min( M, ii+block ) );
5500 for(
size_t j=jj; j<jend; ++j )
// Row range of column j inside the tile, clipped further when B is (strictly) lower or upper.
5502 const size_t ibegin( ( IsLower_v<MT5> )
5503 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
5505 const size_t ipos( ( IsUpper_v<MT5> )
5506 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
5509 for(
size_t i=ibegin; i<ipos; ++i ) {
5510 C(i,j) -= A(i,i) * B(i,j);
// Default subtraction assignment kernel when both operands are diagonal: only the diagonal
// elements of C are updated, one multiplication per row of A.
5533 template<
typename MT3
5536 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5537 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5539 for(
size_t i=0UL; i<A.rows(); ++i ) {
5540 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
5560 template<
typename MT3
5563 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5564 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// No vectorized path for these types: forward to the default sub-assign kernel.
5566 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix kernel that subtracts the product of A and B from C.
// It participates in overload resolution only when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): this listing is incomplete -- the loops over the column offset j of the SIMD
// panels, most accumulator declarations and all closing braces are not visible here.
5586 template<
typename MT3
5589 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5590 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// The scalar clean-up loop near the end of this kernel only runs when C or B is not padded.
5592 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
// M and K are taken from A, N from B; k runs over the shared dimension K.
5594 const size_t M( A.rows() );
5595 const size_t N( B.columns() );
5596 const size_t K( A.columns() );
// NOTE(review): the panel that follows (8 SIMD vectors of columns per row i) appears to be
// guarded by this integral-element-type check; the braces that would confirm it are elided.
5605 if( IsIntegral_v<ElementType> )
5608 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend narrow the k range when A (MT4) or B (MT5) is upper/lower triangular;
// for operands without such structure they evaluate to 0 and K respectively.
5610 const size_t kbegin( ( IsUpper_v<MT4> )
5611 ?( ( IsLower_v<MT5> )
5612 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5613 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5614 :( IsLower_v<MT5> ? j : 0UL ) );
5615 const size_t kend( ( IsLower_v<MT4> )
5616 ?( ( IsUpper_v<MT5> )
5617 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
5618 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5619 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// Accumulator xmmN collects the products for column offset j+(N-1)*SIMDSIZE of row i.
// a1 is defined on an elided line -- presumably a broadcast of A(i,k); TODO confirm.
5630 for(
size_t k=kbegin; k<kend; ++k ) {
5632 xmm1 -= a1 * B.load(k,j );
5633 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5634 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5635 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5636 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
5637 xmm6 -= a1 * B.load(k,j+
SIMDSIZE*5UL);
5638 xmm7 -= a1 * B.load(k,j+
SIMDSIZE*6UL);
5639 xmm8 -= a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write the updated panel back to row i of C.
5642 C.store( i, j , xmm1 );
5644 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5645 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
5646 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
5647 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
5648 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
5649 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// 5-vector panel, two rows per iteration: the stores below put xmm1..xmm5 into row i
// and xmm6..xmm10 into row i+1.
5658 for( ; (i+2UL) <= M; i+=2UL )
5660 const size_t kbegin( ( IsUpper_v<MT4> )
5661 ?( ( IsLower_v<MT5> )
5662 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5663 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5664 :( IsLower_v<MT5> ? j : 0UL ) );
5665 const size_t kend( ( IsLower_v<MT4> )
5666 ?( ( IsUpper_v<MT5> )
5667 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
5668 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5669 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// Accumulators start from the current contents of C, so the products are subtracted from C.
5676 SIMDType xmm6 ( C.load(i+1UL,j ) );
5682 for(
size_t k=kbegin; k<kend; ++k ) {
5702 C.store( i , j , xmm1 );
5704 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5705 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
5706 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
5707 C.store( i+1UL, j , xmm6 );
5708 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
5709 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
5710 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
5711 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Single-row variant of the 5-vector panel; presumably handles the row left over by the
// paired loop above (its guarding condition is not visible here).
5716 const size_t kbegin( ( IsUpper_v<MT4> )
5717 ?( ( IsLower_v<MT5> )
5718 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5719 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5720 :( IsLower_v<MT5> ? j : 0UL ) );
5721 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
5729 for(
size_t k=kbegin; k<kend; ++k ) {
5731 xmm1 -= a1 * B.load(k,j );
5732 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5733 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5734 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5735 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
5738 C.store( i, j , xmm1 );
5740 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5741 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
5742 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// 4-vector panel, two rows per iteration (xmm1..xmm4 -> row i, xmm5..xmm8 -> row i+1).
5750 for( ; (i+2UL) <= M; i+=2UL )
5752 const size_t kbegin( ( IsUpper_v<MT4> )
5753 ?( ( IsLower_v<MT5> )
5754 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5755 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5756 :( IsLower_v<MT5> ? j : 0UL ) );
5757 const size_t kend( ( IsLower_v<MT4> )
5758 ?( ( IsUpper_v<MT5> )
5759 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
5760 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5761 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
5772 for(
size_t k=kbegin; k<kend; ++k ) {
5789 C.store( i , j , xmm1 );
5791 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5792 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
5793 C.store( i+1UL, j , xmm5 );
5794 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
5795 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
5796 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single-row variant of the 4-vector panel.
5801 const size_t kbegin( ( IsUpper_v<MT4> )
5802 ?( ( IsLower_v<MT5> )
5803 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5804 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5805 :( IsLower_v<MT5> ? j : 0UL ) );
5806 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
5813 for(
size_t k=kbegin; k<kend; ++k ) {
5815 xmm1 -= a1 * B.load(k,j );
5816 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5817 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5818 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
5821 C.store( i, j , xmm1 );
5823 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
5824 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// 3-vector panel, two rows per iteration (xmm1..xmm3 -> row i, xmm4..xmm6 -> row i+1).
5832 for( ; (i+2UL) <= M; i+=2UL )
5834 const size_t kbegin( ( IsUpper_v<MT4> )
5835 ?( ( IsLower_v<MT5> )
5836 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5837 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5838 :( IsLower_v<MT5> ? j : 0UL ) );
5839 const size_t kend( ( IsLower_v<MT4> )
5840 ?( ( IsUpper_v<MT5> )
5841 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
5842 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5843 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
5852 for(
size_t k=kbegin; k<kend; ++k ) {
5866 C.store( i , j , xmm1 );
5868 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
5869 C.store( i+1UL, j , xmm4 );
5870 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
5871 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single-row variant of the 3-vector panel.
5876 const size_t kbegin( ( IsUpper_v<MT4> )
5877 ?( ( IsLower_v<MT5> )
5878 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5879 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5880 :( IsLower_v<MT5> ? j : 0UL ) );
5881 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
5887 for(
size_t k=kbegin; k<kend; ++k ) {
5889 xmm1 -= a1 * B.load(k,j );
5890 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
5891 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
5894 C.store( i, j , xmm1 );
5896 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// 2-vector panel: the row index starts at j when LOW is set, otherwise at 0 (iend is
// defined on an elided line). Rows are consumed 4, then 3, then 2, then 1 at a time.
5903 size_t i(
LOW ? j : 0UL );
5905 for( ; (i+4UL) <= iend; i+=4UL )
5907 const size_t kbegin( ( IsUpper_v<MT4> )
5908 ?( ( IsLower_v<MT5> )
5909 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5910 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5911 :( IsLower_v<MT5> ? j : 0UL ) );
5912 const size_t kend( ( IsLower_v<MT4> )
5913 ?( ( IsUpper_v<MT5> )
5914 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
5915 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5916 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5927 for(
size_t k=kbegin; k<kend; ++k ) {
5944 C.store( i , j , xmm1 );
5946 C.store( i+1UL, j , xmm3 );
5947 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
5948 C.store( i+2UL, j , xmm5 );
5949 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
5950 C.store( i+3UL, j , xmm7 );
5951 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// 2-vector panel, three rows at a time.
5954 for( ; (i+3UL) <= iend; i+=3UL )
5956 const size_t kbegin( ( IsUpper_v<MT4> )
5957 ?( ( IsLower_v<MT5> )
5958 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5959 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5960 :( IsLower_v<MT5> ? j : 0UL ) );
5961 const size_t kend( ( IsLower_v<MT4> )
5962 ?( ( IsUpper_v<MT5> )
5963 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
5964 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5965 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5974 for(
size_t k=kbegin; k<kend; ++k ) {
5988 C.store( i , j , xmm1 );
5990 C.store( i+1UL, j , xmm3 );
5991 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
5992 C.store( i+2UL, j , xmm5 );
5993 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// 2-vector panel, two rows at a time.
5996 for( ; (i+2UL) <= iend; i+=2UL )
5998 const size_t kbegin( ( IsUpper_v<MT4> )
5999 ?( ( IsLower_v<MT5> )
6000 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6001 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6002 :( IsLower_v<MT5> ? j : 0UL ) );
6003 const size_t kend( ( IsLower_v<MT4> )
6004 ?( ( IsUpper_v<MT5> )
6005 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
6006 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6007 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6014 for(
size_t k=kbegin; k<kend; ++k ) {
6025 C.store( i , j , xmm1 );
6027 C.store( i+1UL, j , xmm3 );
6028 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
// Single-row variant of the 2-vector panel.
6033 const size_t kbegin( ( IsUpper_v<MT4> )
6034 ?( ( IsLower_v<MT5> )
6035 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6036 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6037 :( IsLower_v<MT5> ? j : 0UL ) );
6038 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
6043 for(
size_t k=kbegin; k<kend; ++k ) {
6045 xmm1 -= a1 * B.load(k,j );
6049 C.store( i, j , xmm1 );
// 1-vector panel: b1 is defined on an elided line (presumably B.load(k,j) -- TODO confirm)
// and is multiplied by the broadcast elements A(i..i+3,k). Rows go 4, 3, 2, then 1 at a time.
6057 size_t i(
LOW ? j : 0UL );
6059 for( ; (i+4UL) <= iend; i+=4UL )
6061 const size_t kbegin( ( IsUpper_v<MT4> )
6062 ?( ( IsLower_v<MT5> )
6063 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6064 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6065 :( IsLower_v<MT5> ? j : 0UL ) );
6066 const size_t kend( ( IsLower_v<MT4> )
6067 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
6075 for(
size_t k=kbegin; k<kend; ++k ) {
6077 xmm1 -=
set( A(i ,k) ) * b1;
6078 xmm2 -=
set( A(i+1UL,k) ) * b1;
6079 xmm3 -=
set( A(i+2UL,k) ) * b1;
6080 xmm4 -=
set( A(i+3UL,k) ) * b1;
6083 C.store( i , j, xmm1 );
6084 C.store( i+1UL, j, xmm2 );
6085 C.store( i+2UL, j, xmm3 );
6086 C.store( i+3UL, j, xmm4 );
// 1-vector panel, three rows at a time.
6089 for( ; (i+3UL) <= iend; i+=3UL )
6091 const size_t kbegin( ( IsUpper_v<MT4> )
6092 ?( ( IsLower_v<MT5> )
6093 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6094 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6095 :( IsLower_v<MT5> ? j : 0UL ) );
6096 const size_t kend( ( IsLower_v<MT4> )
6097 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
6104 for(
size_t k=kbegin; k<kend; ++k ) {
6106 xmm1 -=
set( A(i ,k) ) * b1;
6107 xmm2 -=
set( A(i+1UL,k) ) * b1;
6108 xmm3 -=
set( A(i+2UL,k) ) * b1;
6111 C.store( i , j, xmm1 );
6112 C.store( i+1UL, j, xmm2 );
6113 C.store( i+2UL, j, xmm3 );
// 1-vector panel, two rows at a time.
6116 for( ; (i+2UL) <= iend; i+=2UL )
6118 const size_t kbegin( ( IsUpper_v<MT4> )
6119 ?( ( IsLower_v<MT5> )
6120 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6121 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6122 :( IsLower_v<MT5> ? j : 0UL ) );
6123 const size_t kend( ( IsLower_v<MT4> )
6124 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6130 for(
size_t k=kbegin; k<kend; ++k ) {
6132 xmm1 -=
set( A(i ,k) ) * b1;
6133 xmm2 -=
set( A(i+1UL,k) ) * b1;
6136 C.store( i , j, xmm1 );
6137 C.store( i+1UL, j, xmm2 );
// Single vector, single row: here the k loop runs up to K without an upper restriction.
6142 const size_t kbegin( ( IsUpper_v<MT4> )
6143 ?( ( IsLower_v<MT5> )
6144 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6145 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6146 :( IsLower_v<MT5> ? j : 0UL ) );
6150 for(
size_t k=kbegin; k<K; ++k ) {
6151 xmm1 -=
set( A(i,k) ) * B.load(k,j);
6154 C.store( i, j, xmm1 );
// Scalar clean-up over the remaining columns; compiled out when both C and B are padded
// (remainder == false). The row range follows the LOW/UPP flags of the expression.
6158 for( ; remainder && j<N; ++j )
6160 const size_t iend(
UPP ? j+1UL : M );
6161 size_t i(
LOW ? j : 0UL );
// Two rows per iteration, accumulated in the scalars value1/value2 (declared on elided lines).
6163 for( ; (i+2UL) <= iend; i+=2UL )
6165 const size_t kbegin( ( IsUpper_v<MT4> )
6166 ?( ( IsLower_v<MT5> )
6167 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6168 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6169 :( IsLower_v<MT5> ? j : 0UL ) );
6170 const size_t kend( ( IsLower_v<MT4> )
6171 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6177 for(
size_t k=kbegin; k<kend; ++k ) {
6178 value1 -= A(i ,k) * B(k,j);
6179 value2 -= A(i+1UL,k) * B(k,j);
6183 C(i+1UL,j) = value2;
// Scalar variant for a single row.
6188 const size_t kbegin( ( IsUpper_v<MT4> )
6189 ?( ( IsLower_v<MT5> )
6190 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6191 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6192 :( IsLower_v<MT5> ? j : 0UL ) );
6196 for(
size_t k=kbegin; k<K; ++k ) {
6197 value -= A(i,k) * B(k,j);
// Vectorized small-matrix kernel for the subtraction assignment of the product A*B to C.
// This overload is selected only when C (MT3) is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
//
//   C  target matrix; accumulators are loaded from it, decremented and stored back
//   A  left-hand side operand  (M = A.rows(), K = A.columns())
//   B  right-hand side operand (N = B.columns())
//
// The rows of C are processed in tiles of several SIMD vectors and the columns in small groups.
// For every tile the k-range [kbegin,kend) is narrowed at compile time according to the
// lower/upper (strictly) triangular structure of A (MT4) and B (MT5), so that structurally
// zero products are not evaluated.
6222 template<
typename MT3
6225 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6226 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar clean-up pass is required as soon as either C or A is not padded.
6228 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
6230 const size_t M( A.rows() );
6231 const size_t N( B.columns() );
6232 const size_t K( A.columns() );
// Widest tile: eight SIMD vectors of one column of C (kend is clamped to i+SIMDSIZE*8).
// It follows the test for integral element types.
6241 if( IsIntegral_v<ElementType> )
6244 for(
size_t j=0UL; j<N; ++j )
6246 const size_t kbegin( ( IsLower_v<MT5> )
6247 ?( ( IsUpper_v<MT4> )
6248 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6249 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6250 :( IsUpper_v<MT4> ? i : 0UL ) );
6251 const size_t kend( ( IsUpper_v<MT5> )
6252 ?( ( IsLower_v<MT4> )
6253 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
6254 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
6255 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// b1 is presumably the broadcast of B(k,j) -- confirm against the full loop body.
6266 for(
size_t k=kbegin; k<kend; ++k ) {
6268 xmm1 -= A.load(i ,k) * b1;
6269 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6270 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6271 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
6272 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
6273 xmm6 -= A.load(i+
SIMDSIZE*5UL,k) * b1;
6274 xmm7 -= A.load(i+
SIMDSIZE*6UL,k) * b1;
6275 xmm8 -= A.load(i+
SIMDSIZE*7UL,k) * b1;
6278 C.store( i , j, xmm1 );
6280 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
6281 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
6282 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
6283 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
6284 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
6285 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Tile of five SIMD vectors by two columns of C (xmm1..xmm5 for column j, xmm6..xmm10 for j+1).
6294 for( ; (j+2UL) <= N; j+=2UL )
6296 const size_t kbegin( ( IsLower_v<MT5> )
6297 ?( ( IsUpper_v<MT4> )
6298 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6299 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6300 :( IsUpper_v<MT4> ? i : 0UL ) );
6301 const size_t kend( ( IsUpper_v<MT5> )
6302 ?( ( IsLower_v<MT4> )
6303 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6304 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6305 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
6312 SIMDType xmm6 ( C.load(i ,j+1UL) );
6318 for(
size_t k=kbegin; k<kend; ++k ) {
6338 C.store( i , j , xmm1 );
6340 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6341 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
6342 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
6343 C.store( i , j+1UL, xmm6 );
6344 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
6345 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
6346 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
6347 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Five SIMD vectors of a single column (presumably the column left over by the pairwise loop).
6352 const size_t kbegin( ( IsLower_v<MT5> )
6353 ?( ( IsUpper_v<MT4> )
6354 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6355 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6356 :( IsUpper_v<MT4> ? i : 0UL ) );
6357 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
6365 for(
size_t k=kbegin; k<kend; ++k ) {
6367 xmm1 -= A.load(i ,k) * b1;
6368 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6369 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6370 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
6371 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
6374 C.store( i , j, xmm1 );
6376 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
6377 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
6378 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Tile of four SIMD vectors by two columns of C.
6386 for( ; (j+2UL) <= N; j+=2UL )
6388 const size_t kbegin( ( IsLower_v<MT5> )
6389 ?( ( IsUpper_v<MT4> )
6390 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6391 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6392 :( IsUpper_v<MT4> ? i : 0UL ) );
6393 const size_t kend( ( IsUpper_v<MT5> )
6394 ?( ( IsLower_v<MT4> )
6395 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6396 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6397 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
6408 for(
size_t k=kbegin; k<kend; ++k ) {
6425 C.store( i , j , xmm1 );
6427 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6428 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
6429 C.store( i , j+1UL, xmm5 );
6430 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
6431 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
6432 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Four SIMD vectors of a single column.
6437 const size_t kbegin( ( IsLower_v<MT5> )
6438 ?( ( IsUpper_v<MT4> )
6439 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6440 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6441 :( IsUpper_v<MT4> ? i : 0UL ) );
6442 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
6449 for(
size_t k=kbegin; k<kend; ++k ) {
6451 xmm1 -= A.load(i ,k) * b1;
6452 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6453 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6454 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
6457 C.store( i , j, xmm1 );
6459 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
6460 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Tile of three SIMD vectors by two columns of C.
6468 for( ; (j+2UL) <= N; j+=2UL )
6470 const size_t kbegin( ( IsLower_v<MT5> )
6471 ?( ( IsUpper_v<MT4> )
6472 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6473 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6474 :( IsUpper_v<MT4> ? i : 0UL ) );
6475 const size_t kend( ( IsUpper_v<MT5> )
6476 ?( ( IsLower_v<MT4> )
6477 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6478 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6479 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
6488 for(
size_t k=kbegin; k<kend; ++k ) {
6502 C.store( i , j , xmm1 );
6504 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
6505 C.store( i , j+1UL, xmm4 );
6506 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
6507 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Three SIMD vectors of a single column.
6512 const size_t kbegin( ( IsLower_v<MT5> )
6513 ?( ( IsUpper_v<MT4> )
6514 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6515 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6516 :( IsUpper_v<MT4> ? i : 0UL ) );
6517 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
6523 for(
size_t k=kbegin; k<kend; ++k ) {
6525 xmm1 -= A.load(i ,k) * b1;
6526 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
6527 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
6530 C.store( i , j, xmm1 );
6532 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// From here on the column range is restricted by the declared structure of the result:
// j starts at i if the result is upper (UPP), otherwise at 0.
// Tile of two SIMD vectors by four columns of C.
6539 size_t j(
UPP ? i : 0UL );
6541 for( ; (j+4UL) <= jend; j+=4UL )
6543 const size_t kbegin( ( IsLower_v<MT5> )
6544 ?( ( IsUpper_v<MT4> )
6545 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6546 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6547 :( IsUpper_v<MT4> ? i : 0UL ) );
6548 const size_t kend( ( IsUpper_v<MT5> )
6549 ?( ( IsLower_v<MT4> )
6550 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6551 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6552 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6563 for(
size_t k=kbegin; k<kend; ++k ) {
6580 C.store( i , j , xmm1 );
6582 C.store( i , j+1UL, xmm3 );
6583 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
6584 C.store( i , j+2UL, xmm5 );
6585 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
6586 C.store( i , j+3UL, xmm7 );
6587 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Tile of two SIMD vectors by three columns of C.
6590 for( ; (j+3UL) <= jend; j+=3UL )
6592 const size_t kbegin( ( IsLower_v<MT5> )
6593 ?( ( IsUpper_v<MT4> )
6594 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6595 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6596 :( IsUpper_v<MT4> ? i : 0UL ) );
6597 const size_t kend( ( IsUpper_v<MT5> )
6598 ?( ( IsLower_v<MT4> )
6599 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6600 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6601 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6610 for(
size_t k=kbegin; k<kend; ++k ) {
6624 C.store( i , j , xmm1 );
6626 C.store( i , j+1UL, xmm3 );
6627 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
6628 C.store( i , j+2UL, xmm5 );
6629 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Tile of two SIMD vectors by two columns of C.
6632 for( ; (j+2UL) <= jend; j+=2UL )
6634 const size_t kbegin( ( IsLower_v<MT5> )
6635 ?( ( IsUpper_v<MT4> )
6636 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6637 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6638 :( IsUpper_v<MT4> ? i : 0UL ) );
6639 const size_t kend( ( IsUpper_v<MT5> )
6640 ?( ( IsLower_v<MT4> )
6641 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6642 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6643 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6650 for(
size_t k=kbegin; k<kend; ++k ) {
6661 C.store( i , j , xmm1 );
6663 C.store( i , j+1UL, xmm3 );
6664 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
// Two SIMD vectors of a single column (kend is clamped to i+SIMDSIZE*2).
6669 const size_t kbegin( ( IsLower_v<MT5> )
6670 ?( ( IsUpper_v<MT4> )
6671 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6672 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6673 :( IsUpper_v<MT4> ? i : 0UL ) );
6674 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
6679 for(
size_t k=kbegin; k<kend; ++k ) {
6681 xmm1 -= A.load(i ,k) * b1;
6685 C.store( i , j, xmm1 );
// One SIMD vector of A (a1, presumably A.load(i,k)) against four broadcast elements of B.
6693 size_t j(
UPP ? i : 0UL );
6695 for( ; (j+4UL) <= jend; j+=4UL )
6697 const size_t kbegin( ( IsLower_v<MT5> )
6698 ?( ( IsUpper_v<MT4> )
6699 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6700 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6701 :( IsUpper_v<MT4> ? i : 0UL ) );
6702 const size_t kend( ( IsUpper_v<MT5> )
6703 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6711 for(
size_t k=kbegin; k<kend; ++k ) {
6713 xmm1 -= a1 *
set( B(k,j ) );
6714 xmm2 -= a1 *
set( B(k,j+1UL) );
6715 xmm3 -= a1 *
set( B(k,j+2UL) );
6716 xmm4 -= a1 *
set( B(k,j+3UL) );
6719 C.store( i, j , xmm1 );
6720 C.store( i, j+1UL, xmm2 );
6721 C.store( i, j+2UL, xmm3 );
6722 C.store( i, j+3UL, xmm4 );
// One SIMD vector by three columns of C.
6725 for( ; (j+3UL) <= jend; j+=3UL )
6727 const size_t kbegin( ( IsLower_v<MT5> )
6728 ?( ( IsUpper_v<MT4> )
6729 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6730 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6731 :( IsUpper_v<MT4> ? i : 0UL ) );
6732 const size_t kend( ( IsUpper_v<MT5> )
6733 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6740 for(
size_t k=kbegin; k<kend; ++k ) {
6742 xmm1 -= a1 *
set( B(k,j ) );
6743 xmm2 -= a1 *
set( B(k,j+1UL) );
6744 xmm3 -= a1 *
set( B(k,j+2UL) );
6747 C.store( i, j , xmm1 );
6748 C.store( i, j+1UL, xmm2 );
6749 C.store( i, j+2UL, xmm3 );
// One SIMD vector by two columns of C.
6752 for( ; (j+2UL) <= jend; j+=2UL )
6754 const size_t kbegin( ( IsLower_v<MT5> )
6755 ?( ( IsUpper_v<MT4> )
6756 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6757 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6758 :( IsUpper_v<MT4> ? i : 0UL ) );
6759 const size_t kend( ( IsUpper_v<MT5> )
6760 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6766 for(
size_t k=kbegin; k<kend; ++k ) {
6768 xmm1 -= a1 *
set( B(k,j ) );
6769 xmm2 -= a1 *
set( B(k,j+1UL) );
6772 C.store( i, j , xmm1 );
6773 C.store( i, j+1UL, xmm2 );
// One SIMD vector of a single column; k runs up to K without an upper clamp.
6778 const size_t kbegin( ( IsLower_v<MT5> )
6779 ?( ( IsUpper_v<MT4> )
6780 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6781 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6782 :( IsUpper_v<MT4> ? i : 0UL ) );
6786 for(
size_t k=kbegin; k<K; ++k ) {
6787 xmm1 -= A.load(i,k) *
set( B(k,j) );
6790 C.store( i, j, xmm1 );
// Scalar clean-up for the remaining rows; only active when 'remainder' is true.
// Columns are limited to [UPP ? i : 0, LOW ? i+1 : N) and handled in pairs first.
6794 for( ; remainder && i<M; ++i )
6796 const size_t jend(
LOW ? i+1UL : N );
6797 size_t j(
UPP ? i : 0UL );
6799 for( ; (j+2UL) <= jend; j+=2UL )
6801 const size_t kbegin( ( IsLower_v<MT5> )
6802 ?( ( IsUpper_v<MT4> )
6803 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6804 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6805 :( IsUpper_v<MT4> ? i : 0UL ) );
6806 const size_t kend( ( IsUpper_v<MT5> )
6807 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6813 for(
size_t k=kbegin; k<kend; ++k ) {
6814 value1 -= A(i,k) * B(k,j );
6815 value2 -= A(i,k) * B(k,j+1UL);
6819 C(i,j+1UL) = value2;
// Single leftover column of the scalar clean-up.
6824 const size_t kbegin( ( IsLower_v<MT5> )
6825 ?( ( IsUpper_v<MT4> )
6826 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6827 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6828 :( IsUpper_v<MT4> ? i : 0UL ) );
6832 for(
size_t k=kbegin; k<K; ++k ) {
6833 value -= A(i,k) * B(k,j);
// Large-matrix subtraction assignment kernel, fallback overload.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false; it simply forwards
// C, A and B to selectDefaultSubAssignKernel().
6857 template<
typename MT3
6860 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6861 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6863 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction assignment kernel, vectorized overload.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true (the complement of the
// DisableIf_t overload of the same name).
6883 template<
typename MT3
6886 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6887 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction assignment kernel, fallback overload.
// Enabled when UseBlasKernel_v<MT3,MT4,MT5> is false; it forwards C, A and B to
// selectLargeSubAssignKernel().
6913 template<
typename MT3
6916 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6917 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6919 selectLargeSubAssignKernel( C, A, B );
6925#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction assignment kernel, enabled when UseBlasKernel_v<MT3,MT4,MT5> is true.
//
//   C  target matrix of the subtraction assignment
//   A  left-hand side operand
//   B  right-hand side operand
//
// ET is the element type of C and is used to build the scaling factors passed to BLAS.
6939 template<
typename MT3
6942 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6943 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6945 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary (serial evaluation), multiply it by A from the left
// via trmm() with factor ET(1), then subtract the temporary from C.
6947 if( IsTriangular_v<MT4> ) {
6948 ResultType_t<MT3> tmp(
serial( B ) );
6949 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6950 subAssign( C, tmp );
// Triangular B: same scheme with A copied and B applied from the right.
6952 else if( IsTriangular_v<MT5> ) {
6953 ResultType_t<MT3> tmp(
serial( A ) );
6954 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6955 subAssign( C, tmp );
// Presumably the general fallback: gemm() with factors ET(-1) and ET(1), i.e. the product is
// scaled by -1 and accumulated onto the existing C -- confirm the argument order of gemm().
6958 gemm( C, A, B, ET(-1), ET(1) );
// Schur product assignment of a transpose dense matrix / dense matrix multiplication
// expression (rhs) to the dense matrix lhs.
// The visible code delegates to schurAssign() on the dereferenced target with 'tmp';
// tmp presumably holds the evaluated product -- confirm.
6982 template<
typename MT
6984 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const TDMatDMatMultExpr& rhs )
6996 schurAssign( *lhs, tmp );
// Overload that participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably the SMP assignment to a dense matrix -- confirm the function name.
// An empty target (zero rows or columns) is handled first; a product with inner dimension
// zero (rhs.lhs_.columns() == 0) is handled as a separate case.
7029 template<
typename MT
7032 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7039 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
7042 else if( rhs.lhs_.columns() == 0UL ) {
// Overload that participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably the SMP assignment to a sparse matrix -- confirm the function name.
// The product is evaluated into a temporary whose type depends on the storage order flag SO
// (ResultType if SO is true, OppositeType otherwise). A ForwardFunctor instance 'fwd' is
// created as well; its use is presumably to forward the declared structure of the result.
7078 template<
typename MT
7081 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7085 using TmpType = If_t< SO, ResultType, OppositeType >;
7097 const ForwardFunctor fwd;
7099 const TmpType tmp( rhs );
// Overload that participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably the SMP addition assignment -- confirm the function name.
// Nothing needs to be done if the target is empty or the inner dimension of the product is zero.
7121 template<
typename MT
7124 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7131 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Overload that participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): presumably the SMP subtraction assignment -- confirm the function name.
// Nothing needs to be done if the target is empty or the inner dimension of the product is zero.
7170 template<
typename MT
7173 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
7180 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
7216 template<
typename MT
// Specialization of DMatScalarMultExpr for a scaled TDMatDMatMultExpr, i.e. an expression of
// the form (A * B) * scalar. The storage-order flag of the specialization is 'true'.
7276template<
typename MT1
7283class DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
7284 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
7285 ,
private Computation
// Shorthand types: MMM is the wrapped multiplication expression, RES its result type;
// RT*/ET*/CT* are the result, element and composite types of the two matrix operands.
7290 using MMM = TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
7292 using RES = ResultType_t<MMM>;
7293 using RT1 = ResultType_t<MT1>;
7294 using RT2 = ResultType_t<MT2>;
7295 using ET1 = ElementType_t<RT1>;
7296 using ET2 = ElementType_t<RT2>;
7297 using CT1 = CompositeType_t<MT1>;
7298 using CT2 = CompositeType_t<MT2>;
// An operand has to be evaluated into a temporary if it is a computation itself or
// requires an intermediate evaluation.
7303 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
7308 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure of the result derived from the SF/HF/LF/UF flags:
// symmetric or Hermitian combined with a triangular flag collapses to the opposite triangle
// as well (LOW and UPP both true), which presumably denotes a diagonal result.
7312 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
7313 static constexpr bool HERM = ( HF && !( LF || UF ) );
7314 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
7315 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// The template arguments are unused: evaluation is required as soon as either operand
// must be evaluated.
7323 template<
typename T1,
typename T2,
typename T3 >
7324 static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
// Conditions for dispatching to a BLAS kernel (T1 = target, T2/T3 = operands, T4 = scalar):
// no declared structure, contiguous storage with data access, no diagonal operand, SIMD
// enabled, identical BLAS compatible element types, and not a builtin element type combined
// with a complex scalar.
7331 template<
typename T1,
typename T2,
typename T3,
typename T4 >
7332 static constexpr bool UseBlasKernel_v =
7334 !SYM && !HERM && !LOW && !UPP &&
7335 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
7336 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
7337 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
7338 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
7339 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7340 IsBLASCompatible_v< ElementType_t<T1> > &&
7341 IsBLASCompatible_v< ElementType_t<T2> > &&
7342 IsBLASCompatible_v< ElementType_t<T3> > &&
7343 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
7344 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
7345 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Conditions for the vectorized default kernels: optimized kernels enabled, SIMD enabled for
// all three matrix types, and none of the excluded diagonal/storage-order combinations.
7352 template<
typename T1,
typename T2,
typename T3,
typename T4 >
7353 static constexpr bool UseVectorizedDefaultKernel_v =
7354 ( useOptimizedKernels &&
7355 !( IsDiagonal_v<T2> && IsDiagonal_v<T3> ) &&
7356 !( IsDiagonal_v<T2> && IsColumnMajorMatrix_v<T1> ) &&
7357 !( IsDiagonal_v<T3> && IsRowMajorMatrix_v<T1> ) &&
7358 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
7363 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
// Functor type selected according to the declared structure of the result (HERM first).
7371 using ForwardFunctor =
If_t< HERM
7387 using This = DMatScalarMultExpr<MMM,ST,true>;
7390 using BaseType = MatScalarMultExpr< DenseMatrix<This,true> >;
// Result type: MultTrait<RES,ST>, wrapped in the Decl*Trait matching the declared structure.
7394 , DeclHermTrait< MultTrait_t<RES,ST> >
7396 , DeclSymTrait< MultTrait_t<RES,ST> >
7399 , DeclDiagTrait< MultTrait_t<RES,ST> >
7400 , DeclLowTrait< MultTrait_t<RES,ST> > >
7402 , DeclUppTrait< MultTrait_t<RES,ST> >
7403 , MultTrait<RES,ST> > > > >::Type;
7408 using SIMDType = SIMDTrait_t<ElementType>;
7413 using LeftOperand =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Operand types used inside the kernels: an evaluated result type if evaluation is required,
// the composite type otherwise.
7419 using LT = If_t< evaluateLeft, const RT1, CT1 >;
7422 using RT = If_t< evaluateRight, const RT2, CT2 >;
// SIMD evaluation requires SIMD-enabled operands (not both diagonal) whose element types can
// be combined with the scalar type and support SIMD addition and multiplication.
7428 ( !( IsDiagonal_v<MT1> && IsDiagonal_v<MT2> ) &&
7429 MT1::simdEnabled && MT2::simdEnabled &&
7430 IsSIMDCombinable_v<ET1,ET2,ST> &&
7431 HasSIMDAdd_v<ET1,ET2> &&
7432 HasSIMDMult_v<ET1,ET2> );
// SMP assignment is possible only if neither operand needs evaluation and both are
// SMP-assignable themselves.
7436 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
// Checked element access: the column index j is validated against matrix_.columns() before
// the access is delegated to operator()(i,j).
// NOTE(review): presumably a matching row check precedes this and an exception is raised on
// failure -- confirm.
7482 if( j >=
matrix_.columns() ) {
7485 return (*
this)(i,j);
// Returns the number of rows of the expression as size_t
// (presumably forwarded from the wrapped multiplication expression -- confirm).
7494 inline size_t rows()
const {
// Returns the number of columns of the expression as size_t
// (presumably forwarded from the wrapped multiplication expression -- confirm).
7504 inline size_t columns()
const {
// Returns whether the expression can alias with the given address.
// The query is forwarded unchanged to the wrapped multiplication expression matrix_.
7535 template<
typename T >
7536 inline bool canAlias(
const T* alias )
const {
7537 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address.
// The query is forwarded unchanged to the wrapped multiplication expression matrix_.
7547 template<
typename T >
7548 inline bool isAliased(
const T* alias )
const {
7549 return matrix_.isAliased( alias );
// Size-based part of the SMP decision (NOTE(review): presumably the return expression of
// canSMPAssign() -- confirm): the BLAS path must be unavailable or the result smaller than
// TDMATDMATMULT_THRESHOLD, and the result must have at least SMP_TDMATDMATMULT_THRESHOLD
// elements.
7570 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
7572 (
rows() *
columns() < TDMATDMATMULT_THRESHOLD ) ) &&
7573 (
rows() *
columns() >= SMP_TDMATDMATMULT_THRESHOLD );
// Assignment of the scaled multiplication expression to a dense matrix
// (NOTE(review): function name presumed to be assign() -- confirm).
// The two operands of the wrapped product are extracted, an empty target and an inner
// dimension of zero are handled up front, and the actual work is dispatched to
// selectAssignKernel() together with the scalar factor.
7595 template<
typename MT
7604 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7605 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7607 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
7610 else if( left.columns() == 0UL ) {
7625 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the assignment of a scaled product to C.
//
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
//
// The small kernel is chosen if both operands are diagonal, if (outside debug mode) the
// extent along the storage order of C is at most SIMDSIZE*10, or if C has fewer than
// TDMATDMATMULT_THRESHOLD elements. Otherwise the BLAS kernel selection is used.
7640 template<
typename MT3
7644 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7646 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
7647 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
7648 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
7649 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
7650 selectSmallAssignKernel( C, A, B, scalar );
7652 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) assignment kernel for a row-major target C and two non-diagonal
// operands.
//
//   C       target matrix
//   A, B    left and right operand (M = A.rows(), N = B.columns(), K = A.columns())
//   scalar  scaling factor; presumably applied in the final loop over [jbegin,jend) of each
//           row -- confirm
//
// Each row i is built in three steps: the first contribution (k == kbegin) initializes the
// row, the remaining k values are accumulated, and the row is finalized. All index ranges are
// narrowed according to the triangular structure of A/B and the declared structure
// (SYM/HERM/LOW/UPP) of the result.
7670 template<
typename MT3
7674 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7675 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7677 const size_t M( A.rows() );
7678 const size_t N( B.columns() );
7679 const size_t K( A.columns() );
7683 for(
size_t i=0UL; i<M; ++i )
// Range of k for which A(i,k) can be non-zero.
7685 const size_t kbegin( ( IsUpper_v<MT4> )
7686 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7688 const size_t kend( ( IsLower_v<MT4> )
7689 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// A strictly triangular A can yield an empty k-range; the whole row is then handled by the
// loop over all columns below.
7693 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
7694 for(
size_t j=0UL; j<N; ++j ) {
// Column range touched by the first contribution k == kbegin.
7701 const size_t jbegin( ( IsUpper_v<MT5> )
7702 ?( ( IsStrictlyUpper_v<MT5> )
7703 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
7704 :( UPP ?
max(i,kbegin) : kbegin ) )
7705 :( UPP ? i : 0UL ) );
7706 const size_t jend( ( IsLower_v<MT5> )
7707 ?( ( IsStrictlyLower_v<MT5> )
7708 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
7709 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
7710 :( LOW ? i+1UL : N ) );
// Columns in front of jbegin.
7712 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7713 for(
size_t j=0UL; j<jbegin; ++j ) {
7717 else if( IsStrictlyUpper_v<MT5> ) {
// Initialization of the row with the first product.
7720 for(
size_t j=jbegin; j<jend; ++j ) {
7721 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns behind jend.
7723 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7724 for(
size_t j=jend; j<N; ++j ) {
7728 else if( IsStrictlyLower_v<MT5> ) {
7729 reset( C(i,N-1UL) );
// Accumulation of the remaining contributions.
7733 for(
size_t k=kbegin+1UL; k<kend; ++k )
7735 const size_t jbegin( ( IsUpper_v<MT5> )
7736 ?( ( IsStrictlyUpper_v<MT5> )
7737 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
7738 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
7739 :( SYM || HERM || UPP ? i : 0UL ) );
7740 const size_t jend( ( IsLower_v<MT5> )
7741 ?( ( IsStrictlyLower_v<MT5> )
7742 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
7743 :( LOW ?
min(i+1UL,k) : k ) )
7744 :( LOW ? i+1UL : N ) );
// With a declared structure the narrowed range may be inverted; skip this k in that case.
7746 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7749 for(
size_t j=jbegin; j<jend; ++j ) {
7750 C(i,j) += A(i,k) * B(k,j);
// For a lower B the element at jend receives its first contribution here, hence '=' not '+='.
7752 if( IsLower_v<MT5> ) {
7753 C(i,jend) = A(i,k) * B(k,jend);
// Final column range of row i.
7758 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7759 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
7760 :( SYM || HERM || UPP ? i : 0UL ) );
7761 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7762 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
7763 :( LOW ? i+1UL : N ) );
7765 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
7768 for(
size_t j=jbegin; j<jend; ++j ) {
// Mirroring step: every element below the diagonal is taken from its counterpart above the
// diagonal (conjugated for HERM); presumably executed only for SYM or HERM -- confirm.
7775 for(
size_t i=1UL; i<M; ++i ) {
7776 for(
size_t j=0UL; j<i; ++j ) {
7777 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default (non-vectorized) assignment kernel for a column-major target C and two non-diagonal
// operands. Column-wise counterpart of the row-major overload.
//
//   C       target matrix
//   A, B    left and right operand (M = A.rows(), N = B.columns(), K = A.columns())
//   scalar  scaling factor; presumably applied in the final loop over [ibegin,iend) of each
//           column -- confirm
//
// Each column j is initialized with the first contribution (k == kbegin), the remaining
// k values are accumulated, and the column is finalized. All index ranges are narrowed
// according to the triangular structure of A/B and the declared structure of the result.
7798 template<
typename MT3
7802 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7803 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7805 const size_t M( A.rows() );
7806 const size_t N( B.columns() );
7807 const size_t K( A.columns() );
7811 for(
size_t j=0UL; j<N; ++j )
// Range of k for which B(k,j) can be non-zero.
7813 const size_t kbegin( ( IsLower_v<MT5> )
7814 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7816 const size_t kend( ( IsUpper_v<MT5> )
7817 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// A strictly triangular B can yield an empty k-range; the whole column is then handled by the
// loop over all rows below.
7821 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
7822 for(
size_t i=0UL; i<M; ++i ) {
// Row range touched by the first contribution k == kbegin.
7829 const size_t ibegin( ( IsLower_v<MT4> )
7830 ?( ( IsStrictlyLower_v<MT4> )
7831 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
7832 :( LOW ?
max(j,kbegin) : kbegin ) )
7833 :( LOW ? j : 0UL ) );
7834 const size_t iend( ( IsUpper_v<MT4> )
7835 ?( ( IsStrictlyUpper_v<MT4> )
7836 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
7837 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
7838 :( UPP ? j+1UL : M ) );
// Rows in front of ibegin.
7840 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
7841 for(
size_t i=0UL; i<ibegin; ++i ) {
7845 else if( IsStrictlyLower_v<MT4> ) {
// Initialization of the column with the first product.
7848 for(
size_t i=ibegin; i<iend; ++i ) {
7849 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows behind iend.
7851 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
7852 for(
size_t i=iend; i<M; ++i ) {
7856 else if( IsStrictlyUpper_v<MT4> ) {
7857 reset( C(M-1UL,j) );
// Accumulation of the remaining contributions.
7861 for(
size_t k=kbegin+1UL; k<kend; ++k )
7863 const size_t ibegin( ( IsLower_v<MT4> )
7864 ?( ( IsStrictlyLower_v<MT4> )
7865 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
7866 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
7867 :( SYM || HERM || LOW ? j : 0UL ) );
7868 const size_t iend( ( IsUpper_v<MT4> )
7869 ?( ( IsStrictlyUpper_v<MT4> )
7870 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
7871 :( UPP ?
min(j+1UL,k) : k ) )
7872 :( UPP ? j+1UL : M ) );
// With a declared structure the narrowed range may be inverted; skip this k in that case.
7874 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7877 for(
size_t i=ibegin; i<iend; ++i ) {
7878 C(i,j) += A(i,k) * B(k,j);
// For an upper A the element at iend receives its first contribution here, hence '=' not '+='.
7880 if( IsUpper_v<MT4> ) {
7881 C(iend,j) = A(iend,k) * B(k,j);
// Final row range of column j.
7886 const size_t ibegin( ( ( IsLower_v<MT4> && IsLower_v<MT5> ) )
7887 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
7888 :( SYM || HERM || LOW ? j : 0UL ) );
7889 const size_t iend( ( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) )
7890 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
7891 :( UPP ? j+1UL : M ) );
7893 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
7896 for(
size_t i=ibegin; i<iend; ++i ) {
// Mirroring step: every element above the diagonal is taken from its counterpart below the
// diagonal (conjugated for HERM); presumably executed only for SYM or HERM -- confirm.
7903 for(
size_t j=1UL; j<N; ++j ) {
7904 for(
size_t i=0UL; i<j; ++i ) {
7905 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel for a row-major target C, a general A and a diagonal B.
//
//   C       target matrix
//   A       left operand (M = A.rows())
//   B       diagonal right operand (N = B.columns())
//   scalar  scaling factor
//
// With a diagonal B every result element reduces to C(i,j) = A(i,j) * B(j,j) * scalar.
// The iteration is blocked in both dimensions with block size BLOCK_SIZE.
7926 template<
typename MT3
7930 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7931 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7933 constexpr size_t block( BLOCK_SIZE );
7935 const size_t M( A.rows() );
7936 const size_t N( B.columns() );
7938 for(
size_t ii=0UL; ii<M; ii+=block ) {
7939 const size_t iend(
min( M, ii+block ) );
7940 for(
size_t jj=0UL; jj<N; jj+=block ) {
7941 const size_t jend(
min( N, jj+block ) );
7942 for(
size_t i=ii; i<iend; ++i )
// Part of the current column block in which A(i,j) can be non-zero: [jbegin,jpos).
7944 const size_t jbegin( ( IsUpper_v<MT4> )
7945 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
7947 const size_t jpos( ( IsLower_v<MT4> )
7948 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
// Leading columns of the block for an upper A (presumably reset to zero -- confirm).
7951 if( IsUpper_v<MT4> ) {
7952 for(
size_t j=jj; j<jbegin; ++j ) {
7956 for(
size_t j=jbegin; j<jpos; ++j ) {
7957 C(i,j) = A(i,j) * B(j,j) * scalar;
// Trailing columns of the block for a lower A (presumably reset to zero -- confirm).
7959 if( IsLower_v<MT4> ) {
7960 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel for a column-major target C, a general A and a diagonal B.
//
//   C       target matrix
//   A       left operand (M = A.rows())
//   B       diagonal right operand (N = B.columns())
//   scalar  scaling factor
//
// Column j of the result is column j of A scaled by B(j,j) and scalar; the traversal follows
// the columns of C without blocking.
7984 template<
typename MT3
7988 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7989 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7991 const size_t M( A.rows() );
7992 const size_t N( B.columns() );
7994 for(
size_t j=0UL; j<N; ++j )
// Row range of column j in which A(i,j) can be non-zero: [ibegin,iend).
7996 const size_t ibegin( ( IsLower_v<MT4> )
7997 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7999 const size_t iend( ( IsUpper_v<MT4> )
8000 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Leading rows for a lower A (presumably reset to zero -- confirm).
8004 if( IsLower_v<MT4> ) {
8005 for(
size_t i=0UL; i<ibegin; ++i ) {
8009 for(
size_t i=ibegin; i<iend; ++i ) {
8010 C(i,j) = A(i,j) * B(j,j) * scalar;
// Trailing rows for an upper A (presumably reset to zero -- confirm).
8012 if( IsUpper_v<MT4> ) {
8013 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for a row-major target C, a diagonal A and a general B.
//
//   C       target matrix
//   A       diagonal left operand (M = A.rows())
//   B       right operand (N = B.columns())
//   scalar  scaling factor
//
// Row i of the result is row i of B scaled by A(i,i) and scalar; the traversal follows the
// rows of C without blocking.
8035 template<
typename MT3
8039 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8040 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8042 const size_t M( A.rows() );
8043 const size_t N( B.columns() );
8045 for(
size_t i=0UL; i<M; ++i )
// Column range of row i in which B(i,j) can be non-zero: [jbegin,jend).
8047 const size_t jbegin( ( IsUpper_v<MT5> )
8048 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
8050 const size_t jend( ( IsLower_v<MT5> )
8051 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Leading columns for an upper B (presumably reset to zero -- confirm).
8055 if( IsUpper_v<MT5> ) {
8056 for(
size_t j=0UL; j<jbegin; ++j ) {
8060 for(
size_t j=jbegin; j<jend; ++j ) {
8061 C(i,j) = A(i,i) * B(i,j) * scalar;
// Trailing columns for a lower B (presumably reset to zero -- confirm).
8063 if( IsLower_v<MT5> ) {
8064 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for a column-major target C, a diagonal A and a general B.
//
//   C       target matrix
//   A       diagonal left operand (M = A.rows())
//   B       right operand (N = B.columns())
//   scalar  scaling factor
//
// With a diagonal A every result element reduces to C(i,j) = A(i,i) * B(i,j) * scalar.
// The iteration is blocked in both dimensions with block size BLOCK_SIZE.
8086 template<
typename MT3
8090 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8091 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8093 constexpr size_t block( BLOCK_SIZE );
8095 const size_t M( A.rows() );
8096 const size_t N( B.columns() );
8098 for(
size_t jj=0UL; jj<N; jj+=block ) {
8099 const size_t jend(
min( N, jj+block ) );
8100 for(
size_t ii=0UL; ii<M; ii+=block ) {
8101 const size_t iend(
min( M, ii+block ) );
8102 for(
size_t j=jj; j<jend; ++j )
// Part of the current row block in which B(i,j) can be non-zero: [ibegin,ipos).
8104 const size_t ibegin( ( IsLower_v<MT5> )
8105 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
8107 const size_t ipos( ( IsUpper_v<MT5> )
8108 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
// Leading rows of the block for a lower B (presumably reset to zero -- confirm).
8111 if( IsLower_v<MT5> ) {
8112 for(
size_t i=ii; i<ibegin; ++i ) {
8116 for(
size_t i=ibegin; i<ipos; ++i ) {
8117 C(i,j) = A(i,i) * B(i,j) * scalar;
// Trailing rows of the block for an upper B (presumably reset to zero -- confirm).
8119 if( IsUpper_v<MT5> ) {
8120 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel for two diagonal operands (any storage order of C).
// Only the diagonal of C is computed: C(i,i) = A(i,i) * B(i,i) * scalar for all rows of A.
// NOTE(review): the off-diagonal elements are presumably reset beforehand -- confirm.
8144 template<
typename MT3
8148 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8149 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8153 for(
size_t i=0UL; i<A.rows(); ++i ) {
8154 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel, fallback overload.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false; it forwards C, A, B
// and the scalar factor to selectDefaultAssignKernel().
8173 template<
typename MT3
8177 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8178 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8180 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel for a scaled product.
// Selected when the target MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
// Accumulates A(i,k) * B(k,j) over k in SIMD registers, multiplies the result by the
// scaling factor and stores it into C. Columns are processed in panels of several
// SIMD vectors (8, 5, 4, 3, 2, 1 vectors wide), followed by a scalar column loop.
//   C      : target matrix (written via C.store() and C(i,j))
//   A      : left operand (read element-wise)
//   B      : right operand (read via B.load() and element-wise in the scalar loop)
//   scalar : scaling factor applied to every computed element
8199 template<
typename MT3
8203 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8204 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// True unless both C and B are padded; gates the scalar column loop near the end.
8206 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
// M x N is the extent of the result, K is the inner (summation) dimension.
8208 const size_t M( A.rows() );
8209 const size_t N( B.columns() );
8210 const size_t K( A.columns() );
// SIMD value built from the scalar; every accumulator is multiplied by it before the store.
8217 const SIMDType factor(
set( scalar ) );
// Panel of eight SIMD vectors per row. Only entered for integral element types and
// only while none of SYM, HERM, LOW, UPP is set.
8221 if( IsIntegral_v<ElementType> )
8223 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*7UL) < jpos; j+=
SIMDSIZE*8UL ) {
8224 for(
size_t i=0UL; i<M; ++i )
// Narrow the k range using the operand structure:
// lower bound: k >= i (i+1 if strictly upper) for an upper A, k >= j for a lower B;
// upper bound: k < i+1 (i if strictly lower) for a lower A, k < j+SIMDSIZE*8 for an upper B.
8226 const size_t kbegin( ( IsUpper_v<MT4> )
8227 ?( ( IsLower_v<MT5> )
8228 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8229 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8230 :( IsLower_v<MT5> ? j : 0UL ) );
8231 const size_t kend( ( IsLower_v<MT4> )
8232 ?( ( IsUpper_v<MT5> )
8233 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
8234 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
8235 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// The eight accumulators are initialized from the first k and then updated for the remaining k.
8241 SIMDType a1(
set( A(i,k) ) );
8242 SIMDType xmm1( a1 * B.load(k,j ) );
8243 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8244 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8245 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
8246 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
8247 SIMDType xmm6( a1 * B.load(k,j+
SIMDSIZE*5UL) );
8248 SIMDType xmm7( a1 * B.load(k,j+
SIMDSIZE*6UL) );
8249 SIMDType xmm8( a1 * B.load(k,j+
SIMDSIZE*7UL) );
8251 for( ++k; k<kend; ++k ) {
8253 xmm1 += a1 * B.load(k,j );
8254 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8255 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8256 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8257 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
8258 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
8259 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
8260 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Scale the accumulated vectors and write the eight vectors of row i.
8263 C.store( i, j , xmm1 * factor );
8264 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8265 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8266 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
8267 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
8268 C.store( i, j+
SIMDSIZE*5UL, xmm6 * factor );
8269 C.store( i, j+
SIMDSIZE*6UL, xmm7 * factor );
8270 C.store( i, j+
SIMDSIZE*7UL, xmm8 * factor );
// NOTE(review): 'zero' is a default-constructed SIMDType that is stored instead of a
// computed value; presumably the branch for an empty k range — confirm against the guard.
8274 const SIMDType
zero;
8275 C.store( i, j ,
zero );
// Panel of five SIMD vectors per row; again only while none of SYM, HERM, LOW, UPP is set.
8288 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*4UL) < jpos; j+=
SIMDSIZE*5UL )
// Two rows (i, i+1) at a time: ten accumulators share the five loads from B.
8292 for( ; (i+2UL) <= M; i+=2UL )
8294 const size_t kbegin( ( IsUpper_v<MT4> )
8295 ?( ( IsLower_v<MT5> )
8296 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8297 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8298 :( IsLower_v<MT5> ? j : 0UL ) );
8299 const size_t kend( ( IsLower_v<MT4> )
8300 ?( ( IsUpper_v<MT5> )
8301 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
8302 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8303 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
8309 SIMDType a1(
set( A(i ,k) ) );
8310 SIMDType a2(
set( A(i+1UL,k) ) );
8311 SIMDType b1( B.load(k,j ) );
8312 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8313 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8314 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
8315 SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
8316 SIMDType xmm1 ( a1 * b1 );
8317 SIMDType xmm2 ( a1 * b2 );
8318 SIMDType xmm3 ( a1 * b3 );
8319 SIMDType xmm4 ( a1 * b4 );
8320 SIMDType xmm5 ( a1 * b5 );
8321 SIMDType xmm6 ( a2 * b1 );
8322 SIMDType xmm7 ( a2 * b2 );
8323 SIMDType xmm8 ( a2 * b3 );
8324 SIMDType xmm9 ( a2 * b4 );
8325 SIMDType xmm10( a2 * b5 );
8327 for( ++k; k<kend; ++k ) {
8328 a1 =
set( A(i ,k) );
8329 a2 =
set( A(i+1UL,k) );
8347 C.store( i , j , xmm1 * factor );
8348 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8349 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8350 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
8351 C.store( i , j+
SIMDSIZE*4UL, xmm5 * factor );
8352 C.store( i+1UL, j , xmm6 * factor );
8353 C.store( i+1UL, j+
SIMDSIZE , xmm7 * factor );
8354 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 * factor );
8355 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 * factor );
8356 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 * factor );
8360 const SIMDType
zero;
8361 C.store( i , j ,
zero );
8366 C.store( i+1UL, j ,
zero );
// Single-row variant of the five-vector panel (one a1, accumulators xmm1..xmm5).
// NOTE(review): presumably handles the row left over by the two-row loop — confirm.
8376 const size_t kbegin( ( IsUpper_v<MT4> )
8377 ?( ( IsLower_v<MT5> )
8378 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8379 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8380 :( IsLower_v<MT5> ? j : 0UL ) );
8381 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
8387 SIMDType a1(
set( A(i,k) ) );
8388 SIMDType xmm1( a1 * B.load(k,j ) );
8389 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8390 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8391 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
8392 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
8394 for( ++k; k<kend; ++k ) {
8396 xmm1 += a1 * B.load(k,j );
8397 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8398 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8399 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8400 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
8403 C.store( i, j , xmm1 * factor );
8404 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8405 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8406 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
8407 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
8411 const SIMDType
zero;
8412 C.store( i, j ,
zero );
// Panel of four SIMD vectors per row. When UPP is set, the row range of the panel
// is limited to min( j+SIMDSIZE*4, M ).
8423 const size_t iend( UPP ?
min(j+
SIMDSIZE*4UL,M) : M );
// Elements taken from the transposed position C(jj,i), conjugated when HERM is set.
8429 for(
size_t jj=j; jj<jjend; ++jj ) {
8430 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
8437 for(
size_t jj=j; jj<jjend; ++jj ) {
// Two rows (i, i+1) at a time: eight accumulators share the four loads from B.
8443 for( ; (i+2UL) <= iend; i+=2UL )
8445 const size_t kbegin( ( IsUpper_v<MT4> )
8446 ?( ( IsLower_v<MT5> )
8447 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8448 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8449 :( IsLower_v<MT5> ? j : 0UL ) );
8450 const size_t kend( ( IsLower_v<MT4> )
8451 ?( ( IsUpper_v<MT5> )
8452 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
8453 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8454 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
8460 SIMDType a1(
set( A(i ,k) ) );
8461 SIMDType a2(
set( A(i+1UL,k) ) );
8462 SIMDType b1( B.load(k,j ) );
8463 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8464 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8465 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
8466 SIMDType xmm1( a1 * b1 );
8467 SIMDType xmm2( a1 * b2 );
8468 SIMDType xmm3( a1 * b3 );
8469 SIMDType xmm4( a1 * b4 );
8470 SIMDType xmm5( a2 * b1 );
8471 SIMDType xmm6( a2 * b2 );
8472 SIMDType xmm7( a2 * b3 );
8473 SIMDType xmm8( a2 * b4 );
8475 for( ++k; k<kend; ++k ) {
8476 a1 =
set( A(i ,k) );
8477 a2 =
set( A(i+1UL,k) );
8492 C.store( i , j , xmm1 * factor );
8493 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8494 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8495 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
8496 C.store( i+1UL, j , xmm5 * factor );
8497 C.store( i+1UL, j+
SIMDSIZE , xmm6 * factor );
8498 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 * factor );
8499 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 * factor );
8503 const SIMDType
zero;
8504 C.store( i , j ,
zero );
8508 C.store( i+1UL, j ,
zero );
// Single-row variant of the four-vector panel (one a1, accumulators xmm1..xmm4).
8517 const size_t kbegin( ( IsUpper_v<MT4> )
8518 ?( ( IsLower_v<MT5> )
8519 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8520 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8521 :( IsLower_v<MT5> ? j : 0UL ) );
8522 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
8528 SIMDType a1(
set( A(i,k) ) );
8529 SIMDType xmm1( a1 * B.load(k,j ) );
8530 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8531 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8532 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
8534 for( ++k; k<kend; ++k ) {
8536 xmm1 += a1 * B.load(k,j );
8537 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8538 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8539 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8542 C.store( i, j , xmm1 * factor );
8543 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8544 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8545 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
8549 const SIMDType
zero;
8550 C.store( i, j ,
zero );
8562 for(
size_t jj=j; jj<jjend; ++jj ) {
// Panel of three SIMD vectors per row. When UPP is set, the row range of the panel
// is limited to min( j+SIMDSIZE*3, M ).
8571 const size_t iend( UPP ?
min(j+
SIMDSIZE*3UL,M) : M );
8577 for(
size_t jj=j; jj<jjend; ++jj ) {
8578 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
8585 for(
size_t jj=j; jj<jjend; ++jj ) {
// Two rows (i, i+1) at a time: six accumulators share the three loads from B.
8591 for( ; (i+2UL) <= iend; i+=2UL )
8593 const size_t kbegin( ( IsUpper_v<MT4> )
8594 ?( ( IsLower_v<MT5> )
8595 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8596 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8597 :( IsLower_v<MT5> ? j : 0UL ) );
8598 const size_t kend( ( IsLower_v<MT4> )
8599 ?( ( IsUpper_v<MT5> )
8600 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
8601 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8602 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
8608 SIMDType a1(
set( A(i ,k) ) );
8609 SIMDType a2(
set( A(i+1UL,k) ) );
8610 SIMDType b1( B.load(k,j ) );
8611 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8612 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8613 SIMDType xmm1( a1 * b1 );
8614 SIMDType xmm2( a1 * b2 );
8615 SIMDType xmm3( a1 * b3 );
8616 SIMDType xmm4( a2 * b1 );
8617 SIMDType xmm5( a2 * b2 );
8618 SIMDType xmm6( a2 * b3 );
8620 for( ++k; k<kend; ++k ) {
8621 a1 =
set( A(i ,k) );
8622 a2 =
set( A(i+1UL,k) );
8634 C.store( i , j , xmm1 * factor );
8635 C.store( i , j+
SIMDSIZE , xmm2 * factor );
8636 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
8637 C.store( i+1UL, j , xmm4 * factor );
8638 C.store( i+1UL, j+
SIMDSIZE , xmm5 * factor );
8639 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 * factor );
8643 const SIMDType
zero;
8644 C.store( i , j ,
zero );
8647 C.store( i+1UL, j ,
zero );
// Single-row variant of the three-vector panel (one a1, accumulators xmm1..xmm3).
8655 const size_t kbegin( ( IsUpper_v<MT4> )
8656 ?( ( IsLower_v<MT5> )
8657 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8658 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8659 :( IsLower_v<MT5> ? j : 0UL ) );
8660 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
8666 SIMDType a1(
set( A(i,k) ) );
8667 SIMDType xmm1( a1 * B.load(k,j ) );
8668 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8669 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8671 for( ++k; k<kend; ++k ) {
8673 xmm1 += a1 * B.load(k,j );
8674 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8675 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8678 C.store( i, j , xmm1 * factor );
8679 C.store( i, j+
SIMDSIZE , xmm2 * factor );
8680 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
8684 const SIMDType
zero;
8685 C.store( i, j ,
zero );
8696 for(
size_t jj=j; jj<jjend; ++jj ) {
// Panel of two SIMD vectors per row. When UPP is set, the row range of the panel
// is limited to min( j+SIMDSIZE*2, M ). Rows are handled 4, 3, 2 and 1 at a time.
8705 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
8711 for(
size_t jj=j; jj<jjend; ++jj ) {
8712 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
8719 for(
size_t jj=j; jj<jjend; ++jj ) {
// Four rows (i .. i+3) at a time: eight accumulators share the two loads from B.
8725 for( ; (i+4UL) <= iend; i+=4UL )
8727 const size_t kbegin( ( IsUpper_v<MT4> )
8728 ?( ( IsLower_v<MT5> )
8729 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8730 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8731 :( IsLower_v<MT5> ? j : 0UL ) );
8732 const size_t kend( ( IsLower_v<MT4> )
8733 ?( ( IsUpper_v<MT5> )
8734 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
8735 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8736 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8742 SIMDType a1(
set( A(i ,k) ) );
8743 SIMDType a2(
set( A(i+1UL,k) ) );
8744 SIMDType a3(
set( A(i+2UL,k) ) );
8745 SIMDType a4(
set( A(i+3UL,k) ) );
8746 SIMDType b1( B.load(k,j ) );
8747 SIMDType b2( B.load(k,j+
SIMDSIZE) );
8748 SIMDType xmm1( a1 * b1 );
8749 SIMDType xmm2( a1 * b2 );
8750 SIMDType xmm3( a2 * b1 );
8751 SIMDType xmm4( a2 * b2 );
8752 SIMDType xmm5( a3 * b1 );
8753 SIMDType xmm6( a3 * b2 );
8754 SIMDType xmm7( a4 * b1 );
8755 SIMDType xmm8( a4 * b2 );
8757 for( ++k; k<kend; ++k ) {
8758 a1 =
set( A(i ,k) );
8759 a2 =
set( A(i+1UL,k) );
8760 a3 =
set( A(i+2UL,k) );
8761 a4 =
set( A(i+3UL,k) );
8774 C.store( i , j , xmm1 * factor );
8775 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8776 C.store( i+1UL, j , xmm3 * factor );
8777 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8778 C.store( i+2UL, j , xmm5 * factor );
8779 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
8780 C.store( i+3UL, j , xmm7 * factor );
8781 C.store( i+3UL, j+
SIMDSIZE, xmm8 * factor );
8785 const SIMDType
zero;
8786 C.store( i , j ,
zero );
8788 C.store( i+1UL, j ,
zero );
8790 C.store( i+2UL, j ,
zero );
8792 C.store( i+3UL, j ,
zero );
// Three rows (i .. i+2) at a time: six accumulators share the two loads from B.
8797 for( ; (i+3UL) <= iend; i+=3UL )
8799 const size_t kbegin( ( IsUpper_v<MT4> )
8800 ?( ( IsLower_v<MT5> )
8801 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8802 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8803 :( IsLower_v<MT5> ? j : 0UL ) );
8804 const size_t kend( ( IsLower_v<MT4> )
8805 ?( ( IsUpper_v<MT5> )
8806 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
8807 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8808 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8814 SIMDType a1(
set( A(i ,k) ) );
8815 SIMDType a2(
set( A(i+1UL,k) ) );
8816 SIMDType a3(
set( A(i+2UL,k) ) );
8817 SIMDType b1( B.load(k,j ) );
8818 SIMDType b2( B.load(k,j+
SIMDSIZE) );
8819 SIMDType xmm1( a1 * b1 );
8820 SIMDType xmm2( a1 * b2 );
8821 SIMDType xmm3( a2 * b1 );
8822 SIMDType xmm4( a2 * b2 );
8823 SIMDType xmm5( a3 * b1 );
8824 SIMDType xmm6( a3 * b2 );
8826 for( ++k; k<kend; ++k ) {
8827 a1 =
set( A(i ,k) );
8828 a2 =
set( A(i+1UL,k) );
8829 a3 =
set( A(i+2UL,k) );
8840 C.store( i , j , xmm1 * factor );
8841 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8842 C.store( i+1UL, j , xmm3 * factor );
8843 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8844 C.store( i+2UL, j , xmm5 * factor );
8845 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
8849 const SIMDType
zero;
8850 C.store( i , j ,
zero );
8852 C.store( i+1UL, j ,
zero );
8854 C.store( i+2UL, j ,
zero );
// Two rows (i, i+1) at a time: four accumulators share the two loads from B.
8859 for( ; (i+2UL) <= iend; i+=2UL )
8861 const size_t kbegin( ( IsUpper_v<MT4> )
8862 ?( ( IsLower_v<MT5> )
8863 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8864 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8865 :( IsLower_v<MT5> ? j : 0UL ) );
8866 const size_t kend( ( IsLower_v<MT4> )
8867 ?( ( IsUpper_v<MT5> )
8868 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
8869 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8870 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8876 SIMDType a1(
set( A(i ,k) ) );
8877 SIMDType a2(
set( A(i+1UL,k) ) );
8878 SIMDType b1( B.load(k,j ) );
8879 SIMDType b2( B.load(k,j+
SIMDSIZE) );
8880 SIMDType xmm1( a1 * b1 );
8881 SIMDType xmm2( a1 * b2 );
8882 SIMDType xmm3( a2 * b1 );
8883 SIMDType xmm4( a2 * b2 );
8885 for( ++k; k<kend; ++k ) {
8886 a1 =
set( A(i ,k) );
8887 a2 =
set( A(i+1UL,k) );
8896 C.store( i , j , xmm1 * factor );
8897 C.store( i , j+
SIMDSIZE, xmm2 * factor );
8898 C.store( i+1UL, j , xmm3 * factor );
8899 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
8903 const SIMDType
zero;
8904 C.store( i , j ,
zero );
8906 C.store( i+1UL, j ,
zero );
// Single-row variant of the two-vector panel (one a1, accumulators xmm1 and xmm2).
8913 const size_t kbegin( ( IsUpper_v<MT4> )
8914 ?( ( IsLower_v<MT5> )
8915 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8916 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8917 :( IsLower_v<MT5> ? j : 0UL ) );
8918 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
8924 SIMDType a1(
set( A(i,k) ) );
8925 SIMDType xmm1( a1 * B.load(k,j ) );
8926 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE) );
8928 for( ++k; k<kend; ++k ) {
8930 xmm1 += a1 * B.load(k,j );
8934 C.store( i, j , xmm1 * factor );
8935 C.store( i, j+
SIMDSIZE, xmm2 * factor );
8939 const SIMDType
zero;
8940 C.store( i, j ,
zero );
8950 for(
size_t jj=j; jj<jjend; ++jj ) {
// Panel of a single SIMD vector per row (only B.load(k,j) is used below).
// As in the wider panels, elements are first taken from the transposed position
// C(jj,i), conjugated when HERM is set.
8965 for(
size_t jj=j; jj<jjend; ++jj ) {
8966 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
8973 for(
size_t jj=j; jj<jjend; ++jj ) {
// Four rows (i .. i+3) at a time, all multiplied with the same vector b1 of B.
8979 for( ; (i+4UL) <= iend; i+=4UL )
8981 const size_t kbegin( ( IsUpper_v<MT4> )
8982 ?( ( IsLower_v<MT5> )
8983 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8984 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8985 :( IsLower_v<MT5> ? j : 0UL ) );
8986 const size_t kend( ( IsLower_v<MT4> )
8987 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8994 SIMDType b1( B.load(k,j) );
8995 SIMDType xmm1(
set( A(i ,k) ) * b1 );
8996 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
8997 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
8998 SIMDType xmm4(
set( A(i+3UL,k) ) * b1 );
9000 for( ++k; k<kend; ++k ) {
9002 xmm1 +=
set( A(i ,k) ) * b1;
9003 xmm2 +=
set( A(i+1UL,k) ) * b1;
9004 xmm3 +=
set( A(i+2UL,k) ) * b1;
9005 xmm4 +=
set( A(i+3UL,k) ) * b1;
9008 C.store( i , j, xmm1 * factor );
9009 C.store( i+1UL, j, xmm2 * factor );
9010 C.store( i+2UL, j, xmm3 * factor );
9011 C.store( i+3UL, j, xmm4 * factor );
9015 const SIMDType
zero;
9016 C.store( i , j,
zero );
9017 C.store( i+1UL, j,
zero );
9018 C.store( i+2UL, j,
zero );
9019 C.store( i+3UL, j,
zero );
// Three rows (i .. i+2) at a time.
9023 for( ; (i+3UL) <= iend; i+=3UL )
9025 const size_t kbegin( ( IsUpper_v<MT4> )
9026 ?( ( IsLower_v<MT5> )
9027 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9028 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9029 :( IsLower_v<MT5> ? j : 0UL ) );
9030 const size_t kend( ( IsLower_v<MT4> )
9031 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
9038 SIMDType b1( B.load(k,j) );
9039 SIMDType xmm1(
set( A(i ,k) ) * b1 );
9040 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
9041 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
9043 for( ++k; k<kend; ++k ) {
9045 xmm1 +=
set( A(i ,k) ) * b1;
9046 xmm2 +=
set( A(i+1UL,k) ) * b1;
9047 xmm3 +=
set( A(i+2UL,k) ) * b1;
9050 C.store( i , j, xmm1 * factor );
9051 C.store( i+1UL, j, xmm2 * factor );
9052 C.store( i+2UL, j, xmm3 * factor );
9056 const SIMDType
zero;
9057 C.store( i , j,
zero );
9058 C.store( i+1UL, j,
zero );
9059 C.store( i+2UL, j,
zero );
// Two rows (i, i+1) at a time.
9063 for( ; (i+2UL) <= iend; i+=2UL )
9065 const size_t kbegin( ( IsUpper_v<MT4> )
9066 ?( ( IsLower_v<MT5> )
9067 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9068 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9069 :( IsLower_v<MT5> ? j : 0UL ) );
9070 const size_t kend( ( IsLower_v<MT4> )
9071 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
9078 SIMDType b1( B.load(k,j) );
9079 SIMDType xmm1(
set( A(i ,k) ) * b1 );
9080 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
9082 for( ++k; k<kend; ++k ) {
9084 xmm1 +=
set( A(i ,k) ) * b1;
9085 xmm2 +=
set( A(i+1UL,k) ) * b1;
9088 C.store( i , j, xmm1 * factor );
9089 C.store( i+1UL, j, xmm2 * factor );
9093 const SIMDType
zero;
9094 C.store( i , j,
zero );
9095 C.store( i+1UL, j,
zero );
// Single-row variant of the one-vector panel; the accumulation runs up to K.
9101 const size_t kbegin( ( IsUpper_v<MT4> )
9102 ?( ( IsLower_v<MT5> )
9103 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9104 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9105 :( IsLower_v<MT5> ? j : 0UL ) );
9111 SIMDType xmm1(
set( A(i,k) ) * B.load(k,j) );
9113 for( ++k; k<K; ++k ) {
9114 xmm1 +=
set( A(i,k) ) * B.load(k,j);
9117 C.store( i, j, xmm1 * factor );
9121 const SIMDType
zero;
9122 C.store( i, j,
zero );
9131 for(
size_t jj=j; jj<jjend; ++jj ) {
// Scalar column loop for the columns not covered by the SIMD panels; only active
// when 'remainder' is true. Elements are computed with plain A(i,k) * B(k,j) products
// and multiplied by 'scalar' directly.
9138 for( ; remainder && j<N; ++j )
9144 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Two rows (i, i+1) at a time, accumulated in value1 and value2.
9153 for( ; (i+2UL) <= M; i+=2UL )
9155 const size_t kbegin( ( IsUpper_v<MT4> )
9156 ?( ( IsLower_v<MT5> )
9157 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9158 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9159 :( IsLower_v<MT5> ? j : 0UL ) );
9160 const size_t kend( ( IsLower_v<MT4> )
9161 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
9171 for( ++k; k<kend; ++k ) {
9172 value1 += A(i ,k) * B(k,j);
9173 value2 += A(i+1UL,k) * B(k,j);
9176 C(i ,j) = value1 * scalar;
9177 C(i+1UL,j) = value2 * scalar;
9182 reset( C(i+1UL,j) );
// Single-row variant of the scalar loop; the accumulation runs up to K.
9188 const size_t kbegin( ( IsUpper_v<MT4> )
9189 ?( ( IsLower_v<MT5> )
9190 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
9191 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
9192 :( IsLower_v<MT5> ? j : 0UL ) );
9200 for( ++k; k<K; ++k ) {
9201 value += A(i,k) * B(k,j);
9204 C(i,j) = value * scalar;
9230 template<
typename MT3
9234 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9235 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9237 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
9239 const size_t M( A.rows() );
9240 const size_t N( B.columns() );
9241 const size_t K( A.columns() );
9248 const SIMDType factor(
set( scalar ) );
9252 if( IsIntegral_v<ElementType> )
9254 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*7UL) < ipos; i+=
SIMDSIZE*8UL ) {
9255 for(
size_t j=0UL; j<N; ++j )
9257 const size_t kbegin( ( IsLower_v<MT5> )
9258 ?( ( IsUpper_v<MT4> )
9259 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9260 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9261 :( IsUpper_v<MT4> ? i : 0UL ) );
9262 const size_t kend( ( IsUpper_v<MT5> )
9263 ?( ( IsLower_v<MT4> )
9264 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
9265 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
9266 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
9272 SIMDType b1(
set( B(k,j) ) );
9273 SIMDType xmm1( A.load(i ,k) * b1 );
9274 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
9275 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
9276 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
9277 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
9278 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,k) * b1 );
9279 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,k) * b1 );
9280 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,k) * b1 );
9282 for( ++k; k<kend; ++k ) {
9284 xmm1 += A.load(i ,k) * b1;
9285 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
9286 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
9287 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
9288 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
9289 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
9290 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
9291 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
9294 C.store( i , j, xmm1 * factor );
9295 C.store( i+
SIMDSIZE , j, xmm2 * factor );
9296 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
9297 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
9298 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
9299 C.store( i+
SIMDSIZE*5UL, j, xmm6 * factor );
9300 C.store( i+
SIMDSIZE*6UL, j, xmm7 * factor );
9301 C.store( i+
SIMDSIZE*7UL, j, xmm8 * factor );
9305 const SIMDType
zero;
9306 C.store( i , j,
zero );
9319 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*4UL) < ipos; i+=
SIMDSIZE*5UL )
9323 for( ; (j+2UL) <= N; j+=2UL )
9325 const size_t kbegin( ( IsLower_v<MT5> )
9326 ?( ( IsUpper_v<MT4> )
9327 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9328 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9329 :( IsUpper_v<MT4> ? i : 0UL ) );
9330 const size_t kend( ( IsUpper_v<MT5> )
9331 ?( ( IsLower_v<MT4> )
9332 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9333 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9334 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
9340 SIMDType a1( A.load(i ,k) );
9341 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
9342 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
9343 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
9344 SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
9345 SIMDType b1(
set( B(k,j ) ) );
9346 SIMDType b2(
set( B(k,j+1UL) ) );
9347 SIMDType xmm1 ( a1 * b1 );
9348 SIMDType xmm2 ( a2 * b1 );
9349 SIMDType xmm3 ( a3 * b1 );
9350 SIMDType xmm4 ( a4 * b1 );
9351 SIMDType xmm5 ( a5 * b1 );
9352 SIMDType xmm6 ( a1 * b2 );
9353 SIMDType xmm7 ( a2 * b2 );
9354 SIMDType xmm8 ( a3 * b2 );
9355 SIMDType xmm9 ( a4 * b2 );
9356 SIMDType xmm10( a5 * b2 );
9358 for( ++k; k<kend; ++k ) {
9364 b1 =
set( B(k,j ) );
9365 b2 =
set( B(k,j+1UL) );
9378 C.store( i , j , xmm1 * factor );
9379 C.store( i+
SIMDSIZE , j , xmm2 * factor );
9380 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
9381 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
9382 C.store( i+
SIMDSIZE*4UL, j , xmm5 * factor );
9383 C.store( i , j+1UL, xmm6 * factor );
9384 C.store( i+
SIMDSIZE , j+1UL, xmm7 * factor );
9385 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 * factor );
9386 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 * factor );
9387 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 * factor );
9391 const SIMDType
zero;
9392 C.store( i , j ,
zero );
9397 C.store( i , j+1UL,
zero );
9407 const size_t kbegin( ( IsLower_v<MT5> )
9408 ?( ( IsUpper_v<MT4> )
9409 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9410 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9411 :( IsUpper_v<MT4> ? i : 0UL ) );
9412 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
9418 SIMDType b1(
set( B(k,j) ) );
9419 SIMDType xmm1( A.load(i ,k) * b1 );
9420 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
9421 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
9422 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
9423 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
9425 for( ++k; k<kend; ++k ) {
9427 xmm1 += A.load(i ,k) * b1;
9428 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
9429 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
9430 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
9431 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
9434 C.store( i , j, xmm1 * factor );
9435 C.store( i+
SIMDSIZE , j, xmm2 * factor );
9436 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
9437 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
9438 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
9442 const SIMDType
zero;
9443 C.store( i , j,
zero );
9454 const size_t jend( LOW ?
min(i+
SIMDSIZE*4UL,N) : N );
9460 for(
size_t ii=i; ii<iiend; ++ii ) {
9461 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
9468 for(
size_t ii=i; ii<iiend; ++ii ) {
9474 for( ; (j+2UL) <= jend; j+=2UL )
9476 const size_t kbegin( ( IsLower_v<MT5> )
9477 ?( ( IsUpper_v<MT4> )
9478 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9479 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9480 :( IsUpper_v<MT4> ? i : 0UL ) );
9481 const size_t kend( ( IsUpper_v<MT5> )
9482 ?( ( IsLower_v<MT4> )
9483 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9484 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9485 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
9491 SIMDType a1( A.load(i ,k) );
9492 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
9493 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
9494 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
9495 SIMDType b1(
set( B(k,j ) ) );
9496 SIMDType b2(
set( B(k,j+1UL) ) );
9497 SIMDType xmm1( a1 * b1 );
9498 SIMDType xmm2( a2 * b1 );
9499 SIMDType xmm3( a3 * b1 );
9500 SIMDType xmm4( a4 * b1 );
9501 SIMDType xmm5( a1 * b2 );
9502 SIMDType xmm6( a2 * b2 );
9503 SIMDType xmm7( a3 * b2 );
9504 SIMDType xmm8( a4 * b2 );
9506 for( ++k; k<kend; ++k ) {
9511 b1 =
set( B(k,j ) );
9512 b2 =
set( B(k,j+1UL) );
9523 C.store( i , j , xmm1 * factor );
9524 C.store( i+
SIMDSIZE , j , xmm2 * factor );
9525 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
9526 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
9527 C.store( i , j+1UL, xmm5 * factor );
9528 C.store( i+
SIMDSIZE , j+1UL, xmm6 * factor );
9529 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 * factor );
9530 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 * factor );
9534 const SIMDType
zero;
9535 C.store( i , j ,
zero );
9539 C.store( i , j+1UL,
zero );
9548 const size_t kbegin( ( IsLower_v<MT5> )
9549 ?( ( IsUpper_v<MT4> )
9550 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9551 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9552 :( IsUpper_v<MT4> ? i : 0UL ) );
9553 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
9559 SIMDType b1(
set( B(k,j) ) );
9560 SIMDType xmm1( A.load(i ,k) * b1 );
9561 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
9562 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
9563 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
9565 for( ++k; k<kend; ++k ) {
9567 xmm1 += A.load(i ,k) * b1;
9568 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
9569 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
9570 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
9573 C.store( i , j, xmm1 * factor );
9574 C.store( i+
SIMDSIZE , j, xmm2 * factor );
9575 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
9576 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
9580 const SIMDType
zero;
9581 C.store( i , j,
zero );
9593 for(
size_t ii=i; ii<iiend; ++ii ) {
9602 const size_t jend( LOW ?
min(i+
SIMDSIZE*3UL,N) : N );
9608 for(
size_t ii=i; ii<iiend; ++ii ) {
9609 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
9616 for(
size_t ii=i; ii<iiend; ++ii ) {
9622 for( ; (j+2UL) <= jend; j+=2UL )
9624 const size_t kbegin( ( IsLower_v<MT5> )
9625 ?( ( IsUpper_v<MT4> )
9626 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9627 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9628 :( IsUpper_v<MT4> ? i : 0UL ) );
9629 const size_t kend( ( IsUpper_v<MT5> )
9630 ?( ( IsLower_v<MT4> )
9631 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9632 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9633 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
9639 SIMDType a1( A.load(i ,k) );
9640 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
9641 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
9642 SIMDType b1(
set( B(k,j ) ) );
9643 SIMDType b2(
set( B(k,j+1UL) ) );
9644 SIMDType xmm1( a1 * b1 );
9645 SIMDType xmm2( a2 * b1 );
9646 SIMDType xmm3( a3 * b1 );
9647 SIMDType xmm4( a1 * b2 );
9648 SIMDType xmm5( a2 * b2 );
9649 SIMDType xmm6( a3 * b2 );
9651 for( ++k; k<kend; ++k ) {
9655 b1 =
set( B(k,j ) );
9656 b2 =
set( B(k,j+1UL) );
9665 C.store( i , j , xmm1 * factor );
9666 C.store( i+
SIMDSIZE , j , xmm2 * factor );
9667 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
9668 C.store( i , j+1UL, xmm4 * factor );
9669 C.store( i+
SIMDSIZE , j+1UL, xmm5 * factor );
9670 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 * factor );
9674 const SIMDType
zero;
9675 C.store( i , j ,
zero );
9678 C.store( i , j+1UL,
zero );
9686 const size_t kbegin( ( IsLower_v<MT5> )
9687 ?( ( IsUpper_v<MT4> )
9688 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9689 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9690 :( IsUpper_v<MT4> ? i : 0UL ) );
9691 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
9697 SIMDType b1(
set( B(k,j) ) );
9698 SIMDType xmm1( A.load(i ,k) * b1 );
9699 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
9700 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
9702 for( ++k; k<kend; ++k ) {
9704 xmm1 += A.load(i ,k) * b1;
9705 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
9706 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
9709 C.store( i , j, xmm1 * factor );
9710 C.store( i+
SIMDSIZE , j, xmm2 * factor );
9711 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
9715 const SIMDType
zero;
9716 C.store( i , j,
zero );
9727 for(
size_t ii=i; ii<iiend; ++ii ) {
9736 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
9742 for(
size_t ii=i; ii<iiend; ++ii ) {
9743 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
9750 for(
size_t ii=i; ii<iiend; ++ii ) {
9756 for( ; (j+4UL) <= jend; j+=4UL )
9758 const size_t kbegin( ( IsLower_v<MT5> )
9759 ?( ( IsUpper_v<MT4> )
9760 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9761 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9762 :( IsUpper_v<MT4> ? i : 0UL ) );
9763 const size_t kend( ( IsUpper_v<MT5> )
9764 ?( ( IsLower_v<MT4> )
9765 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
9766 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
9767 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
9773 SIMDType a1( A.load(i ,k) );
9774 SIMDType a2( A.load(i+
SIMDSIZE,k) );
9775 SIMDType b1(
set( B(k,j ) ) );
9776 SIMDType b2(
set( B(k,j+1UL) ) );
9777 SIMDType b3(
set( B(k,j+2UL) ) );
9778 SIMDType b4(
set( B(k,j+3UL) ) );
9779 SIMDType xmm1( a1 * b1 );
9780 SIMDType xmm2( a2 * b1 );
9781 SIMDType xmm3( a1 * b2 );
9782 SIMDType xmm4( a2 * b2 );
9783 SIMDType xmm5( a1 * b3 );
9784 SIMDType xmm6( a2 * b3 );
9785 SIMDType xmm7( a1 * b4 );
9786 SIMDType xmm8( a2 * b4 );
9788 for( ++k; k<kend; ++k ) {
9791 b1 =
set( B(k,j ) );
9792 b2 =
set( B(k,j+1UL) );
9793 b3 =
set( B(k,j+2UL) );
9794 b4 =
set( B(k,j+3UL) );
9805 C.store( i , j , xmm1 * factor );
9806 C.store( i+
SIMDSIZE, j , xmm2 * factor );
9807 C.store( i , j+1UL, xmm3 * factor );
9808 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
9809 C.store( i , j+2UL, xmm5 * factor );
9810 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
9811 C.store( i , j+3UL, xmm7 * factor );
9812 C.store( i+
SIMDSIZE, j+3UL, xmm8 * factor );
9816 const SIMDType
zero;
9817 C.store( i , j ,
zero );
9819 C.store( i , j+1UL,
zero );
9821 C.store( i , j+2UL,
zero );
9823 C.store( i , j+3UL,
zero );
9828 for( ; (j+3UL) <= jend; j+=3UL )
9830 const size_t kbegin( ( IsLower_v<MT5> )
9831 ?( ( IsUpper_v<MT4> )
9832 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9833 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9834 :( IsUpper_v<MT4> ? i : 0UL ) );
9835 const size_t kend( ( IsUpper_v<MT5> )
9836 ?( ( IsLower_v<MT4> )
9837 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
9838 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
9839 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
9845 SIMDType a1( A.load(i ,k) );
9846 SIMDType a2( A.load(i+
SIMDSIZE,k) );
9847 SIMDType b1(
set( B(k,j ) ) );
9848 SIMDType b2(
set( B(k,j+1UL) ) );
9849 SIMDType b3(
set( B(k,j+2UL) ) );
9850 SIMDType xmm1( a1 * b1 );
9851 SIMDType xmm2( a2 * b1 );
9852 SIMDType xmm3( a1 * b2 );
9853 SIMDType xmm4( a2 * b2 );
9854 SIMDType xmm5( a1 * b3 );
9855 SIMDType xmm6( a2 * b3 );
9857 for( ++k; k<kend; ++k ) {
9860 b1 =
set( B(k,j ) );
9861 b2 =
set( B(k,j+1UL) );
9862 b3 =
set( B(k,j+2UL) );
9871 C.store( i , j , xmm1 * factor );
9872 C.store( i+
SIMDSIZE, j , xmm2 * factor );
9873 C.store( i , j+1UL, xmm3 * factor );
9874 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
9875 C.store( i , j+2UL, xmm5 * factor );
9876 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
9880 const SIMDType
zero;
9881 C.store( i , j ,
zero );
9883 C.store( i , j+1UL,
zero );
9885 C.store( i , j+2UL,
zero );
9890 for( ; (j+2UL) <= jend; j+=2UL )
9892 const size_t kbegin( ( IsLower_v<MT5> )
9893 ?( ( IsUpper_v<MT4> )
9894 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9895 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9896 :( IsUpper_v<MT4> ? i : 0UL ) );
9897 const size_t kend( ( IsUpper_v<MT5> )
9898 ?( ( IsLower_v<MT4> )
9899 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
9900 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
9901 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
9907 SIMDType a1( A.load(i ,k) );
9908 SIMDType a2( A.load(i+
SIMDSIZE,k) );
9909 SIMDType b1(
set( B(k,j ) ) );
9910 SIMDType b2(
set( B(k,j+1UL) ) );
9911 SIMDType xmm1( a1 * b1 );
9912 SIMDType xmm2( a2 * b1 );
9913 SIMDType xmm3( a1 * b2 );
9914 SIMDType xmm4( a2 * b2 );
9916 for( ++k; k<kend; ++k ) {
9919 b1 =
set( B(k,j ) );
9920 b2 =
set( B(k,j+1UL) );
9927 C.store( i , j , xmm1 * factor );
9928 C.store( i+
SIMDSIZE, j , xmm2 * factor );
9929 C.store( i , j+1UL, xmm3 * factor );
9930 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
9934 const SIMDType
zero;
9935 C.store( i , j ,
zero );
9937 C.store( i , j+1UL,
zero );
9944 const size_t kbegin( ( IsLower_v<MT5> )
9945 ?( ( IsUpper_v<MT4> )
9946 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9947 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9948 :( IsUpper_v<MT4> ? i : 0UL ) );
9949 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
9955 SIMDType b1(
set( B(k,j) ) );
9956 SIMDType xmm1( A.load(i ,k) * b1 );
9957 SIMDType xmm2( A.load(i+
SIMDSIZE,k) * b1 );
9959 for( ++k; k<kend; ++k ) {
9961 xmm1 += A.load(i ,k) * b1;
9965 C.store( i , j, xmm1 * factor );
9966 C.store( i+
SIMDSIZE, j, xmm2 * factor );
9970 const SIMDType
zero;
9971 C.store( i , j,
zero );
9981 for(
size_t ii=i; ii<iiend; ++ii ) {
9996 for(
size_t ii=i; ii<iiend; ++ii ) {
9997 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
10003 for( ; j<i; ++j ) {
10004 for(
size_t ii=i; ii<iiend; ++ii ) {
10010 for( ; (j+4UL) <= jend; j+=4UL )
10012 const size_t kbegin( ( IsLower_v<MT5> )
10013 ?( ( IsUpper_v<MT4> )
10014 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10015 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10016 :( IsUpper_v<MT4> ? i : 0UL ) );
10017 const size_t kend( ( IsUpper_v<MT5> )
10018 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
10021 size_t k( kbegin );
10025 SIMDType a1( A.load(i,k) );
10026 SIMDType xmm1( a1 *
set( B(k,j ) ) );
10027 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
10028 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
10029 SIMDType xmm4( a1 *
set( B(k,j+3UL) ) );
10031 for( ++k; k<kend; ++k ) {
10033 xmm1 += a1 *
set( B(k,j ) );
10034 xmm2 += a1 *
set( B(k,j+1UL) );
10035 xmm3 += a1 *
set( B(k,j+2UL) );
10036 xmm4 += a1 *
set( B(k,j+3UL) );
10039 C.store( i, j , xmm1 * factor );
10040 C.store( i, j+1UL, xmm2 * factor );
10041 C.store( i, j+2UL, xmm3 * factor );
10042 C.store( i, j+3UL, xmm4 * factor );
10046 const SIMDType
zero;
10047 C.store( i, j ,
zero );
10048 C.store( i, j+1UL,
zero );
10049 C.store( i, j+2UL,
zero );
10050 C.store( i, j+3UL,
zero );
10054 for( ; (j+3UL) <= jend; j+=3UL )
10056 const size_t kbegin( ( IsLower_v<MT5> )
10057 ?( ( IsUpper_v<MT4> )
10058 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10059 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10060 :( IsUpper_v<MT4> ? i : 0UL ) );
10061 const size_t kend( ( IsUpper_v<MT5> )
10062 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
10065 size_t k( kbegin );
10069 SIMDType a1( A.load(i,k) );
10070 SIMDType xmm1( a1 *
set( B(k,j ) ) );
10071 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
10072 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
10074 for( ++k; k<kend; ++k ) {
10076 xmm1 += a1 *
set( B(k,j ) );
10077 xmm2 += a1 *
set( B(k,j+1UL) );
10078 xmm3 += a1 *
set( B(k,j+2UL) );
10081 C.store( i, j , xmm1 * factor );
10082 C.store( i, j+1UL, xmm2 * factor );
10083 C.store( i, j+2UL, xmm3 * factor );
10087 const SIMDType
zero;
10088 C.store( i, j ,
zero );
10089 C.store( i, j+1UL,
zero );
10090 C.store( i, j+2UL,
zero );
10094 for( ; (j+2UL) <= jend; j+=2UL )
10096 const size_t kbegin( ( IsLower_v<MT5> )
10097 ?( ( IsUpper_v<MT4> )
10098 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10099 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10100 :( IsUpper_v<MT4> ? i : 0UL ) );
10101 const size_t kend( ( IsUpper_v<MT5> )
10102 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10105 size_t k( kbegin );
10109 SIMDType a1( A.load(i,k) );
10110 SIMDType xmm1( a1 *
set( B(k,j ) ) );
10111 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
10113 for( ++k; k<kend; ++k ) {
10115 xmm1 += a1 *
set( B(k,j ) );
10116 xmm2 += a1 *
set( B(k,j+1UL) );
10119 C.store( i, j , xmm1 * factor );
10120 C.store( i, j+1UL, xmm2 * factor );
10124 const SIMDType
zero;
10125 C.store( i, j ,
zero );
10126 C.store( i, j+1UL,
zero );
10132 const size_t kbegin( ( IsLower_v<MT5> )
10133 ?( ( IsUpper_v<MT4> )
10134 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10135 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10136 :( IsUpper_v<MT4> ? i : 0UL ) );
10138 size_t k( kbegin );
10142 SIMDType xmm1( A.load(i,k) *
set( B(k,j) ) );
10144 for( ++k; k<K; ++k ) {
10145 xmm1 += A.load(i,k) *
set( B(k,j) );
10148 C.store( i, j, xmm1 * factor );
10152 const SIMDType
zero;
10153 C.store( i, j,
zero );
10161 for( ; j<N; ++j ) {
10162 for(
size_t ii=i; ii<iiend; ++ii ) {
10169 for( ; remainder && i<M; ++i )
10173 if( SYM || HERM ) {
10174 for( ; j<i; ++j ) {
10175 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
10179 for( ; j<i; ++j ) {
10184 for( ; (j+2UL) <= N; j+=2UL )
10186 const size_t kbegin( ( IsLower_v<MT5> )
10187 ?( ( IsUpper_v<MT4> )
10188 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10189 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10190 :( IsUpper_v<MT4> ? i : 0UL ) );
10191 const size_t kend( ( IsUpper_v<MT5> )
10192 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
10195 size_t k( kbegin );
10202 for( ++k; k<kend; ++k ) {
10203 value1 += A(i,k) * B(k,j );
10204 value2 += A(i,k) * B(k,j+1UL);
10207 C(i,j ) = value1 * scalar;
10208 C(i,j+1UL) = value2 * scalar;
10213 reset( C(i,j+1UL) );
10219 const size_t kbegin( ( IsLower_v<MT5> )
10220 ?( ( IsUpper_v<MT4> )
10221 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
10222 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
10223 :( IsUpper_v<MT4> ? i : 0UL ) );
10225 size_t k( kbegin );
10231 for( ++k; k<K; ++k ) {
10232 value += A(i,k) * B(k,j);
10235 C(i,j) = value * scalar;
// Fallback overload of selectLargeAssignKernel.
// Selected through DisableIf_t when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false;
// it forwards C, A, B and the scalar unchanged to selectDefaultAssignKernel.
10260 template<
typename MT3
10264 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10265 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10267 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized overload of selectLargeAssignKernel.
// Selected through EnableIf_t when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// The visible calls are smmm, hmmm, lmmm, ummm and mmm, all taking C, A, B and the scalar;
// lmmm, ummm and mmm additionally receive ST2(0) as a fifth argument.
// NOTE(review): the conditions choosing between these calls are not visible in this excerpt
// (presumably the SYM / HERM / LOW / UPP flags of the expression -- confirm).
10286 template<
typename MT3
10290 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10291 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10294 smmm( C, A, B, scalar );
10296 hmmm( C, A, B, scalar );
10298 lmmm( C, A, B, scalar, ST2(0) );
10300 ummm( C, A, B, scalar, ST2(0) );
10302 mmm( C, A, B, scalar, ST2(0) );
// Fallback overload of selectBlasAssignKernel.
// Selected through DisableIf_t when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false;
// it forwards all arguments to selectLargeAssignKernel.
10320 template<
typename MT3
10324 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10325 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
10327 selectLargeAssignKernel( C, A, B, scalar );
10332#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based overload of selectBlasAssignKernel.
// Selected through EnableIf_t when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// The scalar is converted to the element type of C (ET) before being passed on.
// NOTE(review): the statements between each condition and its trmm() call are not visible
// in this excerpt.
10346 template<
typename MT3
10350 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10351 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
10353 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm with A on the left side, lower or upper according to IsLower_v<MT4>.
10355 if( IsTriangular_v<MT4> ) {
10357 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular right operand: trmm with B on the right side, lower or upper according to IsLower_v<MT5>.
10359 else if( IsTriangular_v<MT5> ) {
10361 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Otherwise: gemm with ET(scalar) and ET(0) as the two scalar arguments.
10364 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment of the scaled multiplication expression to a sparse matrix.
// The expression is first evaluated via serial( rhs ) into a temporary whose type is
// ResultType or OppositeType depending on SO, and the temporary is then assigned to *lhs
// after being passed through a ForwardFunctor.
10382 template<
typename MT
10384 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Temporary type chosen by the storage-order flag SO of the target.
10388 using TmpType = If_t< SO, ResultType, OppositeType >;
10400 const ForwardFunctor fwd;
10402 const TmpType tmp(
serial( rhs ) );
10403 assign( *lhs, fwd( tmp ) );
// Addition assignment of the scaled multiplication expression to a dense matrix.
// Fetches the left and right operands of the wrapped multiplication (rhs.matrix_) and
// finally calls selectAddAssignKernel with *lhs, A, B and rhs.scalar_.
// NOTE(review): the definitions of A and B and the body of the empty-size check are not
// visible in this excerpt.
10419 template<
typename MT
10421 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
10428 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
10429 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: target has no rows or columns, or the inner dimension is zero
// (presumably an early return -- confirm).
10431 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
10445 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled addition assignment C += s*A*B.
// selectSmallAddAssignKernel is used when any of the following holds:
//   - both A and B are diagonal,
//   - not in debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//   - not in debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//   - C has fewer than TDMATDMATMULT_THRESHOLD elements.
// The other visible call is selectBlasAddAssignKernel (the `else` keyword itself is not
// visible in this excerpt).
10460 template<
typename MT3
10464 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10466 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
10467 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
10468 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
10469 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
10470 selectSmallAddAssignKernel( C, A, B, scalar );
10472 selectBlasAddAssignKernel( C, A, B, scalar );
// Default add-assign kernel for the general case: enabled when neither A nor B is diagonal.
// Adds a temporary `tmp` to C via addAssign.
// NOTE(review): the construction of `tmp` is not visible in this excerpt (presumably the
// evaluated scaled product of A and B -- confirm).
10490 template<
typename MT3
10494 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10495 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
10498 addAssign( C, tmp );
// Default add-assign kernel for a row-major C, non-diagonal A and diagonal B.
// Since only B(j,j) is used, each update is C(i,j) += A(i,j) * B(j,j) * scalar.
// The (i,j) index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE.
// NOTE(review): the non-triangular alternatives of jbegin / jpos are not visible here.
10516 template<
typename MT3
10520 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10521 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
10523 constexpr size_t block( BLOCK_SIZE );
10525 const size_t M( A.rows() );
10526 const size_t N( B.columns() );
// Tile loops: rows [ii, iend) and columns [jj, jend), both clamped to the matrix size.
10528 for(
size_t ii=0UL; ii<M; ii+=block ) {
10529 const size_t iend(
min( M, ii+block ) );
10530 for(
size_t jj=0UL; jj<N; jj+=block ) {
10531 const size_t jend(
min( N, jj+block ) );
10532 for(
size_t i=ii; i<iend; ++i )
// For an upper A the column range starts at the diagonal (one past it if strictly upper),
// but never before the tile start jj.
10534 const size_t jbegin( ( IsUpper_v<MT4> )
10535 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
// For a lower A the column range ends at the diagonal (excluding it if strictly lower),
// but never past the tile end jend.
10537 const size_t jpos( ( IsLower_v<MT4> )
10538 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
10541 for(
size_t j=jbegin; j<jpos; ++j ) {
10542 C(i,j) += A(i,j) * B(j,j) * scalar;
// Default add-assign kernel for a column-major C, non-diagonal A and diagonal B.
// Each update is C(i,j) += A(i,j) * B(j,j) * scalar, processed column by column with the
// row loop unrolled by two.
// NOTE(review): the non-triangular alternatives of ibegin / iend are not visible here.
10564 template<
typename MT3
10568 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10569 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
10571 const size_t M( A.rows() );
10572 const size_t N( B.columns() );
10574 for(
size_t j=0UL; j<N; ++j )
// Row range of column j restricted by the lower / upper structure of A.
10576 const size_t ibegin( ( IsLower_v<MT4> )
10577 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
10579 const size_t iend( ( IsUpper_v<MT4> )
10580 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of [ibegin, iend) that can be handled two rows at a time.
10584 const size_t inum( iend - ibegin );
10585 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
10588 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
10589 C(i ,j) += A(i ,j) * B(j,j) * scalar;
10590 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Single leftover row when the row count is odd.
10592 if( ipos < iend ) {
10593 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default add-assign kernel for a row-major C, diagonal A and non-diagonal B.
// Since only A(i,i) is used, each update is C(i,j) += A(i,i) * B(i,j) * scalar, processed
// row by row with the column loop unrolled by two.
// NOTE(review): the non-triangular alternatives of jbegin / jend are not visible here.
10613 template<
typename MT3
10617 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10618 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
10620 const size_t M( A.rows() );
10621 const size_t N( B.columns() );
10623 for(
size_t i=0UL; i<M; ++i )
// Column range of row i restricted by the upper / lower structure of B.
10625 const size_t jbegin( ( IsUpper_v<MT5> )
10626 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
10628 const size_t jend( ( IsLower_v<MT5> )
10629 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part of [jbegin, jend) that can be handled two columns at a time.
10633 const size_t jnum( jend - jbegin );
10634 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
10637 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
10638 C(i,j ) += A(i,i) * B(i,j ) * scalar;
10639 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Single leftover column when the column count is odd.
10641 if( jpos < jend ) {
10642 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default add-assign kernel for a column-major C, diagonal A and non-diagonal B.
// Each update is C(i,j) += A(i,i) * B(i,j) * scalar.
// The (i,j) index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE, columns outermost.
// NOTE(review): the non-triangular alternatives of ibegin / ipos are not visible here.
10662 template<
typename MT3
10666 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10667 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
10669 constexpr size_t block( BLOCK_SIZE );
10671 const size_t M( A.rows() );
10672 const size_t N( B.columns() );
// Tile loops: columns [jj, jend) and rows [ii, iend), both clamped to the matrix size.
10674 for(
size_t jj=0UL; jj<N; jj+=block ) {
10675 const size_t jend(
min( N, jj+block ) );
10676 for(
size_t ii=0UL; ii<M; ii+=block ) {
10677 const size_t iend(
min( M, ii+block ) );
10678 for(
size_t j=jj; j<jend; ++j )
// For a lower B the row range starts at the diagonal (one past it if strictly lower),
// but never before the tile start ii.
10680 const size_t ibegin( ( IsLower_v<MT5> )
10681 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
// For an upper B the row range ends at the diagonal (excluding it if strictly upper),
// but never past the tile end iend.
10683 const size_t ipos( ( IsUpper_v<MT5> )
10684 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
10687 for(
size_t i=ibegin; i<ipos; ++i ) {
10688 C(i,j) += A(i,i) * B(i,j) * scalar;
// Default add-assign kernel for two diagonal operands.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every row of A.
10710 template<
typename MT3
10714 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10715 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
10717 for(
size_t i=0UL; i<A.rows(); ++i ) {
10718 C(i,i) += A(i,i) * B(i,i) * scalar;
// Fallback overload of selectSmallAddAssignKernel.
// Selected through DisableIf_t when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false;
// it forwards all arguments to selectDefaultAddAssignKernel.
10737 template<
typename MT3
10741 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10742 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
10744 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized "small" add-assign kernel for a row-major target matrix.
// Enabled when IsRowMajorMatrix_v<MT3> and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> hold.
// Every visible update has the shape C += ( A * B ) * scalar: products are accumulated in
// SIMD registers (xmm*), multiplied by `factor` and added onto the values loaded from C.
// An element of A is turned into a SIMD value with set() and combined with a SIMD load from a
// row of B. The sections below differ only in how many SIMD-width column chunks and how many
// rows of C are processed per step.
// NOTE(review): the enclosing column (j) loops, several braces and some accumulate / store
// lines are not visible in this excerpt.
10763 template<
typename MT3
10767 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10768 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// The scalar remainder loop near the end of the kernel only runs if C or B is not padded.
10770 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
10772 const size_t M( A.rows() );
10773 const size_t N( B.columns() );
10774 const size_t K( A.columns() );
// SIMD value built once from the scalar; applied to every accumulator before it is added to C.
10781 const SIMDType factor(
set( scalar ) );
// NOTE(review): the body guarded by this integral-element check is not visible in this excerpt.
10785 if( IsIntegral_v<ElementType> )
// Section: one row of C per step, eight SIMD chunks (columns j .. j+SIMDSIZE*7).
10788 for(
size_t i=0UL; i<M; ++i )
// The k-range is narrowed according to the upper / lower structure of A and B.
10790 const size_t kbegin( ( IsUpper_v<MT4> )
10791 ?( ( IsLower_v<MT5> )
10792 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10793 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10794 :( IsLower_v<MT5> ? j : 0UL ) );
10795 const size_t kend( ( IsLower_v<MT4> )
10796 ?( ( IsUpper_v<MT5> )
10797 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
10798 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
10799 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
10801 size_t k( kbegin );
// The first k iteration initializes the accumulators; the loop below starts at ++k.
10805 SIMDType a1(
set( A(i,k) ) );
10806 SIMDType xmm1( a1 * B.load(k,j ) );
10807 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
10808 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
10809 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
10810 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
10811 SIMDType xmm6( a1 * B.load(k,j+
SIMDSIZE*5UL) );
10812 SIMDType xmm7( a1 * B.load(k,j+
SIMDSIZE*6UL) );
10813 SIMDType xmm8( a1 * B.load(k,j+
SIMDSIZE*7UL) );
10815 for( ++k; k<kend; ++k ) {
10816 a1 =
set( A(i,k) );
10817 xmm1 += a1 * B.load(k,j );
10818 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
10819 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
10820 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
10821 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
10822 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
10823 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
10824 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Add the scaled accumulator onto the existing content of C.
10827 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Section: two rows of C per step, five SIMD chunks (xmm1-5 for row i, xmm6-10 for row i+1).
10844 for( ; (i+2UL) <= M; i+=2UL )
10846 const size_t kbegin( ( IsUpper_v<MT4> )
10847 ?( ( IsLower_v<MT5> )
10848 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10849 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10850 :( IsLower_v<MT5> ? j : 0UL ) );
10851 const size_t kend( ( IsLower_v<MT4> )
10852 ?( ( IsUpper_v<MT5> )
10853 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
10854 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10855 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
10857 size_t k( kbegin );
10861 SIMDType a1(
set( A(i ,k) ) );
10862 SIMDType a2(
set( A(i+1UL,k) ) );
10863 SIMDType b1( B.load(k,j ) );
10864 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
10865 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
10866 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
10867 SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
10868 SIMDType xmm1 ( a1 * b1 );
10869 SIMDType xmm2 ( a1 * b2 );
10870 SIMDType xmm3 ( a1 * b3 );
10871 SIMDType xmm4 ( a1 * b4 );
10872 SIMDType xmm5 ( a1 * b5 );
10873 SIMDType xmm6 ( a2 * b1 );
10874 SIMDType xmm7 ( a2 * b2 );
10875 SIMDType xmm8 ( a2 * b3 );
10876 SIMDType xmm9 ( a2 * b4 );
10877 SIMDType xmm10( a2 * b5 );
10879 for( ++k; k<kend; ++k ) {
10880 a1 =
set( A(i ,k) );
10881 a2 =
set( A(i+1UL,k) );
10899 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
10904 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
10906 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm8 * factor );
10907 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm9 * factor );
10908 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) + xmm10 * factor );
// Single-row variant for the five-chunk strip (presumably the leftover row -- confirm).
10914 const size_t kbegin( ( IsUpper_v<MT4> )
10915 ?( ( IsLower_v<MT5> )
10916 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10917 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10918 :( IsLower_v<MT5> ? j : 0UL ) );
10919 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
10921 size_t k( kbegin );
10925 SIMDType a1(
set( A(i,k) ) );
10926 SIMDType xmm1( a1 * B.load(k,j ) );
10927 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
10928 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
10929 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
10930 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
10932 for( ++k; k<kend; ++k ) {
10933 a1 =
set( A(i,k) );
10934 xmm1 += a1 * B.load(k,j );
10935 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
10936 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
10937 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
10938 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
10941 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Section: two rows of C per step, four SIMD chunks (xmm1-4 for row i, xmm5-8 for row i+1).
10954 for( ; (i+2UL) <= M; i+=2UL )
10956 const size_t kbegin( ( IsUpper_v<MT4> )
10957 ?( ( IsLower_v<MT5> )
10958 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
10959 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
10960 :( IsLower_v<MT5> ? j : 0UL ) );
10961 const size_t kend( ( IsLower_v<MT4> )
10962 ?( ( IsUpper_v<MT5> )
10963 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
10964 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
10965 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
10967 size_t k( kbegin );
10971 SIMDType a1(
set( A(i ,k) ) );
10972 SIMDType a2(
set( A(i+1UL,k) ) );
10973 SIMDType b1( B.load(k,j ) );
10974 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
10975 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
10976 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
10977 SIMDType xmm1( a1 * b1 );
10978 SIMDType xmm2( a1 * b2 );
10979 SIMDType xmm3( a1 * b3 );
10980 SIMDType xmm4( a1 * b4 );
10981 SIMDType xmm5( a2 * b1 );
10982 SIMDType xmm6( a2 * b2 );
10983 SIMDType xmm7( a2 * b3 );
10984 SIMDType xmm8( a2 * b4 );
10986 for( ++k; k<kend; ++k ) {
10987 a1 =
set( A(i ,k) );
10988 a2 =
set( A(i+1UL,k) );
11003 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11007 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
11009 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm7 * factor );
11010 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm8 * factor );
// Single-row variant for the four-chunk strip (presumably the leftover row -- confirm).
11016 const size_t kbegin( ( IsUpper_v<MT4> )
11017 ?( ( IsLower_v<MT5> )
11018 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11019 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11020 :( IsLower_v<MT5> ? j : 0UL ) );
11021 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
11023 size_t k( kbegin );
11027 SIMDType a1(
set( A(i,k) ) );
11028 SIMDType xmm1( a1 * B.load(k,j ) );
11029 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
11030 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
11031 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
11033 for( ++k; k<kend; ++k ) {
11034 a1 =
set( A(i,k) );
11035 xmm1 += a1 * B.load(k,j );
11036 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11037 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11038 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
11041 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Section: two rows of C per step, three SIMD chunks (xmm1-3 for row i, xmm4-6 for row i+1).
11053 for( ; (i+2UL) <= M; i+=2UL )
11055 const size_t kbegin( ( IsUpper_v<MT4> )
11056 ?( ( IsLower_v<MT5> )
11057 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11058 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11059 :( IsLower_v<MT5> ? j : 0UL ) );
11060 const size_t kend( ( IsLower_v<MT4> )
11061 ?( ( IsUpper_v<MT5> )
11062 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
11063 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11064 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
11066 size_t k( kbegin );
11070 SIMDType a1(
set( A(i ,k) ) );
11071 SIMDType a2(
set( A(i+1UL,k) ) );
11072 SIMDType b1( B.load(k,j ) );
11073 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
11074 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
11075 SIMDType xmm1( a1 * b1 );
11076 SIMDType xmm2( a1 * b2 );
11077 SIMDType xmm3( a1 * b3 );
11078 SIMDType xmm4( a2 * b1 );
11079 SIMDType xmm5( a2 * b2 );
11080 SIMDType xmm6( a2 * b3 );
11082 for( ++k; k<kend; ++k ) {
11083 a1 =
set( A(i ,k) );
11084 a2 =
set( A(i+1UL,k) );
11096 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11099 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
11101 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm6 * factor );
// Single-row variant for the three-chunk strip (presumably the leftover row -- confirm).
11107 const size_t kbegin( ( IsUpper_v<MT4> )
11108 ?( ( IsLower_v<MT5> )
11109 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11110 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11111 :( IsLower_v<MT5> ? j : 0UL ) );
11112 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
11114 size_t k( kbegin );
11118 SIMDType a1(
set( A(i,k) ) );
11119 SIMDType xmm1( a1 * B.load(k,j ) );
11120 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
11121 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
11123 for( ++k; k<kend; ++k ) {
11124 a1 =
set( A(i,k) );
11125 xmm1 += a1 * B.load(k,j );
11126 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
11127 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
11130 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Section: two SIMD chunks. From here on the row range [i, iend) is limited by the LOW / UPP
// flags: rows start at j when LOW is set and end at min(j+SIMDSIZE*2, M) when UPP is set.
11139 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
11140 size_t i( LOW ? j : 0UL );
// Four rows per step.
11142 for( ; (i+4UL) <= iend; i+=4UL )
11144 const size_t kbegin( ( IsUpper_v<MT4> )
11145 ?( ( IsLower_v<MT5> )
11146 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11147 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11148 :( IsLower_v<MT5> ? j : 0UL ) );
11149 const size_t kend( ( IsLower_v<MT4> )
11150 ?( ( IsUpper_v<MT5> )
11151 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
11152 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
11153 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
11155 size_t k( kbegin );
11159 SIMDType a1(
set( A(i ,k) ) );
11160 SIMDType a2(
set( A(i+1UL,k) ) );
11161 SIMDType a3(
set( A(i+2UL,k) ) );
11162 SIMDType a4(
set( A(i+3UL,k) ) );
11163 SIMDType b1( B.load(k,j ) );
11164 SIMDType b2( B.load(k,j+
SIMDSIZE) );
11165 SIMDType xmm1( a1 * b1 );
11166 SIMDType xmm2( a1 * b2 );
11167 SIMDType xmm3( a2 * b1 );
11168 SIMDType xmm4( a2 * b2 );
11169 SIMDType xmm5( a3 * b1 );
11170 SIMDType xmm6( a3 * b2 );
11171 SIMDType xmm7( a4 * b1 );
11172 SIMDType xmm8( a4 * b2 );
11174 for( ++k; k<kend; ++k ) {
11175 a1 =
set( A(i ,k) );
11176 a2 =
set( A(i+1UL,k) );
11177 a3 =
set( A(i+2UL,k) );
11178 a4 =
set( A(i+3UL,k) );
11191 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11193 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
11195 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
11197 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
// Three rows per step.
11202 for( ; (i+3UL) <= iend; i+=3UL )
11204 const size_t kbegin( ( IsUpper_v<MT4> )
11205 ?( ( IsLower_v<MT5> )
11206 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11207 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11208 :( IsLower_v<MT5> ? j : 0UL ) );
11209 const size_t kend( ( IsLower_v<MT4> )
11210 ?( ( IsUpper_v<MT5> )
11211 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
11212 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
11213 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
11215 size_t k( kbegin );
11219 SIMDType a1(
set( A(i ,k) ) );
11220 SIMDType a2(
set( A(i+1UL,k) ) );
11221 SIMDType a3(
set( A(i+2UL,k) ) );
11222 SIMDType b1( B.load(k,j ) );
11223 SIMDType b2( B.load(k,j+
SIMDSIZE) );
11224 SIMDType xmm1( a1 * b1 );
11225 SIMDType xmm2( a1 * b2 );
11226 SIMDType xmm3( a2 * b1 );
11227 SIMDType xmm4( a2 * b2 );
11228 SIMDType xmm5( a3 * b1 );
11229 SIMDType xmm6( a3 * b2 );
11231 for( ++k; k<kend; ++k ) {
11232 a1 =
set( A(i ,k) );
11233 a2 =
set( A(i+1UL,k) );
11234 a3 =
set( A(i+2UL,k) );
11245 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11247 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
11249 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
// Two rows per step.
11254 for( ; (i+2UL) <= iend; i+=2UL )
11256 const size_t kbegin( ( IsUpper_v<MT4> )
11257 ?( ( IsLower_v<MT5> )
11258 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11259 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11260 :( IsLower_v<MT5> ? j : 0UL ) );
11261 const size_t kend( ( IsLower_v<MT4> )
11262 ?( ( IsUpper_v<MT5> )
11263 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
11264 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
11265 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
11267 size_t k( kbegin );
11271 SIMDType a1(
set( A(i ,k) ) );
11272 SIMDType a2(
set( A(i+1UL,k) ) );
11273 SIMDType b1( B.load(k,j ) );
11274 SIMDType b2( B.load(k,j+
SIMDSIZE) );
11275 SIMDType xmm1( a1 * b1 );
11276 SIMDType xmm2( a1 * b2 );
11277 SIMDType xmm3( a2 * b1 );
11278 SIMDType xmm4( a2 * b2 );
11280 for( ++k; k<kend; ++k ) {
11281 a1 =
set( A(i ,k) );
11282 a2 =
set( A(i+1UL,k) );
11291 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11293 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
// Single-row variant for the two-chunk strip (presumably the leftover row -- confirm).
11300 const size_t kbegin( ( IsUpper_v<MT4> )
11301 ?( ( IsLower_v<MT5> )
11302 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11303 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11304 :( IsLower_v<MT5> ? j : 0UL ) );
11305 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
11307 size_t k( kbegin );
11311 SIMDType a1(
set( A(i,k) ) );
11312 SIMDType xmm1( a1 * B.load(k,j ) );
11313 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE) );
11315 for( ++k; k<kend; ++k ) {
11316 a1 =
set( A(i,k) );
11317 xmm1 += a1 * B.load(k,j );
11318 xmm2 += a1 * B.load(k,j+
SIMDSIZE);
11321 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Section: a single SIMD chunk. The row range is only shortened when both LOW and UPP are set.
11329 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
11330 size_t i( LOW ? j : 0UL );
// Four rows per step, one accumulator per row.
11332 for( ; (i+4UL) <= iend; i+=4UL )
11334 const size_t kbegin( ( IsUpper_v<MT4> )
11335 ?( ( IsLower_v<MT5> )
11336 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11337 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11338 :( IsLower_v<MT5> ? j : 0UL ) );
11339 const size_t kend( ( IsLower_v<MT4> )
11340 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
11343 size_t k( kbegin );
11347 SIMDType b1( B.load(k,j) );
11348 SIMDType xmm1(
set( A(i ,k) ) * b1 );
11349 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
11350 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
11351 SIMDType xmm4(
set( A(i+3UL,k) ) * b1 );
11353 for( ++k; k<kend; ++k ) {
11355 xmm1 +=
set( A(i ,k) ) * b1;
11356 xmm2 +=
set( A(i+1UL,k) ) * b1;
11357 xmm3 +=
set( A(i+2UL,k) ) * b1;
11358 xmm4 +=
set( A(i+3UL,k) ) * b1;
11361 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11362 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
11363 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
11364 C.store( i+3UL, j, C.load(i+3UL,j) + xmm4 * factor );
// Three rows per step.
11368 for( ; (i+3UL) <= iend; i+=3UL )
11370 const size_t kbegin( ( IsUpper_v<MT4> )
11371 ?( ( IsLower_v<MT5> )
11372 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11373 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11374 :( IsLower_v<MT5> ? j : 0UL ) );
11375 const size_t kend( ( IsLower_v<MT4> )
11376 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
11379 size_t k( kbegin );
11383 SIMDType b1( B.load(k,j) );
11384 SIMDType xmm1(
set( A(i ,k) ) * b1 );
11385 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
11386 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
11388 for( ++k; k<kend; ++k ) {
11390 xmm1 +=
set( A(i ,k) ) * b1;
11391 xmm2 +=
set( A(i+1UL,k) ) * b1;
11392 xmm3 +=
set( A(i+2UL,k) ) * b1;
11395 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11396 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
11397 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
// Two rows per step.
11401 for( ; (i+2UL) <= iend; i+=2UL )
11403 const size_t kbegin( ( IsUpper_v<MT4> )
11404 ?( ( IsLower_v<MT5> )
11405 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11406 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11407 :( IsLower_v<MT5> ? j : 0UL ) );
11408 const size_t kend( ( IsLower_v<MT4> )
11409 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
11412 size_t k( kbegin );
11416 SIMDType b1( B.load(k,j) );
11417 SIMDType xmm1(
set( A(i ,k) ) * b1 );
11418 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
11420 for( ++k; k<kend; ++k ) {
11422 xmm1 +=
set( A(i ,k) ) * b1;
11423 xmm2 +=
set( A(i+1UL,k) ) * b1;
11426 C.store( i , j, C.load(i ,j) + xmm1 * factor );
11427 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
// Single-row variant for the one-chunk strip; here the k loop runs up to K.
11433 const size_t kbegin( ( IsUpper_v<MT4> )
11434 ?( ( IsLower_v<MT5> )
11435 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11436 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11437 :( IsLower_v<MT5> ? j : 0UL ) );
11439 size_t k( kbegin );
11443 SIMDType xmm1(
set( A(i,k) ) * B.load(k,j) );
11445 for( ++k; k<K; ++k ) {
11446 xmm1 +=
set( A(i,k) ) * B.load(k,j);
11449 C.store( i, j, C.load(i,j) + xmm1 * factor );
// Scalar remainder: remaining columns are handled element-wise, only if `remainder` is true.
11454 for( ; remainder && j<N; ++j )
11456 const size_t iend( UPP ? j+1UL : M );
11457 size_t i( LOW ? j : 0UL );
// Two rows per step with scalar accumulators value1 / value2.
11459 for( ; (i+2UL) <= iend; i+=2UL )
11461 const size_t kbegin( ( IsUpper_v<MT4> )
11462 ?( ( IsLower_v<MT5> )
11463 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11464 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11465 :( IsLower_v<MT5> ? j : 0UL ) );
11466 const size_t kend( ( IsLower_v<MT4> )
11467 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
11470 size_t k( kbegin );
11477 for( ++k; k<kend; ++k ) {
11478 value1 += A(i ,k) * B(k,j);
11479 value2 += A(i+1UL,k) * B(k,j);
11482 C(i ,j) += value1 * scalar;
11483 C(i+1UL,j) += value2 * scalar;
// Single-row scalar variant (presumably the leftover row -- confirm).
11489 const size_t kbegin( ( IsUpper_v<MT4> )
11490 ?( ( IsLower_v<MT5> )
11491 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
11492 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
11493 :( IsLower_v<MT5> ? j : 0UL ) );
11495 size_t k( kbegin );
11501 for( ++k; k<K; ++k ) {
11502 value += A(i,k) * B(k,j);
11505 C(i,j) += value * scalar;
// Vectorized "small matrix" kernel for the add-assignment C += scalar * ( A * B ).
// Per the trailing return type it is only enabled when the target MT3 is column-major
// and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
// Every SIMD section follows the same recipe: derive the k-range [kbegin,kend) from
// the lower/upper structure of A (MT4) and B (MT5), accumulate
// A.load(i+..,k) * set( B(k,j+..) ) over k, and finally store
// C.load(..) + accumulator * factor back into C.
11527 template<
typename MT3
11531 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11532 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// The scalar tail loop at the end only runs when C or A is not padded.
11534 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
11536 const size_t M( A.rows() );
11537 const size_t N( B.columns() );
11538 const size_t K( A.columns() );
// SIMD value created from the scalar; multiplied onto each accumulator before the store.
11545 const SIMDType factor(
set( scalar ) );
// Integral element types: 8 SIMD vectors along i (offsets 0..7*SIMDSIZE), one column j at a time.
// NOTE(review): presumably nested in a loop over i with step SIMDSIZE*8 - header not visible, confirm.
11549 if( IsIntegral_v<ElementType> )
11552 for(
size_t j=0UL; j<N; ++j )
// k-range restricted by the triangular structure of B (MT5) and A (MT4).
11554 const size_t kbegin( ( IsLower_v<MT5> )
11555 ?( ( IsUpper_v<MT4> )
11556 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11557 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11558 :( IsUpper_v<MT4> ? i : 0UL ) );
11559 const size_t kend( ( IsUpper_v<MT5> )
11560 ?( ( IsLower_v<MT4> )
11561 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
11562 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
11563 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
11565 size_t k( kbegin );
// The first k iteration initializes the eight accumulators.
11569 SIMDType b1(
set( B(k,j) ) );
11570 SIMDType xmm1( A.load(i ,k) * b1 );
11571 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
11572 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
11573 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
11574 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
11575 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,k) * b1 );
11576 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,k) * b1 );
11577 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,k) * b1 );
// Remaining k iterations accumulate into the same registers.
11579 for( ++k; k<kend; ++k ) {
11580 b1 =
set( B(k,j) );
11581 xmm1 += A.load(i ,k) * b1;
11582 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
11583 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
11584 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
11585 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
11586 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
11587 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
11588 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Add the scaled accumulator onto the existing content of C.
11591 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// 5 SIMD vectors along i, two columns (j, j+1) per iteration:
// xmm1-xmm5 belong to column j, xmm6-xmm10 to column j+1.
11608 for( ; (j+2UL) <= N; j+=2UL )
11610 const size_t kbegin( ( IsLower_v<MT5> )
11611 ?( ( IsUpper_v<MT4> )
11612 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11613 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11614 :( IsUpper_v<MT4> ? i : 0UL ) );
11615 const size_t kend( ( IsUpper_v<MT5> )
11616 ?( ( IsLower_v<MT4> )
11617 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11618 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11619 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
11621 size_t k( kbegin );
11625 SIMDType a1( A.load(i ,k) );
11626 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
11627 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
11628 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
11629 SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
11630 SIMDType b1(
set( B(k,j ) ) );
11631 SIMDType b2(
set( B(k,j+1UL) ) );
11632 SIMDType xmm1 ( a1 * b1 );
11633 SIMDType xmm2 ( a2 * b1 );
11634 SIMDType xmm3 ( a3 * b1 );
11635 SIMDType xmm4 ( a4 * b1 );
11636 SIMDType xmm5 ( a5 * b1 );
11637 SIMDType xmm6 ( a1 * b2 );
11638 SIMDType xmm7 ( a2 * b2 );
11639 SIMDType xmm8 ( a3 * b2 );
11640 SIMDType xmm9 ( a4 * b2 );
11641 SIMDType xmm10( a5 * b2 );
11643 for( ++k; k<kend; ++k ) {
11649 b1 =
set( B(k,j ) );
11650 b2 =
set( B(k,j+1UL) );
11663 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11668 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
11670 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
11671 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
11672 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Same 5-vector block for a single column j.
// NOTE(review): presumably the leftover column after the 2-column loop - guard not visible, confirm.
11678 const size_t kbegin( ( IsLower_v<MT5> )
11679 ?( ( IsUpper_v<MT4> )
11680 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11681 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11682 :( IsUpper_v<MT4> ? i : 0UL ) );
11683 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
11685 size_t k( kbegin );
11689 SIMDType b1(
set( B(k,j) ) );
11690 SIMDType xmm1( A.load(i ,k) * b1 );
11691 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
11692 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
11693 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
11694 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
11696 for( ++k; k<kend; ++k ) {
11697 b1 =
set( B(k,j) );
11698 xmm1 += A.load(i ,k) * b1;
11699 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
11700 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
11701 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
11702 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
11705 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// 4 SIMD vectors along i, two columns per iteration:
// xmm1-xmm4 belong to column j, xmm5-xmm8 to column j+1.
11718 for( ; (j+2UL) <= N; j+=2UL )
11720 const size_t kbegin( ( IsLower_v<MT5> )
11721 ?( ( IsUpper_v<MT4> )
11722 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11723 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11724 :( IsUpper_v<MT4> ? i : 0UL ) );
11725 const size_t kend( ( IsUpper_v<MT5> )
11726 ?( ( IsLower_v<MT4> )
11727 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11728 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11729 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
11731 size_t k( kbegin );
11735 SIMDType a1( A.load(i ,k) );
11736 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
11737 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
11738 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
11739 SIMDType b1(
set( B(k,j ) ) );
11740 SIMDType b2(
set( B(k,j+1UL) ) );
11741 SIMDType xmm1( a1 * b1 );
11742 SIMDType xmm2( a2 * b1 );
11743 SIMDType xmm3( a3 * b1 );
11744 SIMDType xmm4( a4 * b1 );
11745 SIMDType xmm5( a1 * b2 );
11746 SIMDType xmm6( a2 * b2 );
11747 SIMDType xmm7( a3 * b2 );
11748 SIMDType xmm8( a4 * b2 );
11750 for( ++k; k<kend; ++k ) {
11755 b1 =
set( B(k,j ) );
11756 b2 =
set( B(k,j+1UL) );
11767 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11771 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
11773 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
11774 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Same 4-vector block for a single column j.
11780 const size_t kbegin( ( IsLower_v<MT5> )
11781 ?( ( IsUpper_v<MT4> )
11782 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11783 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11784 :( IsUpper_v<MT4> ? i : 0UL ) );
11785 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
11787 size_t k( kbegin );
11791 SIMDType b1(
set( B(k,j) ) );
11792 SIMDType xmm1( A.load(i ,k) * b1 );
11793 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
11794 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
11795 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
11797 for( ++k; k<kend; ++k ) {
11798 b1 =
set( B(k,j) );
11799 xmm1 += A.load(i ,k) * b1;
11800 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
11801 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
11802 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
11805 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// 3 SIMD vectors along i, two columns per iteration:
// xmm1-xmm3 belong to column j, xmm4-xmm6 to column j+1.
11817 for( ; (j+2UL) <= N; j+=2UL )
11819 const size_t kbegin( ( IsLower_v<MT5> )
11820 ?( ( IsUpper_v<MT4> )
11821 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11822 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11823 :( IsUpper_v<MT4> ? i : 0UL ) );
11824 const size_t kend( ( IsUpper_v<MT5> )
11825 ?( ( IsLower_v<MT4> )
11826 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
11827 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
11828 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
11830 size_t k( kbegin );
11834 SIMDType a1( A.load(i ,k) );
11835 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
11836 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
11837 SIMDType b1(
set( B(k,j ) ) );
11838 SIMDType b2(
set( B(k,j+1UL) ) );
11839 SIMDType xmm1( a1 * b1 );
11840 SIMDType xmm2( a2 * b1 );
11841 SIMDType xmm3( a3 * b1 );
11842 SIMDType xmm4( a1 * b2 );
11843 SIMDType xmm5( a2 * b2 );
11844 SIMDType xmm6( a3 * b2 );
11846 for( ++k; k<kend; ++k ) {
11850 b1 =
set( B(k,j ) );
11851 b2 =
set( B(k,j+1UL) );
11860 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11863 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
11865 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Same 3-vector block for a single column j.
11871 const size_t kbegin( ( IsLower_v<MT5> )
11872 ?( ( IsUpper_v<MT4> )
11873 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11874 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11875 :( IsUpper_v<MT4> ? i : 0UL ) );
11876 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
11878 size_t k( kbegin );
11882 SIMDType b1(
set( B(k,j) ) );
11883 SIMDType xmm1( A.load(i ,k) * b1 );
11884 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
11885 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
11887 for( ++k; k<kend; ++k ) {
11888 b1 =
set( B(k,j) );
11889 xmm1 += A.load(i ,k) * b1;
11890 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
11891 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
11894 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// 2 SIMD vectors along i. From here on the column range [j,jend) is narrowed by the
// class-level LOW/UPP flags. Columns are processed 4, then 3, then 2, then 1 at a time.
11903 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
11904 size_t j( UPP ? i : 0UL );
// 2 vectors x 4 columns: xmm1/2 -> j, xmm3/4 -> j+1, xmm5/6 -> j+2, xmm7/8 -> j+3.
11906 for( ; (j+4UL) <= jend; j+=4UL )
11908 const size_t kbegin( ( IsLower_v<MT5> )
11909 ?( ( IsUpper_v<MT4> )
11910 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11911 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11912 :( IsUpper_v<MT4> ? i : 0UL ) );
11913 const size_t kend( ( IsUpper_v<MT5> )
11914 ?( ( IsLower_v<MT4> )
11915 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
11916 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
11917 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
11919 size_t k( kbegin );
11923 SIMDType a1( A.load(i ,k) );
11924 SIMDType a2( A.load(i+
SIMDSIZE,k) );
11925 SIMDType b1(
set( B(k,j ) ) );
11926 SIMDType b2(
set( B(k,j+1UL) ) );
11927 SIMDType b3(
set( B(k,j+2UL) ) );
11928 SIMDType b4(
set( B(k,j+3UL) ) );
11929 SIMDType xmm1( a1 * b1 );
11930 SIMDType xmm2( a2 * b1 );
11931 SIMDType xmm3( a1 * b2 );
11932 SIMDType xmm4( a2 * b2 );
11933 SIMDType xmm5( a1 * b3 );
11934 SIMDType xmm6( a2 * b3 );
11935 SIMDType xmm7( a1 * b4 );
11936 SIMDType xmm8( a2 * b4 );
11938 for( ++k; k<kend; ++k ) {
11941 b1 =
set( B(k,j ) );
11942 b2 =
set( B(k,j+1UL) );
11943 b3 =
set( B(k,j+2UL) );
11944 b4 =
set( B(k,j+3UL) );
11955 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
11957 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
11959 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
11961 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
// 2 vectors x 3 columns.
11966 for( ; (j+3UL) <= jend; j+=3UL )
11968 const size_t kbegin( ( IsLower_v<MT5> )
11969 ?( ( IsUpper_v<MT4> )
11970 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
11971 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
11972 :( IsUpper_v<MT4> ? i : 0UL ) );
11973 const size_t kend( ( IsUpper_v<MT5> )
11974 ?( ( IsLower_v<MT4> )
11975 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
11976 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
11977 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
11979 size_t k( kbegin );
11983 SIMDType a1( A.load(i ,k) );
11984 SIMDType a2( A.load(i+
SIMDSIZE,k) );
11985 SIMDType b1(
set( B(k,j ) ) );
11986 SIMDType b2(
set( B(k,j+1UL) ) );
11987 SIMDType b3(
set( B(k,j+2UL) ) );
11988 SIMDType xmm1( a1 * b1 );
11989 SIMDType xmm2( a2 * b1 );
11990 SIMDType xmm3( a1 * b2 );
11991 SIMDType xmm4( a2 * b2 );
11992 SIMDType xmm5( a1 * b3 );
11993 SIMDType xmm6( a2 * b3 );
11995 for( ++k; k<kend; ++k ) {
11998 b1 =
set( B(k,j ) );
11999 b2 =
set( B(k,j+1UL) );
12000 b3 =
set( B(k,j+2UL) );
12009 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
12011 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
12013 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
// 2 vectors x 2 columns.
12018 for( ; (j+2UL) <= jend; j+=2UL )
12020 const size_t kbegin( ( IsLower_v<MT5> )
12021 ?( ( IsUpper_v<MT4> )
12022 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12023 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12024 :( IsUpper_v<MT4> ? i : 0UL ) );
12025 const size_t kend( ( IsUpper_v<MT5> )
12026 ?( ( IsLower_v<MT4> )
12027 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
12028 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
12029 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
12031 size_t k( kbegin );
12035 SIMDType a1( A.load(i ,k) );
12036 SIMDType a2( A.load(i+
SIMDSIZE,k) );
12037 SIMDType b1(
set( B(k,j ) ) );
12038 SIMDType b2(
set( B(k,j+1UL) ) );
12039 SIMDType xmm1( a1 * b1 );
12040 SIMDType xmm2( a2 * b1 );
12041 SIMDType xmm3( a1 * b2 );
12042 SIMDType xmm4( a2 * b2 );
12044 for( ++k; k<kend; ++k ) {
12047 b1 =
set( B(k,j ) );
12048 b2 =
set( B(k,j+1UL) );
12055 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
12057 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
// 2 vectors x 1 column.
12064 const size_t kbegin( ( IsLower_v<MT5> )
12065 ?( ( IsUpper_v<MT4> )
12066 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12067 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12068 :( IsUpper_v<MT4> ? i : 0UL ) );
12069 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
12071 size_t k( kbegin );
12075 SIMDType b1(
set( B(k,j) ) );
12076 SIMDType xmm1( A.load(i ,k) * b1 );
12077 SIMDType xmm2( A.load(i+
SIMDSIZE,k) * b1 );
12079 for( ++k; k<kend; ++k ) {
12080 b1 =
set( B(k,j) );
12081 xmm1 += A.load(i ,k) * b1;
12082 xmm2 += A.load(i+
SIMDSIZE,k) * b1;
12085 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// A single SIMD vector along i; columns again handled 4, 3, 2 and finally 1 at a time.
// Note that jend is only clipped here when both LOW and UPP are set.
12093 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
12094 size_t j( UPP ? i : 0UL );
// 1 vector x 4 columns.
12096 for( ; (j+4UL) <= jend; j+=4UL )
12098 const size_t kbegin( ( IsLower_v<MT5> )
12099 ?( ( IsUpper_v<MT4> )
12100 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12101 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12102 :( IsUpper_v<MT4> ? i : 0UL ) );
12103 const size_t kend( ( IsUpper_v<MT5> )
12104 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
12107 size_t k( kbegin );
12111 SIMDType a1( A.load(i,k) );
12112 SIMDType xmm1( a1 *
set( B(k,j ) ) );
12113 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
12114 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
12115 SIMDType xmm4( a1 *
set( B(k,j+3UL) ) );
12117 for( ++k; k<kend; ++k ) {
12119 xmm1 += a1 *
set( B(k,j ) );
12120 xmm2 += a1 *
set( B(k,j+1UL) );
12121 xmm3 += a1 *
set( B(k,j+2UL) );
12122 xmm4 += a1 *
set( B(k,j+3UL) );
12125 C.store( i, j , C.load(i,j ) + xmm1 * factor );
12126 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
12127 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
12128 C.store( i, j+3UL, C.load(i,j+3UL) + xmm4 * factor );
// 1 vector x 3 columns.
12132 for( ; (j+3UL) <= jend; j+=3UL )
12134 const size_t kbegin( ( IsLower_v<MT5> )
12135 ?( ( IsUpper_v<MT4> )
12136 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12137 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12138 :( IsUpper_v<MT4> ? i : 0UL ) );
12139 const size_t kend( ( IsUpper_v<MT5> )
12140 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
12143 size_t k( kbegin );
12147 SIMDType a1( A.load(i,k) );
12148 SIMDType xmm1( a1 *
set( B(k,j ) ) );
12149 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
12150 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
12152 for( ++k; k<kend; ++k ) {
12154 xmm1 += a1 *
set( B(k,j ) );
12155 xmm2 += a1 *
set( B(k,j+1UL) );
12156 xmm3 += a1 *
set( B(k,j+2UL) );
12159 C.store( i, j , C.load(i,j ) + xmm1 * factor );
12160 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
12161 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
// 1 vector x 2 columns.
12165 for( ; (j+2UL) <= jend; j+=2UL )
12167 const size_t kbegin( ( IsLower_v<MT5> )
12168 ?( ( IsUpper_v<MT4> )
12169 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12170 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12171 :( IsUpper_v<MT4> ? i : 0UL ) );
12172 const size_t kend( ( IsUpper_v<MT5> )
12173 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12176 size_t k( kbegin );
12180 SIMDType a1( A.load(i,k) );
12181 SIMDType xmm1( a1 *
set( B(k,j ) ) );
12182 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
12184 for( ++k; k<kend; ++k ) {
12186 xmm1 += a1 *
set( B(k,j ) );
12187 xmm2 += a1 *
set( B(k,j+1UL) );
12190 C.store( i, j , C.load(i,j ) + xmm1 * factor );
12191 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
// 1 vector x 1 column; here k always runs up to K.
12197 const size_t kbegin( ( IsLower_v<MT5> )
12198 ?( ( IsUpper_v<MT4> )
12199 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12200 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12201 :( IsUpper_v<MT4> ? i : 0UL ) );
12203 size_t k( kbegin );
12207 SIMDType xmm1( A.load(i,k) *
set( B(k,j) ) );
12209 for( ++k; k<K; ++k ) {
12210 xmm1 += A.load(i,k) *
set( B(k,j) );
12213 C.store( i, j, C.load(i,j) + xmm1 * factor );
// Scalar tail: remaining rows i up to M are processed element-wise (only if 'remainder'
// is true), two columns per iteration first, then a single column.
12218 for( ; remainder && i<M; ++i )
12220 const size_t jend( LOW ? i+1UL : N );
12221 size_t j( UPP ? i : 0UL );
12223 for( ; (j+2UL) <= jend; j+=2UL )
12225 const size_t kbegin( ( IsLower_v<MT5> )
12226 ?( ( IsUpper_v<MT4> )
12227 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12228 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12229 :( IsUpper_v<MT4> ? i : 0UL ) );
12230 const size_t kend( ( IsUpper_v<MT5> )
12231 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
12234 size_t k( kbegin );
12241 for( ++k; k<kend; ++k ) {
12242 value1 += A(i,k) * B(k,j );
12243 value2 += A(i,k) * B(k,j+1UL);
// The scalar path multiplies by 'scalar' directly instead of the SIMD 'factor'.
12246 C(i,j ) += value1 * scalar;
12247 C(i,j+1UL) += value2 * scalar;
12253 const size_t kbegin( ( IsLower_v<MT5> )
12254 ?( ( IsUpper_v<MT4> )
12255 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
12256 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
12257 :( IsUpper_v<MT4> ? i : 0UL ) );
12259 size_t k( kbegin );
12265 for( ++k; k<K; ++k ) {
12266 value += A(i,k) * B(k,j);
12269 C(i,j) += value * scalar;
// Fallback overload of the "large matrix" add-assignment kernel.
// Enabled (via DisableIf_t) only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is
// false; it simply forwards all arguments to selectDefaultAddAssignKernel().
12290 template<
typename MT3
12294 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12295 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12297 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized overload of the "large matrix" add-assignment kernel.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. It delegates to one
// of lmmm(), ummm() or mmm(), passing 'scalar' and ST2(1) as the trailing arguments.
// NOTE(review): the conditions selecting between the three calls are not visible here;
// presumably lmmm for LOW, ummm for UPP and mmm otherwise - confirm.
12316 template<
typename MT3
12320 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12321 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12324 lmmm( C, A, B, scalar, ST2(1) );
12326 ummm( C, A, B, scalar, ST2(1) );
12328 mmm( C, A, B, scalar, ST2(1) );
// Fallback overload of the BLAS-based add-assignment kernel.
// Enabled (via DisableIf_t) only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false; it
// forwards all arguments to selectLargeAddAssignKernel().
12346 template<
typename MT3
12350 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12351 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
12353 selectLargeAddAssignKernel( C, A, B, scalar );
12358#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based add-assignment kernel, enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// Three cases are distinguished by the operand types:
//  - A triangular: B is copied into a temporary, trmm() is applied with A from the left,
//    and the temporary is add-assigned to C.
//  - B triangular: A is copied into a temporary, trmm() is applied with B from the right,
//    and the temporary is add-assigned to C.
//  - otherwise: gemm() is called directly on C with factors ET(scalar) and ET(1).
12372 template<
typename MT3
12376 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12377 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target matrix before each BLAS call.
12379 using ET = ElementType_t<MT3>;
12381 if( IsTriangular_v<MT4> ) {
// NOTE(review): trmm() presumably overwrites tmp in place with scalar*A*tmp - confirm.
12382 ResultType_t<MT3> tmp(
serial( B ) );
12383 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
12384 addAssign( C, tmp );
12386 else if( IsTriangular_v<MT5> ) {
12387 ResultType_t<MT3> tmp(
serial( A ) );
12388 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
12389 addAssign( C, tmp );
// NOTE(review): ET(1) is presumably the factor applied to the existing C (i.e. C is kept
// and the product is added) - confirm against the gemm() wrapper.
12392 gemm( C, A, B,
ET(scalar),
ET(1) );
// Subtraction assignment of a scaled matrix product to a dense matrix.
// Fetches the left and right operands of the wrapped multiplication expression
// (rhs.matrix_), checks for an empty target or an empty inner dimension, and dispatches
// to selectSubAssignKernel() with the scalar stored in rhs.scalar_.
12414 template<
typename MT
12416 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
12423 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
12424 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// NOTE(review): the body of this check is not visible here; presumably an early return
// since there is nothing to subtract - confirm.
12426 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are defined on lines not visible here; presumably the evaluated
// forms of 'left' and 'right' - confirm.
12440 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= scalar * ( A * B ).
// The "small" kernel is chosen when any of the following holds:
//  - both operands are diagonal,
//  - not in debug mode, C is row-major and B has at most SIMDSIZE*10 columns,
//  - not in debug mode, C is column-major and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATDMATMULT_THRESHOLD.
// NOTE(review): the BLAS kernel call below is presumably the else-branch - the 'else'
// line is not visible here, confirm.
12455 template<
typename MT3
12459 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12461 if( ( IsDiagonal_v<MT4> && IsDiagonal_v<MT5> ) ||
12462 ( !BLAZE_DEBUG_MODE && IsRowMajorMatrix_v<MT3> && B.columns() <=
SIMDSIZE*10UL ) ||
12463 ( !BLAZE_DEBUG_MODE && IsColumnMajorMatrix_v<MT3> && A.rows() <=
SIMDSIZE*10UL ) ||
12464 ( C.rows() * C.columns() < TDMATDMATMULT_THRESHOLD ) )
12465 selectSmallSubAssignKernel( C, A, B, scalar );
12467 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the general case in which neither A nor B
// is diagonal (see the EnableIf_t condition). The result is subtracted from C via a
// temporary.
// NOTE(review): the construction of 'tmp' is not visible here; presumably it holds the
// evaluated product A * B * scalar - confirm.
12485 template<
typename MT3
12489 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12490 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
12493 subAssign( C, tmp );
// Default subtraction assignment kernel for a row-major target, a non-diagonal A and a
// diagonal B. Since only the diagonal of B is read, each element reduces to
// C(i,j) -= A(i,j) * B(j,j) * scalar. The index space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE.
12511 template<
typename MT3
12515 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12516 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
12518 constexpr size_t block( BLOCK_SIZE );
12520 const size_t M( A.rows() );
12521 const size_t N( B.columns() );
// Outer loops walk over the row tiles [ii,iend) and column tiles [jj,jend).
12523 for(
size_t ii=0UL; ii<M; ii+=block ) {
12524 const size_t iend(
min( M, ii+block ) );
12525 for(
size_t jj=0UL; jj<N; jj+=block ) {
12526 const size_t jend(
min( N, jj+block ) );
12527 for(
size_t i=ii; i<iend; ++i )
// For an upper A the column range of row i starts at the diagonal (or just past it for
// a strictly upper A), clipped to the current tile.
12529 const size_t jbegin( ( IsUpper_v<MT4> )
12530 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), jj ) )
// For a lower A the column range of row i ends at the diagonal (exclusive for a strictly
// lower A), clipped to the current tile.
12532 const size_t jpos( ( IsLower_v<MT4> )
12533 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), jend ) )
12536 for(
size_t j=jbegin; j<jpos; ++j ) {
12537 C(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a column-major target, a non-diagonal A and
// a diagonal B. Each column j of C is updated with C(i,j) -= A(i,j) * B(j,j) * scalar,
// processing two rows per iteration plus one trailing row if the row count is odd.
12559 template<
typename MT3
12563 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12564 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
12566 const size_t M( A.rows() );
12567 const size_t N( B.columns() );
12569 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, restricted by the lower/upper structure of A.
12571 const size_t ibegin( ( IsLower_v<MT4> )
12572 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
12574 const size_t iend( ( IsUpper_v<MT4> )
12575 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of the range that can be handled in pairs.
12579 const size_t inum( iend - ibegin );
12580 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
12583 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
12584 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
12585 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Single leftover row when the range length is odd.
12587 if( ipos < iend ) {
12588 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a row-major target, a diagonal A and a
// non-diagonal B. Each row i of C is updated with C(i,j) -= A(i,i) * B(i,j) * scalar,
// processing two columns per iteration plus one trailing column if the count is odd.
12608 template<
typename MT3
12612 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12613 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
12615 const size_t M( A.rows() );
12616 const size_t N( B.columns() );
12618 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, restricted by the upper/lower structure of B.
12620 const size_t jbegin( ( IsUpper_v<MT5> )
12621 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
12623 const size_t jend( ( IsLower_v<MT5> )
12624 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part of the range that can be handled in pairs.
12628 const size_t jnum( jend - jbegin );
12629 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
12632 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
12633 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
12634 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Single leftover column when the range length is odd.
12636 if( jpos < jend ) {
12637 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction assignment kernel for a column-major target, a diagonal A and a
// non-diagonal B. Since only the diagonal of A is read, each element reduces to
// C(i,j) -= A(i,i) * B(i,j) * scalar. The index space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE, columns first.
12657 template<
typename MT3
12661 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12662 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
12664 constexpr size_t block( BLOCK_SIZE );
12666 const size_t M( A.rows() );
12667 const size_t N( B.columns() );
// Outer loops walk over the column tiles [jj,jend) and row tiles [ii,iend).
12669 for(
size_t jj=0UL; jj<N; jj+=block ) {
12670 const size_t jend(
min( N, jj+block ) );
12671 for(
size_t ii=0UL; ii<M; ii+=block ) {
12672 const size_t iend(
min( M, ii+block ) );
12673 for(
size_t j=jj; j<jend; ++j )
// For a lower B the row range of column j starts at the diagonal (or just past it for a
// strictly lower B), clipped to the current tile.
12675 const size_t ibegin( ( IsLower_v<MT5> )
12676 ?(
max( ( IsStrictlyLower_v<MT5> ? j+1UL : j ), ii ) )
// For an upper B the row range of column j ends at the diagonal (exclusive for a strictly
// upper B), clipped to the current tile.
12678 const size_t ipos( ( IsUpper_v<MT5> )
12679 ?(
min( ( IsStrictlyUpper_v<MT5> ? j : j+1UL ), iend ) )
12682 for(
size_t i=ibegin; i<ipos; ++i ) {
12683 C(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction assignment kernel for two diagonal operands.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) * scalar for every row
// index i of A.
12705 template<
typename MT3
12709 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12710 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
12712 for(
size_t i=0UL; i<A.rows(); ++i ) {
12713 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Fallback overload of the "small matrix" subtraction assignment kernel.
// Enabled (via DisableIf_t) only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is
// false; it forwards all arguments to selectDefaultSubAssignKernel().
12732 template<
typename MT3
12736 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12737 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12739 selectDefaultSubAssignKernel( C, A, B, scalar );
12758 template<
typename MT3
12762 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12763 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12765 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
12767 const size_t M( A.rows() );
12768 const size_t N( B.columns() );
12769 const size_t K( A.columns() );
12776 const SIMDType factor(
set( scalar ) );
12780 if( IsIntegral_v<ElementType> )
12783 for(
size_t i=0UL; i<M; ++i )
12785 const size_t kbegin( ( IsUpper_v<MT4> )
12786 ?( ( IsLower_v<MT5> )
12787 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12788 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12789 :( IsLower_v<MT5> ? j : 0UL ) );
12790 const size_t kend( ( IsLower_v<MT4> )
12791 ?( ( IsUpper_v<MT5> )
12792 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
12793 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
12794 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
12796 size_t k( kbegin );
12800 SIMDType a1(
set( A(i,k) ) );
12801 SIMDType xmm1( a1 * B.load(k,j ) );
12802 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
12803 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
12804 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
12805 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
12806 SIMDType xmm6( a1 * B.load(k,j+
SIMDSIZE*5UL) );
12807 SIMDType xmm7( a1 * B.load(k,j+
SIMDSIZE*6UL) );
12808 SIMDType xmm8( a1 * B.load(k,j+
SIMDSIZE*7UL) );
12810 for( ++k; k<kend; ++k ) {
12811 a1 =
set( A(i,k) );
12812 xmm1 += a1 * B.load(k,j );
12813 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
12814 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
12815 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
12816 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
12817 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
12818 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
12819 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
12822 C.store( i, j , C.load(i,j ) - xmm1 * factor );
12839 for( ; (i+2UL) <= M; i+=2UL )
12841 const size_t kbegin( ( IsUpper_v<MT4> )
12842 ?( ( IsLower_v<MT5> )
12843 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12844 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12845 :( IsLower_v<MT5> ? j : 0UL ) );
12846 const size_t kend( ( IsLower_v<MT4> )
12847 ?( ( IsUpper_v<MT5> )
12848 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
12849 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12850 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
12852 size_t k( kbegin );
12856 SIMDType a1(
set( A(i ,k) ) );
12857 SIMDType a2(
set( A(i+1UL,k) ) );
12858 SIMDType b1( B.load(k,j ) );
12859 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
12860 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
12861 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
12862 SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
12863 SIMDType xmm1 ( a1 * b1 );
12864 SIMDType xmm2 ( a1 * b2 );
12865 SIMDType xmm3 ( a1 * b3 );
12866 SIMDType xmm4 ( a1 * b4 );
12867 SIMDType xmm5 ( a1 * b5 );
12868 SIMDType xmm6 ( a2 * b1 );
12869 SIMDType xmm7 ( a2 * b2 );
12870 SIMDType xmm8 ( a2 * b3 );
12871 SIMDType xmm9 ( a2 * b4 );
12872 SIMDType xmm10( a2 * b5 );
12874 for( ++k; k<kend; ++k ) {
12875 a1 =
set( A(i ,k) );
12876 a2 =
set( A(i+1UL,k) );
12894 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
12899 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
12901 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm8 * factor );
12902 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm9 * factor );
12903 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) - xmm10 * factor );
12909 const size_t kbegin( ( IsUpper_v<MT4> )
12910 ?( ( IsLower_v<MT5> )
12911 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12912 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12913 :( IsLower_v<MT5> ? j : 0UL ) );
12914 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
12916 size_t k( kbegin );
12920 SIMDType a1(
set( A(i,k) ) );
12921 SIMDType xmm1( a1 * B.load(k,j ) );
12922 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
12923 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
12924 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
12925 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
12927 for( ++k; k<kend; ++k ) {
12928 a1 =
set( A(i,k) );
12929 xmm1 += a1 * B.load(k,j );
12930 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
12931 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
12932 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
12933 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
12936 C.store( i, j , C.load(i,j ) - xmm1 * factor );
12949 for( ; (i+2UL) <= M; i+=2UL )
12951 const size_t kbegin( ( IsUpper_v<MT4> )
12952 ?( ( IsLower_v<MT5> )
12953 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
12954 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
12955 :( IsLower_v<MT5> ? j : 0UL ) );
12956 const size_t kend( ( IsLower_v<MT4> )
12957 ?( ( IsUpper_v<MT5> )
12958 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
12959 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
12960 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
12962 size_t k( kbegin );
12966 SIMDType a1(
set( A(i ,k) ) );
12967 SIMDType a2(
set( A(i+1UL,k) ) );
12968 SIMDType b1( B.load(k,j ) );
12969 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
12970 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
12971 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
12972 SIMDType xmm1( a1 * b1 );
12973 SIMDType xmm2( a1 * b2 );
12974 SIMDType xmm3( a1 * b3 );
12975 SIMDType xmm4( a1 * b4 );
12976 SIMDType xmm5( a2 * b1 );
12977 SIMDType xmm6( a2 * b2 );
12978 SIMDType xmm7( a2 * b3 );
12979 SIMDType xmm8( a2 * b4 );
12981 for( ++k; k<kend; ++k ) {
12982 a1 =
set( A(i ,k) );
12983 a2 =
set( A(i+1UL,k) );
12998 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13002 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
13004 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm7 * factor );
13005 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm8 * factor );
13011 const size_t kbegin( ( IsUpper_v<MT4> )
13012 ?( ( IsLower_v<MT5> )
13013 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13014 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13015 :( IsLower_v<MT5> ? j : 0UL ) );
13016 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
13018 size_t k( kbegin );
13022 SIMDType a1(
set( A(i,k) ) );
13023 SIMDType xmm1( a1 * B.load(k,j ) );
13024 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
13025 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
13026 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
13028 for( ++k; k<kend; ++k ) {
13029 a1 =
set( A(i,k) );
13030 xmm1 += a1 * B.load(k,j );
13031 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
13032 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
13033 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
13036 C.store( i, j , C.load(i,j ) - xmm1 * factor );
13048 for( ; (i+2UL) <= M; i+=2UL )
13050 const size_t kbegin( ( IsUpper_v<MT4> )
13051 ?( ( IsLower_v<MT5> )
13052 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13053 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13054 :( IsLower_v<MT5> ? j : 0UL ) );
13055 const size_t kend( ( IsLower_v<MT4> )
13056 ?( ( IsUpper_v<MT5> )
13057 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
13058 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
13059 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
13061 size_t k( kbegin );
13065 SIMDType a1(
set( A(i ,k) ) );
13066 SIMDType a2(
set( A(i+1UL,k) ) );
13067 SIMDType b1( B.load(k,j ) );
13068 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
13069 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
13070 SIMDType xmm1( a1 * b1 );
13071 SIMDType xmm2( a1 * b2 );
13072 SIMDType xmm3( a1 * b3 );
13073 SIMDType xmm4( a2 * b1 );
13074 SIMDType xmm5( a2 * b2 );
13075 SIMDType xmm6( a2 * b3 );
13077 for( ++k; k<kend; ++k ) {
13078 a1 =
set( A(i ,k) );
13079 a2 =
set( A(i+1UL,k) );
13091 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13094 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
13096 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm6 * factor );
13102 const size_t kbegin( ( IsUpper_v<MT4> )
13103 ?( ( IsLower_v<MT5> )
13104 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13105 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13106 :( IsLower_v<MT5> ? j : 0UL ) );
13107 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
13109 size_t k( kbegin );
13113 SIMDType a1(
set( A(i,k) ) );
13114 SIMDType xmm1( a1 * B.load(k,j ) );
13115 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
13116 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
13118 for( ++k; k<kend; ++k ) {
13119 a1 =
set( A(i,k) );
13120 xmm1 += a1 * B.load(k,j );
13121 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
13122 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
13125 C.store( i, j , C.load(i,j ) - xmm1 * factor );
13134 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
13135 size_t i( LOW ? j : 0UL );
13137 for( ; (i+4UL) <= iend; i+=4UL )
13139 const size_t kbegin( ( IsUpper_v<MT4> )
13140 ?( ( IsLower_v<MT5> )
13141 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13142 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13143 :( IsLower_v<MT5> ? j : 0UL ) );
13144 const size_t kend( ( IsLower_v<MT4> )
13145 ?( ( IsUpper_v<MT5> )
13146 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
13147 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
13148 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
13150 size_t k( kbegin );
13154 SIMDType a1(
set( A(i ,k) ) );
13155 SIMDType a2(
set( A(i+1UL,k) ) );
13156 SIMDType a3(
set( A(i+2UL,k) ) );
13157 SIMDType a4(
set( A(i+3UL,k) ) );
13158 SIMDType b1( B.load(k,j ) );
13159 SIMDType b2( B.load(k,j+
SIMDSIZE) );
13160 SIMDType xmm1( a1 * b1 );
13161 SIMDType xmm2( a1 * b2 );
13162 SIMDType xmm3( a2 * b1 );
13163 SIMDType xmm4( a2 * b2 );
13164 SIMDType xmm5( a3 * b1 );
13165 SIMDType xmm6( a3 * b2 );
13166 SIMDType xmm7( a4 * b1 );
13167 SIMDType xmm8( a4 * b2 );
13169 for( ++k; k<kend; ++k ) {
13170 a1 =
set( A(i ,k) );
13171 a2 =
set( A(i+1UL,k) );
13172 a3 =
set( A(i+2UL,k) );
13173 a4 =
set( A(i+3UL,k) );
13186 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13188 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
13190 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
13192 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
13197 for( ; (i+3UL) <= iend; i+=3UL )
13199 const size_t kbegin( ( IsUpper_v<MT4> )
13200 ?( ( IsLower_v<MT5> )
13201 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13202 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13203 :( IsLower_v<MT5> ? j : 0UL ) );
13204 const size_t kend( ( IsLower_v<MT4> )
13205 ?( ( IsUpper_v<MT5> )
13206 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
13207 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
13208 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
13210 size_t k( kbegin );
13214 SIMDType a1(
set( A(i ,k) ) );
13215 SIMDType a2(
set( A(i+1UL,k) ) );
13216 SIMDType a3(
set( A(i+2UL,k) ) );
13217 SIMDType b1( B.load(k,j ) );
13218 SIMDType b2( B.load(k,j+
SIMDSIZE) );
13219 SIMDType xmm1( a1 * b1 );
13220 SIMDType xmm2( a1 * b2 );
13221 SIMDType xmm3( a2 * b1 );
13222 SIMDType xmm4( a2 * b2 );
13223 SIMDType xmm5( a3 * b1 );
13224 SIMDType xmm6( a3 * b2 );
13226 for( ++k; k<kend; ++k ) {
13227 a1 =
set( A(i ,k) );
13228 a2 =
set( A(i+1UL,k) );
13229 a3 =
set( A(i+2UL,k) );
13240 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13242 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
13244 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
13249 for( ; (i+2UL) <= iend; i+=2UL )
13251 const size_t kbegin( ( IsUpper_v<MT4> )
13252 ?( ( IsLower_v<MT5> )
13253 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13254 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13255 :( IsLower_v<MT5> ? j : 0UL ) );
13256 const size_t kend( ( IsLower_v<MT4> )
13257 ?( ( IsUpper_v<MT5> )
13258 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
13259 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
13260 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
13262 size_t k( kbegin );
13266 SIMDType a1(
set( A(i ,k) ) );
13267 SIMDType a2(
set( A(i+1UL,k) ) );
13268 SIMDType b1( B.load(k,j ) );
13269 SIMDType b2( B.load(k,j+
SIMDSIZE) );
13270 SIMDType xmm1( a1 * b1 );
13271 SIMDType xmm2( a1 * b2 );
13272 SIMDType xmm3( a2 * b1 );
13273 SIMDType xmm4( a2 * b2 );
13275 for( ++k; k<kend; ++k ) {
13276 a1 =
set( A(i ,k) );
13277 a2 =
set( A(i+1UL,k) );
13286 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13288 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
13295 const size_t kbegin( ( IsUpper_v<MT4> )
13296 ?( ( IsLower_v<MT5> )
13297 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13298 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13299 :( IsLower_v<MT5> ? j : 0UL ) );
13300 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
13302 size_t k( kbegin );
13306 SIMDType a1(
set( A(i,k) ) );
13307 SIMDType xmm1( a1 * B.load(k,j ) );
13308 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE) );
13310 for( ++k; k<kend; ++k ) {
13311 a1 =
set( A(i,k) );
13312 xmm1 += a1 * B.load(k,j );
13313 xmm2 += a1 * B.load(k,j+
SIMDSIZE);
13316 C.store( i, j , C.load(i,j ) - xmm1 * factor );
13324 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
13325 size_t i( LOW ? j : 0UL );
13327 for( ; (i+4UL) <= iend; i+=4UL )
13329 const size_t kbegin( ( IsUpper_v<MT4> )
13330 ?( ( IsLower_v<MT5> )
13331 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13332 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13333 :( IsLower_v<MT5> ? j : 0UL ) );
13334 const size_t kend( ( IsLower_v<MT4> )
13335 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
13338 size_t k( kbegin );
13342 SIMDType b1( B.load(k,j) );
13343 SIMDType xmm1(
set( A(i ,k) ) * b1 );
13344 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
13345 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
13346 SIMDType xmm4(
set( A(i+3UL,k) ) * b1 );
13348 for( ++k; k<kend; ++k ) {
13350 xmm1 +=
set( A(i ,k) ) * b1;
13351 xmm2 +=
set( A(i+1UL,k) ) * b1;
13352 xmm3 +=
set( A(i+2UL,k) ) * b1;
13353 xmm4 +=
set( A(i+3UL,k) ) * b1;
13356 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13357 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
13358 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
13359 C.store( i+3UL, j, C.load(i+3UL,j) - xmm4 * factor );
13363 for( ; (i+3UL) <= iend; i+=3UL )
13365 const size_t kbegin( ( IsUpper_v<MT4> )
13366 ?( ( IsLower_v<MT5> )
13367 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13368 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13369 :( IsLower_v<MT5> ? j : 0UL ) );
13370 const size_t kend( ( IsLower_v<MT4> )
13371 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
13374 size_t k( kbegin );
13378 SIMDType b1( B.load(k,j) );
13379 SIMDType xmm1(
set( A(i ,k) ) * b1 );
13380 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
13381 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
13383 for( ++k; k<kend; ++k ) {
13385 xmm1 +=
set( A(i ,k) ) * b1;
13386 xmm2 +=
set( A(i+1UL,k) ) * b1;
13387 xmm3 +=
set( A(i+2UL,k) ) * b1;
13390 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13391 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
13392 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
13396 for( ; (i+2UL) <= iend; i+=2UL )
13398 const size_t kbegin( ( IsUpper_v<MT4> )
13399 ?( ( IsLower_v<MT5> )
13400 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13401 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13402 :( IsLower_v<MT5> ? j : 0UL ) );
13403 const size_t kend( ( IsLower_v<MT4> )
13404 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
13407 size_t k( kbegin );
13411 SIMDType b1( B.load(k,j) );
13412 SIMDType xmm1(
set( A(i ,k) ) * b1 );
13413 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
13415 for( ++k; k<kend; ++k ) {
13417 xmm1 +=
set( A(i ,k) ) * b1;
13418 xmm2 +=
set( A(i+1UL,k) ) * b1;
13421 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13422 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
13428 const size_t kbegin( ( IsUpper_v<MT4> )
13429 ?( ( IsLower_v<MT5> )
13430 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13431 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13432 :( IsLower_v<MT5> ? j : 0UL ) );
13434 size_t k( kbegin );
13438 SIMDType xmm1(
set( A(i,k) ) * B.load(k,j) );
13440 for( ++k; k<K; ++k ) {
13441 xmm1 +=
set( A(i,k) ) * B.load(k,j);
13444 C.store( i, j, C.load(i,j) - xmm1 * factor );
13449 for( ; remainder && j<N; ++j )
13451 const size_t iend( UPP ? j+1UL : M );
13452 size_t i( LOW ? j : 0UL );
13454 for( ; (i+2UL) <= iend; i+=2UL )
13456 const size_t kbegin( ( IsUpper_v<MT4> )
13457 ?( ( IsLower_v<MT5> )
13458 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13459 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13460 :( IsLower_v<MT5> ? j : 0UL ) );
13461 const size_t kend( ( IsLower_v<MT4> )
13462 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
13465 size_t k( kbegin );
13472 for( ++k; k<kend; ++k ) {
13473 value1 += A(i ,k) * B(k,j);
13474 value2 += A(i+1UL,k) * B(k,j);
13477 C(i ,j) -= value1 * scalar;
13478 C(i+1UL,j) -= value2 * scalar;
13484 const size_t kbegin( ( IsUpper_v<MT4> )
13485 ?( ( IsLower_v<MT5> )
13486 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
13487 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
13488 :( IsLower_v<MT5> ? j : 0UL ) );
13490 size_t k( kbegin );
13496 for( ++k; k<K; ++k ) {
13497 value += A(i,k) * B(k,j);
13500 C(i,j) -= value * scalar;
13522 template<
typename MT3
13526 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
13527 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
13529 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
13531 const size_t M( A.rows() );
13532 const size_t N( B.columns() );
13533 const size_t K( A.columns() );
13540 const SIMDType factor(
set( scalar ) );
13544 if( IsIntegral_v<ElementType> )
13547 for(
size_t j=0UL; j<N; ++j )
13549 const size_t kbegin( ( IsLower_v<MT5> )
13550 ?( ( IsUpper_v<MT4> )
13551 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13552 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13553 :( IsUpper_v<MT4> ? i : 0UL ) );
13554 const size_t kend( ( IsUpper_v<MT5> )
13555 ?( ( IsLower_v<MT4> )
13556 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
13557 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
13558 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
13560 size_t k( kbegin );
13564 SIMDType b1(
set( B(k,j) ) );
13565 SIMDType xmm1( A.load(i ,k) * b1 );
13566 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
13567 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
13568 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
13569 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
13570 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,k) * b1 );
13571 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,k) * b1 );
13572 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,k) * b1 );
13574 for( ++k; k<kend; ++k ) {
13575 b1 =
set( B(k,j) );
13576 xmm1 += A.load(i ,k) * b1;
13577 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
13578 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
13579 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
13580 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
13581 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
13582 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
13583 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
13586 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13603 for( ; (j+2UL) <= N; j+=2UL )
13605 const size_t kbegin( ( IsLower_v<MT5> )
13606 ?( ( IsUpper_v<MT4> )
13607 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13608 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13609 :( IsUpper_v<MT4> ? i : 0UL ) );
13610 const size_t kend( ( IsUpper_v<MT5> )
13611 ?( ( IsLower_v<MT4> )
13612 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
13613 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
13614 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
13616 size_t k( kbegin );
13620 SIMDType a1( A.load(i ,k) );
13621 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
13622 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
13623 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
13624 SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
13625 SIMDType b1(
set( B(k,j ) ) );
13626 SIMDType b2(
set( B(k,j+1UL) ) );
13627 SIMDType xmm1 ( a1 * b1 );
13628 SIMDType xmm2 ( a2 * b1 );
13629 SIMDType xmm3 ( a3 * b1 );
13630 SIMDType xmm4 ( a4 * b1 );
13631 SIMDType xmm5 ( a5 * b1 );
13632 SIMDType xmm6 ( a1 * b2 );
13633 SIMDType xmm7 ( a2 * b2 );
13634 SIMDType xmm8 ( a3 * b2 );
13635 SIMDType xmm9 ( a4 * b2 );
13636 SIMDType xmm10( a5 * b2 );
13638 for( ++k; k<kend; ++k ) {
13644 b1 =
set( B(k,j ) );
13645 b2 =
set( B(k,j+1UL) );
13658 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13663 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
13665 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
13666 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
13667 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
13673 const size_t kbegin( ( IsLower_v<MT5> )
13674 ?( ( IsUpper_v<MT4> )
13675 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13676 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13677 :( IsUpper_v<MT4> ? i : 0UL ) );
13678 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
13680 size_t k( kbegin );
13684 SIMDType b1(
set( B(k,j) ) );
13685 SIMDType xmm1( A.load(i ,k) * b1 );
13686 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
13687 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
13688 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
13689 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
13691 for( ++k; k<kend; ++k ) {
13692 b1 =
set( B(k,j) );
13693 xmm1 += A.load(i ,k) * b1;
13694 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
13695 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
13696 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
13697 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
13700 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13713 for( ; (j+2UL) <= N; j+=2UL )
13715 const size_t kbegin( ( IsLower_v<MT5> )
13716 ?( ( IsUpper_v<MT4> )
13717 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13718 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13719 :( IsUpper_v<MT4> ? i : 0UL ) );
13720 const size_t kend( ( IsUpper_v<MT5> )
13721 ?( ( IsLower_v<MT4> )
13722 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
13723 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
13724 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
13726 size_t k( kbegin );
13730 SIMDType a1( A.load(i ,k) );
13731 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
13732 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
13733 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
13734 SIMDType b1(
set( B(k,j ) ) );
13735 SIMDType b2(
set( B(k,j+1UL) ) );
13736 SIMDType xmm1( a1 * b1 );
13737 SIMDType xmm2( a2 * b1 );
13738 SIMDType xmm3( a3 * b1 );
13739 SIMDType xmm4( a4 * b1 );
13740 SIMDType xmm5( a1 * b2 );
13741 SIMDType xmm6( a2 * b2 );
13742 SIMDType xmm7( a3 * b2 );
13743 SIMDType xmm8( a4 * b2 );
13745 for( ++k; k<kend; ++k ) {
13750 b1 =
set( B(k,j ) );
13751 b2 =
set( B(k,j+1UL) );
13762 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13766 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
13768 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
13769 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
13775 const size_t kbegin( ( IsLower_v<MT5> )
13776 ?( ( IsUpper_v<MT4> )
13777 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13778 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13779 :( IsUpper_v<MT4> ? i : 0UL ) );
13780 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
13782 size_t k( kbegin );
13786 SIMDType b1(
set( B(k,j) ) );
13787 SIMDType xmm1( A.load(i ,k) * b1 );
13788 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
13789 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
13790 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
13792 for( ++k; k<kend; ++k ) {
13793 b1 =
set( B(k,j) );
13794 xmm1 += A.load(i ,k) * b1;
13795 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
13796 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
13797 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
13800 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13812 for( ; (j+2UL) <= N; j+=2UL )
13814 const size_t kbegin( ( IsLower_v<MT5> )
13815 ?( ( IsUpper_v<MT4> )
13816 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13817 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13818 :( IsUpper_v<MT4> ? i : 0UL ) );
13819 const size_t kend( ( IsUpper_v<MT5> )
13820 ?( ( IsLower_v<MT4> )
13821 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
13822 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
13823 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
13825 size_t k( kbegin );
13829 SIMDType a1( A.load(i ,k) );
13830 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
13831 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
13832 SIMDType b1(
set( B(k,j ) ) );
13833 SIMDType b2(
set( B(k,j+1UL) ) );
13834 SIMDType xmm1( a1 * b1 );
13835 SIMDType xmm2( a2 * b1 );
13836 SIMDType xmm3( a3 * b1 );
13837 SIMDType xmm4( a1 * b2 );
13838 SIMDType xmm5( a2 * b2 );
13839 SIMDType xmm6( a3 * b2 );
13841 for( ++k; k<kend; ++k ) {
13845 b1 =
set( B(k,j ) );
13846 b2 =
set( B(k,j+1UL) );
13855 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13858 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
13860 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
13866 const size_t kbegin( ( IsLower_v<MT5> )
13867 ?( ( IsUpper_v<MT4> )
13868 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13869 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13870 :( IsUpper_v<MT4> ? i : 0UL ) );
13871 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
13873 size_t k( kbegin );
13877 SIMDType b1(
set( B(k,j) ) );
13878 SIMDType xmm1( A.load(i ,k) * b1 );
13879 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
13880 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
13882 for( ++k; k<kend; ++k ) {
13883 b1 =
set( B(k,j) );
13884 xmm1 += A.load(i ,k) * b1;
13885 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
13886 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
13889 C.store( i , j, C.load(i ,j) - xmm1 * factor );
13898 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
13899 size_t j( UPP ? i : 0UL );
13901 for( ; (j+4UL) <= jend; j+=4UL )
13903 const size_t kbegin( ( IsLower_v<MT5> )
13904 ?( ( IsUpper_v<MT4> )
13905 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13906 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13907 :( IsUpper_v<MT4> ? i : 0UL ) );
13908 const size_t kend( ( IsUpper_v<MT5> )
13909 ?( ( IsLower_v<MT4> )
13910 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
13911 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
13912 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
13914 size_t k( kbegin );
13918 SIMDType a1( A.load(i ,k) );
13919 SIMDType a2( A.load(i+
SIMDSIZE,k) );
13920 SIMDType b1(
set( B(k,j ) ) );
13921 SIMDType b2(
set( B(k,j+1UL) ) );
13922 SIMDType b3(
set( B(k,j+2UL) ) );
13923 SIMDType b4(
set( B(k,j+3UL) ) );
13924 SIMDType xmm1( a1 * b1 );
13925 SIMDType xmm2( a2 * b1 );
13926 SIMDType xmm3( a1 * b2 );
13927 SIMDType xmm4( a2 * b2 );
13928 SIMDType xmm5( a1 * b3 );
13929 SIMDType xmm6( a2 * b3 );
13930 SIMDType xmm7( a1 * b4 );
13931 SIMDType xmm8( a2 * b4 );
13933 for( ++k; k<kend; ++k ) {
13936 b1 =
set( B(k,j ) );
13937 b2 =
set( B(k,j+1UL) );
13938 b3 =
set( B(k,j+2UL) );
13939 b4 =
set( B(k,j+3UL) );
13950 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
13952 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
13954 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
13956 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
13961 for( ; (j+3UL) <= jend; j+=3UL )
13963 const size_t kbegin( ( IsLower_v<MT5> )
13964 ?( ( IsUpper_v<MT4> )
13965 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
13966 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
13967 :( IsUpper_v<MT4> ? i : 0UL ) );
13968 const size_t kend( ( IsUpper_v<MT5> )
13969 ?( ( IsLower_v<MT4> )
13970 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
13971 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
13972 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
13974 size_t k( kbegin );
13978 SIMDType a1( A.load(i ,k) );
13979 SIMDType a2( A.load(i+
SIMDSIZE,k) );
13980 SIMDType b1(
set( B(k,j ) ) );
13981 SIMDType b2(
set( B(k,j+1UL) ) );
13982 SIMDType b3(
set( B(k,j+2UL) ) );
13983 SIMDType xmm1( a1 * b1 );
13984 SIMDType xmm2( a2 * b1 );
13985 SIMDType xmm3( a1 * b2 );
13986 SIMDType xmm4( a2 * b2 );
13987 SIMDType xmm5( a1 * b3 );
13988 SIMDType xmm6( a2 * b3 );
13990 for( ++k; k<kend; ++k ) {
13993 b1 =
set( B(k,j ) );
13994 b2 =
set( B(k,j+1UL) );
13995 b3 =
set( B(k,j+2UL) );
14004 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
14006 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
14008 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
14013 for( ; (j+2UL) <= jend; j+=2UL )
14015 const size_t kbegin( ( IsLower_v<MT5> )
14016 ?( ( IsUpper_v<MT4> )
14017 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14018 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14019 :( IsUpper_v<MT4> ? i : 0UL ) );
14020 const size_t kend( ( IsUpper_v<MT5> )
14021 ?( ( IsLower_v<MT4> )
14022 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
14023 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
14024 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
14026 size_t k( kbegin );
14030 SIMDType a1( A.load(i ,k) );
14031 SIMDType a2( A.load(i+
SIMDSIZE,k) );
14032 SIMDType b1(
set( B(k,j ) ) );
14033 SIMDType b2(
set( B(k,j+1UL) ) );
14034 SIMDType xmm1( a1 * b1 );
14035 SIMDType xmm2( a2 * b1 );
14036 SIMDType xmm3( a1 * b2 );
14037 SIMDType xmm4( a2 * b2 );
14039 for( ++k; k<kend; ++k ) {
14042 b1 =
set( B(k,j ) );
14043 b2 =
set( B(k,j+1UL) );
14050 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
14052 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
14059 const size_t kbegin( ( IsLower_v<MT5> )
14060 ?( ( IsUpper_v<MT4> )
14061 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14062 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14063 :( IsUpper_v<MT4> ? i : 0UL ) );
14064 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
14066 size_t k( kbegin );
14070 SIMDType b1(
set( B(k,j) ) );
14071 SIMDType xmm1( A.load(i ,k) * b1 );
14072 SIMDType xmm2( A.load(i+
SIMDSIZE,k) * b1 );
14074 for( ++k; k<kend; ++k ) {
14075 b1 =
set( B(k,j) );
14076 xmm1 += A.load(i ,k) * b1;
14077 xmm2 += A.load(i+
SIMDSIZE,k) * b1;
14080 C.store( i , j, C.load(i ,j) - xmm1 * factor );
14088 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
14089 size_t j( UPP ? i : 0UL );
14091 for( ; (j+4UL) <= jend; j+=4UL )
14093 const size_t kbegin( ( IsLower_v<MT5> )
14094 ?( ( IsUpper_v<MT4> )
14095 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14096 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14097 :( IsUpper_v<MT4> ? i : 0UL ) );
14098 const size_t kend( ( IsUpper_v<MT5> )
14099 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
14102 size_t k( kbegin );
14106 SIMDType a1( A.load(i,k) );
14107 SIMDType xmm1( a1 *
set( B(k,j ) ) );
14108 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
14109 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
14110 SIMDType xmm4( a1 *
set( B(k,j+3UL) ) );
14112 for( ++k; k<kend; ++k ) {
14114 xmm1 += a1 *
set( B(k,j ) );
14115 xmm2 += a1 *
set( B(k,j+1UL) );
14116 xmm3 += a1 *
set( B(k,j+2UL) );
14117 xmm4 += a1 *
set( B(k,j+3UL) );
14120 C.store( i, j , C.load(i,j ) - xmm1 * factor );
14121 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
14122 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
14123 C.store( i, j+3UL, C.load(i,j+3UL) - xmm4 * factor );
14127 for( ; (j+3UL) <= jend; j+=3UL )
14129 const size_t kbegin( ( IsLower_v<MT5> )
14130 ?( ( IsUpper_v<MT4> )
14131 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14132 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14133 :( IsUpper_v<MT4> ? i : 0UL ) );
14134 const size_t kend( ( IsUpper_v<MT5> )
14135 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
14138 size_t k( kbegin );
14142 SIMDType a1( A.load(i,k) );
14143 SIMDType xmm1( a1 *
set( B(k,j ) ) );
14144 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
14145 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
14147 for( ++k; k<kend; ++k ) {
14149 xmm1 += a1 *
set( B(k,j ) );
14150 xmm2 += a1 *
set( B(k,j+1UL) );
14151 xmm3 += a1 *
set( B(k,j+2UL) );
14154 C.store( i, j , C.load(i,j ) - xmm1 * factor );
14155 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
14156 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
14160 for( ; (j+2UL) <= jend; j+=2UL )
14162 const size_t kbegin( ( IsLower_v<MT5> )
14163 ?( ( IsUpper_v<MT4> )
14164 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14165 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14166 :( IsUpper_v<MT4> ? i : 0UL ) );
14167 const size_t kend( ( IsUpper_v<MT5> )
14168 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
14171 size_t k( kbegin );
14175 SIMDType a1( A.load(i,k) );
14176 SIMDType xmm1( a1 *
set( B(k,j ) ) );
14177 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
14179 for( ++k; k<kend; ++k ) {
14181 xmm1 += a1 *
set( B(k,j ) );
14182 xmm2 += a1 *
set( B(k,j+1UL) );
14185 C.store( i, j , C.load(i,j ) - xmm1 * factor );
14186 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
14192 const size_t kbegin( ( IsLower_v<MT5> )
14193 ?( ( IsUpper_v<MT4> )
14194 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14195 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14196 :( IsUpper_v<MT4> ? i : 0UL ) );
14198 size_t k( kbegin );
14202 SIMDType xmm1( A.load(i,k) *
set( B(k,j) ) );
14204 for( ++k; k<K; ++k ) {
14205 xmm1 += A.load(i,k) *
set( B(k,j) );
14208 C.store( i, j, C.load(i,j) - xmm1 * factor );
14213 for( ; remainder && i<M; ++i )
14215 const size_t jend( LOW ? i+1UL : N );
14216 size_t j( UPP ? i : 0UL );
14218 for( ; (j+2UL) <= jend; j+=2UL )
14220 const size_t kbegin( ( IsLower_v<MT5> )
14221 ?( ( IsUpper_v<MT4> )
14222 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14223 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14224 :( IsUpper_v<MT4> ? i : 0UL ) );
14225 const size_t kend( ( IsUpper_v<MT5> )
14226 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
14229 size_t k( kbegin );
14236 for( ++k; k<kend; ++k ) {
14237 value1 += A(i,k) * B(k,j );
14238 value2 += A(i,k) * B(k,j+1UL);
14241 C(i,j ) -= value1 * scalar;
14242 C(i,j+1UL) -= value2 * scalar;
14248 const size_t kbegin( ( IsLower_v<MT5> )
14249 ?( ( IsUpper_v<MT4> )
14250 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
14251 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
14252 :( IsUpper_v<MT4> ? i : 0UL ) );
14254 size_t k( kbegin );
14260 for( ++k; k<K; ++k ) {
14261 value += A(i,k) * B(k,j);
14264 C(i,j) -= value * scalar;
14285 template<
typename MT3
14289 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
14290 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
14292 selectDefaultSubAssignKernel( C, A, B, scalar );
14311 template<
typename MT3
14315 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
14316 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
14319 lmmm( C, A, B, -scalar, ST2(1) );
14321 ummm( C, A, B, -scalar, ST2(1) );
14323 mmm( C, A, B, -scalar, ST2(1) );
14341 template<
typename MT3
14345 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
14346 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
14348 selectLargeSubAssignKernel( C, A, B, scalar );
14353#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
14367 template<
typename MT3
14371 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
14372 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
14374 using ET = ElementType_t<MT3>;
14376 if( IsTriangular_v<MT4> ) {
14377 ResultType_t<MT3> tmp(
serial( B ) );
14378 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
14379 subAssign( C, tmp );
14381 else if( IsTriangular_v<MT5> ) {
14382 ResultType_t<MT3> tmp(
serial( A ) );
14383 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
14384 subAssign( C, tmp );
14387 gemm( C, A, B,
ET(-scalar),
ET(1) );
14409 template<
typename MT
14411 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
14423 schurAssign( *lhs, tmp );
14454 template<
typename MT
14457 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14464 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
14465 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
14467 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
14470 else if( left.columns() == 0UL ) {
14485 smpAssign( *lhs, A * B * rhs.scalar_ );
14504 template<
typename MT
14507 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14511 using TmpType = If_t< SO, ResultType, OppositeType >;
14523 const ForwardFunctor fwd;
14525 const TmpType tmp( rhs );
14545 template<
typename MT
14548 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14555 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
14556 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
14558 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
14595 template<
typename MT
14598 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
14605 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
14606 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
14608 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
14642 template<
typename MT
14725template<
typename MT1
14727inline decltype(
auto)
14732 if( (*lhs).columns() != (*rhs).rows() ) {
14737 return ReturnType( *lhs, *rhs );
14775template<
typename MT1
14781inline decltype(
auto)
declsym(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
14789 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
14790 return ReturnType( dm.leftOperand(), dm.rightOperand() );
14821template<
typename MT1
14827inline decltype(
auto)
declherm(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
14835 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
14836 return ReturnType( dm.leftOperand(), dm.rightOperand() );
14867template<
typename MT1
14873inline decltype(
auto)
decllow(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
14881 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
14882 return ReturnType( dm.leftOperand(), dm.rightOperand() );
14913template<
typename MT1
14918inline decltype(
auto)
declunilow(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
14957template<
typename MT1
14962inline decltype(
auto)
declstrlow(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
15001template<
typename MT1
15007inline decltype(
auto)
declupp(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
15015 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
15016 return ReturnType( dm.leftOperand(), dm.rightOperand() );
15047template<
typename MT1
15052inline decltype(
auto)
decluniupp(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
15091template<
typename MT1
15096inline decltype(
auto)
declstrupp(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
15135template<
typename MT1
15141inline decltype(
auto)
decldiag(
const TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
15149 using ReturnType =
const TDMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
15150 return ReturnType( dm.leftOperand(), dm.rightOperand() );
15166template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
15167struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
15168 :
public Size<MT1,0UL>
// Size trait specialization for dimension index 1UL of a TDMatDMatMultExpr.
// It derives from Size<MT2,1UL>, i.e. the value is taken from the right-hand
// operand type MT2.
15171template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
15172struct Size< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
15173 :
public Size<MT2,1UL>
// IsAligned trait specialization for TDMatDMatMultExpr.
// The expression is reported as aligned only when both operand types are:
// the base is BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >.
15189template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
15190struct IsAligned< TDMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
15191 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense matrix-dense matrix multiplications.
Definition: TDMatDMatMultExpr.h:148
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:281
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDMatMultExpr.h:465
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDMatMultExpr.h:289
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:278
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:275
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatDMatMultExpr.h:326
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:155
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatDMatMultExpr.h:172
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:268
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:166
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:421
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatDMatMultExpr.h:375
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:152
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatDMatMultExpr.h:171
TDMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatDMatMultExpr class.
Definition: TDMatDMatMultExpr.h:311
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatDMatMultExpr.h:173
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDMatMultExpr.h:272
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:156
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:271
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:153
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDMatMultExpr.h:455
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:477
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatDMatMultExpr.h:478
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDMatMultExpr.h:411
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatDMatMultExpr.h:267
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatDMatMultExpr.h:401
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatDMatMultExpr.h:391
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDMatMultExpr.h:269
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:154
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDMatMultExpr.h:302
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDMatMultExpr.h:433
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDMatMultExpr.h:445
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:161
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatDMatMultExpr.h:151
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDMatMultExpr.h:265
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDMatMultExpr.h:270
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatDMatMultExpr.h:284
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDMatMultExpr.h:296
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatDMatMultExpr.h:170
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm).
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
constexpr bool IsSIMDCombinable_v
Auxiliary variable template for the IsSIMDCombinable type trait.
Definition: IsSIMDCombinable.h:137
constexpr bool HasSIMDAdd_v
Auxiliary variable template for the HasSIMDAdd type trait.
Definition: HasSIMDAdd.h:187
constexpr bool HasSIMDMult_v
Auxiliary variable template for the HasSIMDMult type trait.
Definition: HasSIMDMult.h:188
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
constexpr decltype(auto) zero(size_t m, size_t n) noexcept
Creating a zero matrix.
Definition: ZeroMatrix.h:1356
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for the debugging policy of the Blaze library.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.