35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 143 template<
typename MT1
149 class TDMatTDMatMultExpr
150 :
public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
151 ,
private Computation
// Compile-time switch for the left-hand operand type MT1: true when MT1 is itself a
// computation or reports that it requires evaluation.
165 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Same switch for the right-hand operand type MT2.
170 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the class template flags SF, HF, LF and UF.
// NOTE(review): SF/HF/LF/UF presumably mean symmetric/Hermitian/lower/upper - their
// declaration is not visible in this excerpt.
// SYM: SF is set and none of HF, LF, UF is set.
174 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set.
175 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is combined with SF or HF.
176 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is combined with SF or HF (mirror image of LOW).
177 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Helper variable template used to enable/disable overloads further down in the class.
// It is true when T1 is a row-major matrix type and at least one of T2 and T3 is a
// symmetric matrix type. In the visible uses T1 is the assignment target type and
// T2/T3 are the operand types MT1/MT2.
187 template<
typename T1,
typename T2,
typename T3 >
188 static constexpr
bool CanExploitSymmetry_v =
189 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
199 template<
typename T1,
typename T2,
typename T3 >
200 static constexpr
bool IsEvaluationRequired_v =
210 template<
typename T1,
typename T2,
typename T3 >
211 static constexpr
bool UseBlasKernel_v =
214 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
215 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
216 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
217 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219 IsBLASCompatible_v< ElementType_t<T1> > &&
220 IsBLASCompatible_v< ElementType_t<T2> > &&
221 IsBLASCompatible_v< ElementType_t<T3> > &&
232 template<
typename T1,
typename T2,
typename T3 >
233 static constexpr
bool UseVectorizedDefaultKernel_v =
234 ( useOptimizedKernels &&
235 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
236 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
237 IsSIMDCombinable_v< ElementType_t<T1>
308 ( !IsDiagonal_v<MT1> &&
309 MT1::simdEnabled && MT2::simdEnabled &&
310 HasSIMDAdd_v<ET1,ET2> &&
311 HasSIMDMult_v<ET1,ET2> );
348 if( IsDiagonal_v<MT1> ) {
351 else if( IsDiagonal_v<MT2> ) {
354 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
355 const size_t begin( ( IsUpper_v<MT1> )
356 ?( ( IsLower_v<MT2> )
357 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
358 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
359 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
360 :( ( IsLower_v<MT2> )
361 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
363 const size_t end( ( IsLower_v<MT1> )
364 ?( ( IsUpper_v<MT2> )
365 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
366 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
367 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
368 :( ( IsUpper_v<MT2> )
369 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
370 :(
lhs_.columns() ) ) );
394 if( i >=
lhs_.rows() ) {
397 if( j >=
rhs_.columns() ) {
409 inline size_t rows() const noexcept {
420 return rhs_.columns();
// Reports whether the expression can alias with the object at the given address.
// alias: address to check against both operands.
// Returns true if either operand (lhs_ or rhs_) reports isAliased( alias ).
// Note that this forwards to the operands' isAliased(), not to their canAlias().
450 template<
typename T >
451 inline bool canAlias(
const T* alias )
const noexcept {
452 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Reports whether the expression is aliased with the object at the given address.
// alias: address to check against both operands.
// Returns true if either operand (lhs_ or rhs_) reports isAliased( alias ).
462 template<
typename T >
463 inline bool isAliased(
const T* alias )
const noexcept {
464 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
474 return lhs_.isAligned() &&
rhs_.isAligned();
485 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
487 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
488 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
489 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
512 template<
typename MT
522 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
525 else if( rhs.lhs_.columns() == 0UL ) {
540 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for the assignment C = A * B.
// C: target matrix, A: left operand, B: right operand.
// The small kernel is selected when any of the following holds:
//   - the left operand type MT4 is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - C has fewer than TDMATTDMATMULT_THRESHOLD elements (rows * columns).
// NOTE(review): the line between the two calls is missing from this excerpt; the second
// call is presumably the 'else' branch - confirm against the full file.
556 template<
typename MT3
559 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
561 if( ( IsDiagonal_v<MT4> ) ||
562 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
563 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
564 selectSmallAssignKernel( C, A, B );
566 selectBlasAssignKernel( C, A, B );
585 template<
typename MT3
588 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
589 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
591 const size_t M( A.rows() );
592 const size_t N( B.columns() );
593 const size_t K( A.columns() );
597 for(
size_t j=0UL; j<N; ++j )
599 const size_t kbegin( ( IsLower_v<MT5> )
600 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
602 const size_t kend( ( IsUpper_v<MT5> )
603 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
607 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
608 for(
size_t i=0UL; i<M; ++i ) {
615 const size_t ibegin( ( IsLower_v<MT4> )
616 ?( ( IsStrictlyLower_v<MT4> )
617 ?(
LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
618 :(
LOW ?
max(j,kbegin) : kbegin ) )
619 :(
LOW ? j : 0UL ) );
620 const size_t iend( ( IsUpper_v<MT4> )
621 ?( ( IsStrictlyUpper_v<MT4> )
622 ?(
UPP ?
min(j+1UL,kbegin) : kbegin )
623 :(
UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
624 :(
UPP ? j+1UL : M ) );
626 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
627 for(
size_t i=0UL; i<ibegin; ++i ) {
631 else if( IsStrictlyLower_v<MT4> ) {
634 for(
size_t i=ibegin; i<iend; ++i ) {
635 C(i,j) = A(i,kbegin) * B(kbegin,j);
637 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
638 for(
size_t i=iend; i<M; ++i ) {
642 else if( IsStrictlyUpper_v<MT4> ) {
647 for(
size_t k=kbegin+1UL; k<kend; ++k )
649 const size_t ibegin( ( IsLower_v<MT4> )
650 ?( ( IsStrictlyLower_v<MT4> )
654 const size_t iend( ( IsUpper_v<MT4> )
655 ?( ( IsStrictlyUpper_v<MT4> )
656 ?(
UPP ?
min(j+1UL,k-1UL) : k-1UL )
657 :(
UPP ?
min(j+1UL,k) : k ) )
658 :(
UPP ? j+1UL : M ) );
660 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
663 for(
size_t i=ibegin; i<iend; ++i ) {
664 C(i,j) += A(i,k) * B(k,j);
666 if( IsUpper_v<MT4> ) {
667 C(iend,j) = A(iend,k) * B(k,j);
673 for(
size_t j=1UL; j<N; ++j ) {
674 for(
size_t i=0UL; i<j; ++i ) {
675 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
697 template<
typename MT3
700 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
701 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
705 const size_t M( A.rows() );
706 const size_t N( B.columns() );
708 for(
size_t j=0UL; j<N; ++j )
710 const size_t ibegin( ( IsLower_v<MT4> )
711 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
713 const size_t iend( ( IsUpper_v<MT4> )
714 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
718 if( IsLower_v<MT4> ) {
719 for(
size_t i=0UL; i<ibegin; ++i ) {
723 for(
size_t i=ibegin; i<iend; ++i ) {
724 C(i,j) = A(i,j) * B(j,j);
726 if( IsUpper_v<MT4> ) {
727 for(
size_t i=iend; i<M; ++i ) {
750 template<
typename MT3
753 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
754 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
758 const size_t M( A.rows() );
759 const size_t N( B.columns() );
761 for(
size_t j=0UL; j<N; ++j )
763 const size_t ibegin( ( IsLower_v<MT5> )
764 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
766 const size_t iend( ( IsUpper_v<MT5> )
767 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
771 if( IsLower_v<MT4> ) {
772 for(
size_t i=0UL; i<ibegin; ++i ) {
776 for(
size_t i=ibegin; i<iend; ++i ) {
777 C(i,j) = A(i,i) * B(i,j);
779 if( IsUpper_v<MT4> ) {
780 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel overload that is enabled only when both operand types
// (MT4 and MT5) are diagonal.
// C: target matrix, A: left operand, B: right operand.
// The visible loop writes only the diagonal of C: C(i,i) = A(i,i) * B(i,i) for every
// row index i of A.
// NOTE(review): the lines between the signature and the loop are not visible here;
// whether C is cleared beforehand cannot be confirmed from this excerpt.
803 template<
typename MT3
806 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
807 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
813 for(
size_t i=0UL; i<A.rows(); ++i ) {
814 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It forwards unchanged to the
// default assignment kernel.
// C: target matrix, A: left operand, B: right operand.
834 template<
typename MT3
837 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
838 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
840 selectDefaultAssignKernel( C, A, B );
// Small-matrix assignment kernel for a row-major target when the vectorized default
// kernel is usable. Instead of computing the product here, it copies one operand into
// a temporary of that operand's OppositeType and re-dispatches the product through
// assign().
// C: target matrix, A: left operand, B: right operand.
// NOTE(review): OppositeType_t presumably yields the opposite storage order, and fwd
// (ForwardFunctor) wraps the product before assignment - neither definition is visible
// in this excerpt.
860 template<
typename MT3
863 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
864 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
871 const ForwardFunctor fwd;
// Case 1: A is resizable and B is not -> convert B.
873 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
874 const OppositeType_t<MT5> tmp(
serial( B ) );
875 assign( C, fwd( A * tmp ) );
// Case 2: B is resizable and A is not -> convert A.
877 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
878 const OppositeType_t<MT4> tmp(
serial( A ) );
879 assign( C, fwd( tmp * B ) );
// Case 3: otherwise compare element counts; B is converted when it has no more
// elements than A (ties go to B).
881 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
882 const OppositeType_t<MT5> tmp(
serial( B ) );
883 assign( C, fwd( A * tmp ) );
// Remaining case: convert A.
886 const OppositeType_t<MT4> tmp(
serial( A ) );
887 assign( C, fwd( tmp * B ) );
908 template<
typename MT3
911 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
912 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
914 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
916 const size_t M( A.rows() );
917 const size_t N( B.columns() );
918 const size_t K( A.columns() );
922 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
927 if( IsIntegral_v<ElementType> )
930 for(
size_t j=0UL; j<N; ++j )
932 const size_t kbegin( ( IsLower_v<MT5> )
933 ?( ( IsUpper_v<MT4> )
934 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
935 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
936 :( IsUpper_v<MT4> ? i : 0UL ) );
937 const size_t kend( ( IsUpper_v<MT5> )
938 ?( ( IsLower_v<MT4> )
939 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
940 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
941 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
943 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
945 for(
size_t k=kbegin; k<kend; ++k ) {
947 xmm1 += A.load(i ,k) * b1;
949 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
950 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
951 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
952 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
953 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
954 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
957 C.store( i , j, xmm1 );
973 for( ; (j+2UL) <= N; j+=2UL )
975 const size_t kbegin( ( IsLower_v<MT5> )
976 ?( ( IsUpper_v<MT4> )
977 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
978 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
979 :( IsUpper_v<MT4> ? i : 0UL ) );
980 const size_t kend( ( IsUpper_v<MT5> )
981 ?( ( IsLower_v<MT4> )
982 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
983 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
984 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
986 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
988 for(
size_t k=kbegin; k<kend; ++k ) {
1008 C.store( i , j , xmm1 );
1010 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1011 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1012 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
1013 C.store( i , j+1UL, xmm6 );
1014 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
1015 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
1016 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
1017 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
1022 const size_t kbegin( ( IsLower_v<MT5> )
1023 ?( ( IsUpper_v<MT4> )
1024 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1025 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1026 :( IsUpper_v<MT4> ? i : 0UL ) );
1027 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
1029 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1031 for(
size_t k=kbegin; k<kend; ++k ) {
1033 xmm1 += A.load(i ,k) * b1;
1034 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1035 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1036 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1037 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1040 C.store( i , j, xmm1 );
1042 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1043 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1044 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
1056 for(
size_t ii=i; ii<iiend; ++ii ) {
1057 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1064 for(
size_t ii=i; ii<iiend; ++ii ) {
1070 for( ; (j+2UL) <= jend; j+=2UL )
1072 const size_t kbegin( ( IsLower_v<MT5> )
1073 ?( ( IsUpper_v<MT4> )
1074 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1075 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1076 :( IsUpper_v<MT4> ? i : 0UL ) );
1077 const size_t kend( ( IsUpper_v<MT5> )
1078 ?( ( IsLower_v<MT4> )
1079 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1080 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1081 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
1083 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1085 for(
size_t k=kbegin; k<kend; ++k ) {
1102 C.store( i , j , xmm1 );
1104 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1105 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1106 C.store( i , j+1UL, xmm5 );
1107 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
1108 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
1109 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
1114 const size_t kbegin( ( IsLower_v<MT5> )
1115 ?( ( IsUpper_v<MT4> )
1116 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1117 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1118 :( IsUpper_v<MT4> ? i : 0UL ) );
1119 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
1123 for(
size_t k=kbegin; k<kend; ++k ) {
1125 xmm1 += A.load(i ,k) * b1;
1126 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1127 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1128 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1131 C.store( i , j, xmm1 );
1133 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1134 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1142 for(
size_t ii=i; ii<iiend; ++ii ) {
1157 for(
size_t ii=i; ii<iiend; ++ii ) {
1158 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1165 for(
size_t ii=i; ii<iiend; ++ii ) {
1171 for( ; (j+2UL) <= jend; j+=2UL )
1173 const size_t kbegin( ( IsLower_v<MT5> )
1174 ?( ( IsUpper_v<MT4> )
1175 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1176 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1177 :( IsUpper_v<MT4> ? i : 0UL ) );
1178 const size_t kend( ( IsUpper_v<MT5> )
1179 ?( ( IsLower_v<MT4> )
1180 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1181 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1182 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
1184 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1186 for(
size_t k=kbegin; k<kend; ++k ) {
1200 C.store( i , j , xmm1 );
1202 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1203 C.store( i , j+1UL, xmm4 );
1204 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
1205 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
1210 const size_t kbegin( ( IsLower_v<MT5> )
1211 ?( ( IsUpper_v<MT4> )
1212 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1213 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1214 :( IsUpper_v<MT4> ? i : 0UL ) );
1215 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
1219 for(
size_t k=kbegin; k<kend; ++k ) {
1221 xmm1 += A.load(i ,k) * b1;
1222 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1223 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1226 C.store( i , j, xmm1 );
1228 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1236 for(
size_t ii=i; ii<iiend; ++ii ) {
1251 for(
size_t ii=i; ii<iiend; ++ii ) {
1252 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1259 for(
size_t ii=i; ii<iiend; ++ii ) {
1265 for( ; (j+4UL) <= jend; j+=4UL )
1267 const size_t kbegin( ( IsLower_v<MT5> )
1268 ?( ( IsUpper_v<MT4> )
1269 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1270 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1271 :( IsUpper_v<MT4> ? i : 0UL ) );
1272 const size_t kend( ( IsUpper_v<MT5> )
1273 ?( ( IsLower_v<MT4> )
1274 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
1275 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
1276 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1278 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1280 for(
size_t k=kbegin; k<kend; ++k ) {
1297 C.store( i , j , xmm1 );
1299 C.store( i , j+1UL, xmm3 );
1300 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1301 C.store( i , j+2UL, xmm5 );
1302 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
1303 C.store( i , j+3UL, xmm7 );
1304 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
1307 for( ; (j+3UL) <= jend; j+=3UL )
1309 const size_t kbegin( ( IsLower_v<MT5> )
1310 ?( ( IsUpper_v<MT4> )
1311 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1312 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1313 :( IsUpper_v<MT4> ? i : 0UL ) );
1314 const size_t kend( ( IsUpper_v<MT5> )
1315 ?( ( IsLower_v<MT4> )
1316 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
1317 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
1318 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1320 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1322 for(
size_t k=kbegin; k<kend; ++k ) {
1336 C.store( i , j , xmm1 );
1338 C.store( i , j+1UL, xmm3 );
1339 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1340 C.store( i , j+2UL, xmm5 );
1341 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
1344 for( ; (j+2UL) <= jend; j+=2UL )
1346 const size_t kbegin( ( IsLower_v<MT5> )
1347 ?( ( IsUpper_v<MT4> )
1348 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1349 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1350 :( IsUpper_v<MT4> ? i : 0UL ) );
1351 const size_t kend( ( IsUpper_v<MT5> )
1352 ?( ( IsLower_v<MT4> )
1353 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1354 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1355 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1357 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1360 for( ; (k+2UL) <= kend; k+=2UL ) {
1361 const SIMDType a1( A.load(i ,k ) );
1363 const SIMDType a3( A.load(i ,k+1UL) );
1379 for( ; k<kend; ++k ) {
1390 C.store( i , j , xmm1+xmm5 );
1391 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
1392 C.store( i , j+1UL, xmm3+xmm7 );
1393 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
1398 const size_t kbegin( ( IsLower_v<MT5> )
1399 ?( ( IsUpper_v<MT4> )
1400 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1401 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1402 :( IsUpper_v<MT4> ? i : 0UL ) );
1403 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
1408 for( ; (k+2UL) <= kend; k+=2UL ) {
1411 xmm1 += A.load(i ,k ) * b1;
1412 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
1413 xmm3 += A.load(i ,k+1UL) * b2;
1414 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
1417 for( ; k<kend; ++k ) {
1419 xmm1 += A.load(i ,k) * b1;
1423 C.store( i , j, xmm1+xmm3 );
1424 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
1432 for(
size_t ii=i; ii<iiend; ++ii ) {
1447 for(
size_t ii=i; ii<iiend; ++ii ) {
1448 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1455 for(
size_t ii=i; ii<iiend; ++ii ) {
1461 for( ; (j+4UL) <= jend; j+=4UL )
1463 const size_t kbegin( ( IsLower_v<MT5> )
1464 ?( ( IsUpper_v<MT4> )
1465 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1466 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1467 :( IsUpper_v<MT4> ? i : 0UL ) );
1468 const size_t kend( ( IsUpper_v<MT5> )
1469 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
1472 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1475 for( ; (k+2UL) <= kend; k+=2UL ) {
1477 const SIMDType a2( A.load(i,k+1UL) );
1478 xmm1 += a1 *
set( B(k ,j ) );
1479 xmm2 += a1 *
set( B(k ,j+1UL) );
1480 xmm3 += a1 *
set( B(k ,j+2UL) );
1481 xmm4 += a1 *
set( B(k ,j+3UL) );
1482 xmm5 += a2 *
set( B(k+1UL,j ) );
1483 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
1484 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
1485 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
1488 for( ; k<kend; ++k ) {
1490 xmm1 += a1 *
set( B(k,j ) );
1491 xmm2 += a1 *
set( B(k,j+1UL) );
1492 xmm3 += a1 *
set( B(k,j+2UL) );
1493 xmm4 += a1 *
set( B(k,j+3UL) );
1496 C.store( i, j , xmm1+xmm5 );
1497 C.store( i, j+1UL, xmm2+xmm6 );
1498 C.store( i, j+2UL, xmm3+xmm7 );
1499 C.store( i, j+3UL, xmm4+xmm8 );
1502 for( ; (j+3UL) <= jend; j+=3UL )
1504 const size_t kbegin( ( IsLower_v<MT5> )
1505 ?( ( IsUpper_v<MT4> )
1506 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1507 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1508 :( IsUpper_v<MT4> ? i : 0UL ) );
1509 const size_t kend( ( IsUpper_v<MT5> )
1510 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
1513 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1516 for( ; (k+2UL) <= kend; k+=2UL ) {
1518 const SIMDType a2( A.load(i,k+1UL) );
1519 xmm1 += a1 *
set( B(k ,j ) );
1520 xmm2 += a1 *
set( B(k ,j+1UL) );
1521 xmm3 += a1 *
set( B(k ,j+2UL) );
1522 xmm4 += a2 *
set( B(k+1UL,j ) );
1523 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
1524 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
1527 for( ; k<kend; ++k ) {
1529 xmm1 += a1 *
set( B(k,j ) );
1530 xmm2 += a1 *
set( B(k,j+1UL) );
1531 xmm3 += a1 *
set( B(k,j+2UL) );
1534 C.store( i, j , xmm1+xmm4 );
1535 C.store( i, j+1UL, xmm2+xmm5 );
1536 C.store( i, j+2UL, xmm3+xmm6 );
1539 for( ; (j+2UL) <= jend; j+=2UL )
1541 const size_t kbegin( ( IsLower_v<MT5> )
1542 ?( ( IsUpper_v<MT4> )
1543 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1544 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1545 :( IsUpper_v<MT4> ? i : 0UL ) );
1546 const size_t kend( ( IsUpper_v<MT5> )
1547 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1553 for( ; (k+2UL) <= kend; k+=2UL ) {
1555 const SIMDType a2( A.load(i,k+1UL) );
1556 xmm1 += a1 *
set( B(k ,j ) );
1557 xmm2 += a1 *
set( B(k ,j+1UL) );
1558 xmm3 += a2 *
set( B(k+1UL,j ) );
1559 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
1562 for( ; k<kend; ++k ) {
1564 xmm1 += a1 *
set( B(k,j ) );
1565 xmm2 += a1 *
set( B(k,j+1UL) );
1568 C.store( i, j , xmm1+xmm3 );
1569 C.store( i, j+1UL, xmm2+xmm4 );
1574 const size_t kbegin( ( IsLower_v<MT5> )
1575 ?( ( IsUpper_v<MT4> )
1576 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1577 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1578 :( IsUpper_v<MT4> ? i : 0UL ) );
1583 for( ; (k+2UL) <= K; k+=2UL ) {
1584 xmm1 += A.load(i,k ) *
set( B(k ,j) );
1585 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
1589 xmm1 += A.load(i,k) *
set( B(k,j) );
1592 C.store( i, j, xmm1+xmm2 );
1600 for(
size_t ii=i; ii<iiend; ++ii ) {
1607 for( ; remainder && i<M; ++i )
1613 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1622 for( ; (j+2UL) <= N; j+=2UL )
1624 const size_t kbegin( ( IsLower_v<MT5> )
1625 ?( ( IsUpper_v<MT4> )
1626 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1627 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1628 :( IsUpper_v<MT4> ? i : 0UL ) );
1629 const size_t kend( ( IsUpper_v<MT5> )
1630 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1636 for(
size_t k=kbegin; k<kend; ++k ) {
1637 value1 += A(i,k) * B(k,j );
1638 value2 += A(i,k) * B(k,j+1UL);
1642 C(i,j+1UL) = value2;
1647 const size_t kbegin( ( IsLower_v<MT5> )
1648 ?( ( IsUpper_v<MT4> )
1649 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1650 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1651 :( IsUpper_v<MT4> ? i : 0UL ) );
1655 for(
size_t k=kbegin; k<K; ++k ) {
1656 value += A(i,k) * B(k,j);
// Large-matrix assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It forwards unchanged to the
// default assignment kernel.
// C: target matrix, A: left operand, B: right operand.
1680 template<
typename MT3
1683 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1684 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1686 selectDefaultAssignKernel( C, A, B );
1706 template<
typename MT3
1709 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1710 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback overload: selected when
// UseBlasKernel_v<MT3,MT4,MT5> is false. It forwards unchanged to the large-matrix
// assignment kernel.
// C: target matrix, A: left operand, B: right operand.
1740 template<
typename MT3
1743 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1744 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1746 selectLargeAssignKernel( C, A, B );
// BLAS-based assignment kernel. It is compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero, and enabled only when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
// C: target matrix, A: left operand, B: right operand.
// NOTE(review): the statements directly before each trmm() call are missing from this
// excerpt; C presumably has to hold the non-triangular operand before trmm() runs -
// confirm against the full file.
1752 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1766 template<
typename MT3
1769 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1770 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1772 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm() is called with A on the left side (CblasLeft), the
// lower/upper selector chosen from IsLower_v<MT4>, and a scalar of ET(1).
1774 if( IsTriangular_v<MT4> ) {
1776 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular right operand: same call shape with B on the right side (CblasRight).
1778 else if( IsTriangular_v<MT5> ) {
1780 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// Neither operand is triangular: general product through gemm() with scalars
// ET(1) and ET(0).
1783 gemm( C, A, B, ET(1), ET(0) );
1803 template<
typename MT
1806 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1810 using TmpType = If_t< SO, ResultType, OppositeType >;
1822 const ForwardFunctor fwd;
1824 const TmpType tmp(
serial( rhs ) );
1825 assign( ~lhs, fwd( tmp ) );
1845 template<
typename MT >
1847 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1856 const ForwardFunctor fwd;
1858 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1859 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
1860 else if( IsSymmetric_v<MT1> )
1861 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
1863 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
1881 template<
typename MT
1883 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
1884 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1891 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1905 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for the addition assignment C += A * B.
// C: target matrix, A: left operand, B: right operand.
// The small kernel is selected when any of the following holds:
//   - the left operand type MT4 is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - C has fewer than TDMATTDMATMULT_THRESHOLD elements (rows * columns).
// These are the same three conditions that selectAssignKernel() tests.
// NOTE(review): the line between the two calls is missing from this excerpt; the second
// call is presumably the 'else' branch - confirm against the full file.
1921 template<
typename MT3
1924 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1926 if( ( IsDiagonal_v<MT4> ) ||
1927 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
1928 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1929 selectSmallAddAssignKernel( C, A, B );
1931 selectBlasAddAssignKernel( C, A, B );
1950 template<
typename MT3
1953 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1954 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1956 const size_t M( A.rows() );
1957 const size_t N( B.columns() );
1958 const size_t K( A.columns() );
1962 for(
size_t j=0UL; j<N; ++j )
1964 const size_t kbegin( ( IsLower_v<MT5> )
1965 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1967 const size_t kend( ( IsUpper_v<MT5> )
1968 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1972 for(
size_t k=kbegin; k<kend; ++k )
1974 const size_t ibegin( ( IsLower_v<MT4> )
1975 ?( ( IsStrictlyLower_v<MT4> )
1976 ?(
LOW ?
max(j,k+1UL) : k+1UL )
1977 :(
LOW ?
max(j,k) : k ) )
1978 :(
LOW ? j : 0UL ) );
1979 const size_t iend( ( IsUpper_v<MT4> )
1980 ?( ( IsStrictlyUpper_v<MT4> )
1981 ?(
UPP ?
min(j+1UL,k) : k )
1982 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
1983 :(
UPP ? j+1UL : M ) );
1985 if( (
LOW ||
UPP ) && ibegin >= iend )
continue;
1988 const size_t inum( iend - ibegin );
1989 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1991 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1992 C(i ,j) += A(i ,k) * B(k,j);
1993 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1996 C(ipos,j) += A(ipos,k) * B(k,j);
2018 template<
typename MT3
2021 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2022 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2026 const size_t M( A.rows() );
2027 const size_t N( B.columns() );
2029 for(
size_t j=0UL; j<N; ++j )
2031 const size_t ibegin( ( IsLower_v<MT4> )
2032 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
2034 const size_t iend( ( IsUpper_v<MT4> )
2035 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
2039 const size_t inum( iend - ibegin );
2040 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2042 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2043 C(i ,j) += A(i ,j) * B(j,j);
2044 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
2047 C(ipos,j) += A(ipos,j) * B(j,j);
2068 template<
typename MT3
2071 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2072 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2076 const size_t M( A.rows() );
2077 const size_t N( B.columns() );
2079 for(
size_t j=0UL; j<N; ++j )
2081 const size_t ibegin( ( IsLower_v<MT5> )
2082 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2084 const size_t iend( ( IsUpper_v<MT5> )
2085 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2089 const size_t inum( iend - ibegin );
2090 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2092 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2093 C(i ,j) += A(i ,i ) * B(i ,j);
2094 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2097 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition assignment kernel overload that is enabled only when both operand
// types (MT4 and MT5) are diagonal.
// C: target matrix, A: left operand, B: right operand.
// Only the diagonal of C is updated: C(i,i) += A(i,i) * B(i,i) for every row index i
// of A. No off-diagonal element of C is touched by the visible loop.
2118 template<
typename MT3
2121 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2122 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2126 for(
size_t i=0UL; i<A.rows(); ++i ) {
2127 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It forwards unchanged to the
// default addition assignment kernel.
// C: target matrix, A: left operand, B: right operand.
2147 template<
typename MT3
2150 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2151 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2153 selectDefaultAddAssignKernel( C, A, B );
// Small-matrix addition assignment kernel for a row-major target when the vectorized
// default kernel is usable. One operand is copied into a temporary of that operand's
// OppositeType and the product is re-dispatched through addAssign(). The operand is
// chosen by the same rules as in the row-major selectSmallAssignKernel() overload.
// C: target matrix, A: left operand, B: right operand.
// NOTE(review): OppositeType_t presumably yields the opposite storage order, and fwd
// (ForwardFunctor) wraps the product before assignment - neither definition is visible
// in this excerpt.
2173 template<
typename MT3
2176 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2177 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2184 const ForwardFunctor fwd;
// Case 1: A is resizable and B is not -> convert B.
2186 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2187 const OppositeType_t<MT5> tmp(
serial( B ) );
2188 addAssign( C, fwd( A * tmp ) );
// Case 2: B is resizable and A is not -> convert A.
2190 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2191 const OppositeType_t<MT4> tmp(
serial( A ) );
2192 addAssign( C, fwd( tmp * B ) );
// Case 3: otherwise compare element counts; B is converted when it has no more
// elements than A (ties go to B).
2194 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2195 const OppositeType_t<MT5> tmp(
serial( B ) );
2196 addAssign( C, fwd( A * tmp ) );
// Remaining case: convert A.
2199 const OppositeType_t<MT4> tmp(
serial( A ) );
2200 addAssign( C, fwd( tmp * B ) );
// Small-matrix addition-assignment kernel for a column-major target when the
// vectorized default kernel is usable (C += A * B).
//
// The visible code walks the rows of C in panels of 8, 5, 4, 3, 2 and 1 SIMD
// vectors (see the i+SIMDSIZE*n offsets), and within each panel walks the
// columns j in groups of up to four. For every (panel, column group) the
// k-range [kbegin,kend) is narrowed using the triangular structure of A (MT4)
// and B (MT5). Accumulators named xmm* are loaded from C, updated with
// A.load(...) * <broadcast B element>, and stored back to C.
// A scalar loop at the end handles the rows beyond ipos when 'remainder' is set.
2221 template<
typename MT3
2224 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2225 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar row tail is needed whenever C or A is not padded.
2227 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2229 const size_t M( A.rows() );
2230 const size_t N( B.columns() );
2231 const size_t K( A.columns() );
// Last row index covered by whole SIMD vectors (M rounded down to a multiple of SIMDSIZE).
2235 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Row panel of 8 SIMD vectors, one column at a time; follows the integral element type check.
2240 if( IsIntegral_v<ElementType> )
2243 for(
size_t j=0UL; j<N; ++j )
2245 const size_t kbegin( ( IsLower_v<MT5> )
2246 ?( ( IsUpper_v<MT4> )
2247 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2248 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2249 :( IsUpper_v<MT4> ? i : 0UL ) );
2250 const size_t kend( ( IsUpper_v<MT5> )
2251 ?( ( IsLower_v<MT4> )
2252 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2253 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2254 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
2265 for(
size_t k=kbegin; k<kend; ++k ) {
2267 xmm1 += A.load(i ,k) * b1;
2268 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2269 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2270 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2271 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2272 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
2273 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
2274 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
2277 C.store( i , j, xmm1 );
2279 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2280 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2281 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
2282 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
2283 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
2284 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Row panel of 5 SIMD vectors: two columns at a time, then a single-column pass.
2293 for( ; (j+2UL) <= N; j+=2UL )
2295 const size_t kbegin( ( IsLower_v<MT5> )
2296 ?( ( IsUpper_v<MT4> )
2297 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2298 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2299 :( IsUpper_v<MT4> ? i : 0UL ) );
2300 const size_t kend( ( IsUpper_v<MT5> )
2301 ?( ( IsLower_v<MT4> )
2302 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2303 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2304 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
2311 SIMDType xmm6 ( C.load(i ,j+1UL) );
2317 for(
size_t k=kbegin; k<kend; ++k ) {
2337 C.store( i , j , xmm1 );
2339 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2340 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2341 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
2342 C.store( i , j+1UL, xmm6 );
2343 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
2344 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
2345 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
2346 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
2351 const size_t kbegin( ( IsLower_v<MT5> )
2352 ?( ( IsUpper_v<MT4> )
2353 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2354 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2355 :( IsUpper_v<MT4> ? i : 0UL ) );
2356 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
2364 for(
size_t k=kbegin; k<kend; ++k ) {
2366 xmm1 += A.load(i ,k) * b1;
2367 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2368 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2369 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2370 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2373 C.store( i , j, xmm1 );
2375 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2376 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2377 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Row panel of 4 SIMD vectors: two columns at a time, then a single-column pass.
2385 for( ; (j+2UL) <= N; j+=2UL )
2387 const size_t kbegin( ( IsLower_v<MT5> )
2388 ?( ( IsUpper_v<MT4> )
2389 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2390 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2391 :( IsUpper_v<MT4> ? i : 0UL ) );
2392 const size_t kend( ( IsUpper_v<MT5> )
2393 ?( ( IsLower_v<MT4> )
2394 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2395 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2396 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
2407 for(
size_t k=kbegin; k<kend; ++k ) {
2424 C.store( i , j , xmm1 );
2426 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2427 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2428 C.store( i , j+1UL, xmm5 );
2429 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
2430 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
2431 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
2436 const size_t kbegin( ( IsLower_v<MT5> )
2437 ?( ( IsUpper_v<MT4> )
2438 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2439 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2440 :( IsUpper_v<MT4> ? i : 0UL ) );
2441 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
2448 for(
size_t k=kbegin; k<kend; ++k ) {
2450 xmm1 += A.load(i ,k) * b1;
2451 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2452 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2453 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2456 C.store( i , j, xmm1 );
2458 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2459 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Row panel of 3 SIMD vectors: two columns at a time, then a single-column pass.
2467 for( ; (j+2UL) <= N; j+=2UL )
2469 const size_t kbegin( ( IsLower_v<MT5> )
2470 ?( ( IsUpper_v<MT4> )
2471 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2472 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2473 :( IsUpper_v<MT4> ? i : 0UL ) );
2474 const size_t kend( ( IsUpper_v<MT5> )
2475 ?( ( IsLower_v<MT4> )
2476 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2477 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2478 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
2487 for(
size_t k=kbegin; k<kend; ++k ) {
2501 C.store( i , j , xmm1 );
2503 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2504 C.store( i , j+1UL, xmm4 );
2505 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
2506 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
2511 const size_t kbegin( ( IsLower_v<MT5> )
2512 ?( ( IsUpper_v<MT4> )
2513 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2514 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2515 :( IsUpper_v<MT4> ? i : 0UL ) );
2516 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
2522 for(
size_t k=kbegin; k<kend; ++k ) {
2524 xmm1 += A.load(i ,k) * b1;
2525 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2526 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2529 C.store( i , j, xmm1 );
2531 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Row panel of 2 SIMD vectors: the column range starts at i when the result is upper (UPP);
// columns are processed in groups of 4, 3, 2 and finally 1.
2538 size_t j(
UPP ? i : 0UL );
2540 for( ; (j+4UL) <= jend; j+=4UL )
2542 const size_t kbegin( ( IsLower_v<MT5> )
2543 ?( ( IsUpper_v<MT4> )
2544 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2545 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2546 :( IsUpper_v<MT4> ? i : 0UL ) );
2547 const size_t kend( ( IsUpper_v<MT5> )
2548 ?( ( IsLower_v<MT4> )
2549 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2550 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2551 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2562 for(
size_t k=kbegin; k<kend; ++k ) {
2579 C.store( i , j , xmm1 );
2581 C.store( i , j+1UL, xmm3 );
2582 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2583 C.store( i , j+2UL, xmm5 );
2584 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2585 C.store( i , j+3UL, xmm7 );
2586 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
2589 for( ; (j+3UL) <= jend; j+=3UL )
2591 const size_t kbegin( ( IsLower_v<MT5> )
2592 ?( ( IsUpper_v<MT4> )
2593 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2594 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2595 :( IsUpper_v<MT4> ? i : 0UL ) );
2596 const size_t kend( ( IsUpper_v<MT5> )
2597 ?( ( IsLower_v<MT4> )
2598 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2599 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2600 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2609 for(
size_t k=kbegin; k<kend; ++k ) {
2623 C.store( i , j , xmm1 );
2625 C.store( i , j+1UL, xmm3 );
2626 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2627 C.store( i , j+2UL, xmm5 );
2628 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2631 for( ; (j+2UL) <= jend; j+=2UL )
2633 const size_t kbegin( ( IsLower_v<MT5> )
2634 ?( ( IsUpper_v<MT4> )
2635 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2636 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2637 :( IsUpper_v<MT4> ? i : 0UL ) );
2638 const size_t kend( ( IsUpper_v<MT5> )
2639 ?( ( IsLower_v<MT4> )
2640 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2641 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2642 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Two-step unrolled k-loop. The bound is inclusive so that a final complete
// pair (k, k+1) is still consumed here rather than by the one-step remainder
// loop below; this is the same bound used by the other two-step k-loops in
// this kernel. Both k and k+1 are < kend whenever (k+2UL) <= kend holds.
2651 for( ; (k+2UL) <= kend; k+=2UL ) {
2652 const SIMDType a1( A.load(i ,k ) );
2654 const SIMDType a3( A.load(i ,k+1UL) );
// One-step remainder for an odd trailing k.
2670 for( ; k<kend; ++k ) {
// Combine the primary accumulators with the second accumulator set of the unrolled loop.
2681 C.store( i , j , xmm1+xmm5 );
2682 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
2683 C.store( i , j+1UL, xmm3+xmm7 );
2684 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
2689 const size_t kbegin( ( IsLower_v<MT5> )
2690 ?( ( IsUpper_v<MT4> )
2691 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2692 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2693 :( IsUpper_v<MT4> ? i : 0UL ) );
2694 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
2701 for( ; (k+2UL) <= kend; k+=2UL ) {
2704 xmm1 += A.load(i ,k ) * b1;
2705 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
2706 xmm3 += A.load(i ,k+1UL) * b2;
2707 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
2710 for( ; k<kend; ++k ) {
2712 xmm1 += A.load(i ,k) * b1;
2716 C.store( i , j, xmm1+xmm3 );
2717 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Row panel of a single SIMD vector: columns in groups of 4, 3, 2 and finally 1,
// each with a two-step unrolled k-loop followed by a one-step remainder.
2724 size_t j(
UPP ? i : 0UL );
2726 for( ; (j+4UL) <= jend; j+=4UL )
2728 const size_t kbegin( ( IsLower_v<MT5> )
2729 ?( ( IsUpper_v<MT4> )
2730 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2731 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2732 :( IsUpper_v<MT4> ? i : 0UL ) );
2733 const size_t kend( ( IsUpper_v<MT5> )
2734 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2744 for( ; (k+2UL) <= kend; k+=2UL ) {
2746 const SIMDType a2( A.load(i,k+1UL) );
2747 xmm1 += a1 *
set( B(k ,j ) );
2748 xmm2 += a1 *
set( B(k ,j+1UL) );
2749 xmm3 += a1 *
set( B(k ,j+2UL) );
2750 xmm4 += a1 *
set( B(k ,j+3UL) );
2751 xmm5 += a2 *
set( B(k+1UL,j ) );
2752 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2753 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2754 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2757 for( ; k<kend; ++k ) {
2759 xmm1 += a1 *
set( B(k,j ) );
2760 xmm2 += a1 *
set( B(k,j+1UL) );
2761 xmm3 += a1 *
set( B(k,j+2UL) );
2762 xmm4 += a1 *
set( B(k,j+3UL) );
2765 C.store( i, j , xmm1+xmm5 );
2766 C.store( i, j+1UL, xmm2+xmm6 );
2767 C.store( i, j+2UL, xmm3+xmm7 );
2768 C.store( i, j+3UL, xmm4+xmm8 );
2771 for( ; (j+3UL) <= jend; j+=3UL )
2773 const size_t kbegin( ( IsLower_v<MT5> )
2774 ?( ( IsUpper_v<MT4> )
2775 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2776 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2777 :( IsUpper_v<MT4> ? i : 0UL ) );
2778 const size_t kend( ( IsUpper_v<MT5> )
2779 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2788 for( ; (k+2UL) <= kend; k+=2UL ) {
2790 const SIMDType a2( A.load(i,k+1UL) );
2791 xmm1 += a1 *
set( B(k ,j ) );
2792 xmm2 += a1 *
set( B(k ,j+1UL) );
2793 xmm3 += a1 *
set( B(k ,j+2UL) );
2794 xmm4 += a2 *
set( B(k+1UL,j ) );
2795 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2796 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2799 for( ; k<kend; ++k ) {
2801 xmm1 += a1 *
set( B(k,j ) );
2802 xmm2 += a1 *
set( B(k,j+1UL) );
2803 xmm3 += a1 *
set( B(k,j+2UL) );
2806 C.store( i, j , xmm1+xmm4 );
2807 C.store( i, j+1UL, xmm2+xmm5 );
2808 C.store( i, j+2UL, xmm3+xmm6 );
2811 for( ; (j+2UL) <= jend; j+=2UL )
2813 const size_t kbegin( ( IsLower_v<MT5> )
2814 ?( ( IsUpper_v<MT4> )
2815 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2816 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2817 :( IsUpper_v<MT4> ? i : 0UL ) );
2818 const size_t kend( ( IsUpper_v<MT5> )
2819 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2827 for( ; (k+2UL) <= kend; k+=2UL ) {
2829 const SIMDType a2( A.load(i,k+1UL) );
2830 xmm1 += a1 *
set( B(k ,j ) );
2831 xmm2 += a1 *
set( B(k ,j+1UL) );
2832 xmm3 += a2 *
set( B(k+1UL,j ) );
2833 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2836 for( ; k<kend; ++k ) {
2838 xmm1 += a1 *
set( B(k,j ) );
2839 xmm2 += a1 *
set( B(k,j+1UL) );
2842 C.store( i, j , xmm1+xmm3 );
2843 C.store( i, j+1UL, xmm2+xmm4 );
2848 const size_t kbegin( ( IsLower_v<MT5> )
2849 ?( ( IsUpper_v<MT4> )
2850 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2851 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2852 :( IsUpper_v<MT4> ? i : 0UL ) );
2858 for( ; (k+2UL) <= K; k+=2UL ) {
2859 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2860 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2864 xmm1 += A.load(i,k) *
set( B(k,j) );
2867 C.store( i, j, xmm1+xmm2 );
// Scalar tail over the remaining rows, executed only when 'remainder' is set.
2871 for( ; remainder && i<M; ++i )
2873 const size_t jend(
LOW ? i+1UL : N );
2874 size_t j(
UPP ? i : 0UL );
2876 for( ; (j+2UL) <= jend; j+=2UL )
2878 const size_t kbegin( ( IsLower_v<MT5> )
2879 ?( ( IsUpper_v<MT4> )
2880 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2881 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2882 :( IsUpper_v<MT4> ? i : 0UL ) );
2883 const size_t kend( ( IsUpper_v<MT5> )
2884 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2890 for(
size_t k=kbegin; k<kend; ++k ) {
2891 value1 += A(i,k) * B(k,j );
2892 value2 += A(i,k) * B(k,j+1UL);
2896 C(i,j+1UL) = value2;
2901 const size_t kbegin( ( IsLower_v<MT5> )
2902 ?( ( IsUpper_v<MT4> )
2903 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2904 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2905 :( IsUpper_v<MT4> ? i : 0UL ) );
2909 for(
size_t k=kbegin; k<K; ++k ) {
2910 value += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel, non-vectorized overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false; it forwards
// to the default addition-assignment kernel.
2934 template<
typename MT3
2937 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2938 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2940 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel, vectorized overload
// (enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true).
// NOTE(review): the function body is not visible in this excerpt.
2960 template<
typename MT3
2963 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2964 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS addition-assignment kernel, fallback overload.
// Chosen when UseBlasKernel_v<MT3,MT4,MT5> is false; it forwards to the
// large-matrix addition-assignment kernel.
2990 template<
typename MT3
2993 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2994 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2996 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel (C += A * B); compiled only when
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled.
//  - Triangular A: B is copied into a temporary, trmm() is applied with A from the left,
//    and the temporary is add-assigned to C.
//  - Triangular B: A is copied into a temporary, trmm() is applied with B from the right,
//    and the temporary is add-assigned to C.
//  - Otherwise: a single gemm() call with both scalar arguments equal to ET(1).
3002 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3016 template<
typename MT3
3019 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3020 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3022 using ET = ElementType_t<MT3>;
3024 if( IsTriangular_v<MT4> ) {
3025 ResultType_t<MT3> tmp(
serial( B ) );
3026 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3027 addAssign( C, tmp );
3029 else if( IsTriangular_v<MT5> ) {
3030 ResultType_t<MT3> tmp(
serial( A ) );
3031 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3032 addAssign( C, tmp );
3035 gemm( C, A, B, ET(1), ET(1) );
// Symmetry-exploiting addition assignment (enabled when
// CanExploitSymmetry_v<MT,MT1,MT2> holds). Every symmetric operand is replaced
// by its transpose view, and the restructured product is add-assigned to the target.
// NOTE(review): the signature line is not visible in this excerpt; the function
// name is presumed to be addAssign from the calls in the body.
3057 template<
typename MT >
3059 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3068 const ForwardFunctor fwd;
3070 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
3071 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
3072 else if( IsSymmetric_v<MT1> )
3073 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: only the right-hand operand is transposed.
3075 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Subtraction assignment of the matrix product to a dense matrix (lhs -= A * B),
// used when the symmetry-exploiting overload is disabled.
// The visible code checks for an empty target or a zero inner dimension and then
// dispatches to selectSubAssignKernel() with the operands A and B.
// NOTE(review): the action taken in the empty case and the construction of A and B
// are not visible in this excerpt.
3097 template<
typename MT
3099 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
3100 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3107 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3121 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for the subtraction assignment C -= A * B.
// The small kernel is used when A is diagonal, when (outside debug mode) A has at
// most SIMDSIZE*10 rows, or when C has fewer than TDMATTDMATMULT_THRESHOLD elements;
// in all other cases the BLAS kernel path is taken.
3137 template<
typename MT3
3140 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3142 if( ( IsDiagonal_v<MT4> ) ||
3143 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
3144 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3145 selectSmallSubAssignKernel( C, A, B );
3147 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel for two non-diagonal operands (C -= A * B).
// For each column j of B and each k in [kbegin,kend), the rows i in [ibegin,iend)
// of C are updated with C(i,j) -= A(i,k) * B(k,j). The k-range is narrowed by the
// triangular structure of B (MT5); the i-range by the structure of A (MT4) and by
// the LOW/UPP flags of the result.
3166 template<
typename MT3
3169 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3170 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3172 const size_t M( A.rows() );
3173 const size_t N( B.columns() );
3174 const size_t K( A.columns() );
3178 for(
size_t j=0UL; j<N; ++j )
3180 const size_t kbegin( ( IsLower_v<MT5> )
3181 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3183 const size_t kend( ( IsUpper_v<MT5> )
3184 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3188 for(
size_t k=kbegin; k<kend; ++k )
3190 const size_t ibegin( ( IsLower_v<MT4> )
3191 ?( ( IsStrictlyLower_v<MT4> )
3192 ?(
LOW ?
max(j,k+1UL) : k+1UL )
3193 :(
LOW ?
max(j,k) : k ) )
3194 :(
LOW ? j : 0UL ) );
3195 const size_t iend( ( IsUpper_v<MT4> )
3196 ?( ( IsStrictlyUpper_v<MT4> )
3197 ?(
UPP ?
min(j+1UL,k) : k )
3198 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
3199 :(
UPP ? j+1UL : M ) );
// With a restricted result the row range can be empty; skip this k in that case.
3201 if( (
LOW ||
UPP ) && ( ibegin >= iend ) )
continue;
// ipos = ibegin plus the even part of the row count: rows are processed in pairs.
3204 const size_t inum( iend - ibegin );
3205 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3207 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3208 C(i ,j) -= A(i ,k) * B(k,j);
3209 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Update for the odd trailing row at ipos (its guard is not visible in this excerpt).
3212 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel for a non-diagonal A and a diagonal B.
// Each column j of C is updated with C(i,j) -= A(i,j) * B(j,j) over the row range
// [ibegin,iend), which is narrowed by the triangular structure of A (MT4).
3234 template<
typename MT3
3237 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3238 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3242 const size_t M( A.rows() );
3243 const size_t N( B.columns() );
3245 for(
size_t j=0UL; j<N; ++j )
3247 const size_t ibegin( ( IsLower_v<MT4> )
3248 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3250 const size_t iend( ( IsUpper_v<MT4> )
3251 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos = ibegin plus the even part of the row count: rows are processed in pairs.
3255 const size_t inum( iend - ibegin );
3256 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3258 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3259 C(i ,j) -= A(i ,j) * B(j,j);
3260 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Update for the odd trailing row at ipos (its guard is not visible in this excerpt).
3263 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel for a diagonal A and a non-diagonal B.
// Each column j of C is updated with C(i,j) -= A(i,i) * B(i,j) over the row range
// [ibegin,iend), which is narrowed by the triangular structure of B (MT5).
3284 template<
typename MT3
3287 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3288 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3292 const size_t M( A.rows() );
3293 const size_t N( B.columns() );
3295 for(
size_t j=0UL; j<N; ++j )
3297 const size_t ibegin( ( IsLower_v<MT5> )
3298 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3300 const size_t iend( ( IsUpper_v<MT5> )
3301 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos = ibegin plus the even part of the row count: rows are processed in pairs.
3305 const size_t inum( iend - ibegin );
3306 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3308 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3309 C(i ,j) -= A(i ,i ) * B(i ,j);
3310 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Update for the odd trailing row at ipos (its guard is not visible in this excerpt).
3313 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel for two diagonal operands
// (enabled only when IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
// Only the diagonal of C is updated: C(i,i) -= A(i,i) * B(i,i) for each row of A.
3334 template<
typename MT3
3337 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3338 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3342 for(
size_t i=0UL; i<A.rows(); ++i ) {
3343 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, non-vectorized overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false; it forwards
// to the default subtraction-assignment kernel.
3363 template<
typename MT3
3366 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3367 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3369 selectDefaultSubAssignKernel( C, A, B );
// Small-matrix subtraction-assignment kernel for a row-major target when the
// vectorized default kernel is usable.
// One operand is copied (serially) into its opposite storage order and the
// product is re-dispatched through subAssign():
//  - if exactly one operand type is resizable, the other (non-resizable) operand is the one converted;
//  - otherwise B is converted when it has no more elements than A, else A is converted.
3389 template<
typename MT3
3392 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3393 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Functor applied to the re-dispatched product expression before it is sub-assigned.
3400 const ForwardFunctor fwd;
3402 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3403 const OppositeType_t<MT5> tmp(
serial( B ) );
3404 subAssign( C, fwd( A * tmp ) );
3406 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3407 const OppositeType_t<MT4> tmp(
serial( A ) );
3408 subAssign( C, fwd( tmp * B ) );
// Neither rule above applies: convert the operand with fewer elements (B on a tie).
3410 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3411 const OppositeType_t<MT5> tmp(
serial( B ) );
3412 subAssign( C, fwd( A * tmp ) );
3415 const OppositeType_t<MT4> tmp(
serial( A ) );
3416 subAssign( C, fwd( tmp * B ) );
// Small-matrix subtraction-assignment kernel for a column-major target when the
// vectorized default kernel is usable (C -= A * B).
//
// The visible code walks the rows of C in panels of 8, 5, 4, 3, 2 and 1 SIMD
// vectors (see the i+SIMDSIZE*n offsets), and within each panel walks the
// columns j in groups of up to four. For every (panel, column group) the
// k-range [kbegin,kend) is narrowed using the triangular structure of A (MT4)
// and B (MT5). Accumulators named xmm* are decremented by
// A.load(...) * <broadcast B element> and stored back to C.
// A scalar loop at the end handles the rows beyond ipos when 'remainder' is set.
3437 template<
typename MT3
3440 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3441 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar row tail is needed whenever C or A is not padded.
3443 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3445 const size_t M( A.rows() );
3446 const size_t N( B.columns() );
3447 const size_t K( A.columns() );
// Last row index covered by whole SIMD vectors (M rounded down to a multiple of SIMDSIZE).
3451 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Row panel of 8 SIMD vectors, one column at a time; follows the integral element type check.
3456 if( IsIntegral_v<ElementType> )
3459 for(
size_t j=0UL; j<N; ++j )
3461 const size_t kbegin( ( IsLower_v<MT5> )
3462 ?( ( IsUpper_v<MT4> )
3463 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3464 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3465 :( IsUpper_v<MT4> ? i : 0UL ) );
3466 const size_t kend( ( IsUpper_v<MT5> )
3467 ?( ( IsLower_v<MT4> )
3468 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3469 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3470 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
3481 for(
size_t k=kbegin; k<kend; ++k ) {
3483 xmm1 -= A.load(i ,k) * b1;
3484 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3485 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3486 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3487 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
3488 xmm6 -= A.load(i+
SIMDSIZE*5UL,k) * b1;
3489 xmm7 -= A.load(i+
SIMDSIZE*6UL,k) * b1;
3490 xmm8 -= A.load(i+
SIMDSIZE*7UL,k) * b1;
3493 C.store( i , j, xmm1 );
3495 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3496 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3497 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
3498 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
3499 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
3500 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Row panel of 5 SIMD vectors: two columns at a time, then a single-column pass.
3509 for( ; (j+2UL) <= N; j+=2UL )
3511 const size_t kbegin( ( IsLower_v<MT5> )
3512 ?( ( IsUpper_v<MT4> )
3513 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3514 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3515 :( IsUpper_v<MT4> ? i : 0UL ) );
3516 const size_t kend( ( IsUpper_v<MT5> )
3517 ?( ( IsLower_v<MT4> )
3518 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3519 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3520 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
3527 SIMDType xmm6 ( C.load(i ,j+1UL) );
3533 for(
size_t k=kbegin; k<kend; ++k ) {
3553 C.store( i , j , xmm1 );
3555 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3556 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3557 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
3558 C.store( i , j+1UL, xmm6 );
3559 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
3560 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
3561 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
3562 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
3567 const size_t kbegin( ( IsLower_v<MT5> )
3568 ?( ( IsUpper_v<MT4> )
3569 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3570 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3571 :( IsUpper_v<MT4> ? i : 0UL ) );
3572 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
3580 for(
size_t k=kbegin; k<kend; ++k ) {
3582 xmm1 -= A.load(i ,k) * b1;
3583 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3584 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3585 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3586 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
3589 C.store( i , j, xmm1 );
3591 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3592 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3593 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Row panel of 4 SIMD vectors: two columns at a time, then a single-column pass.
3601 for( ; (j+2UL) <= N; j+=2UL )
3603 const size_t kbegin( ( IsLower_v<MT5> )
3604 ?( ( IsUpper_v<MT4> )
3605 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3606 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3607 :( IsUpper_v<MT4> ? i : 0UL ) );
3608 const size_t kend( ( IsUpper_v<MT5> )
3609 ?( ( IsLower_v<MT4> )
3610 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3611 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3612 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
3623 for(
size_t k=kbegin; k<kend; ++k ) {
3640 C.store( i , j , xmm1 );
3642 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3643 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3644 C.store( i , j+1UL, xmm5 );
3645 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
3646 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
3647 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
3652 const size_t kbegin( ( IsLower_v<MT5> )
3653 ?( ( IsUpper_v<MT4> )
3654 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3655 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3656 :( IsUpper_v<MT4> ? i : 0UL ) );
3657 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
3664 for(
size_t k=kbegin; k<kend; ++k ) {
3666 xmm1 -= A.load(i ,k) * b1;
3667 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3668 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3669 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3672 C.store( i , j, xmm1 );
3674 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3675 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Row panel of 3 SIMD vectors: two columns at a time, then a single-column pass.
3683 for( ; (j+2UL) <= N; j+=2UL )
3685 const size_t kbegin( ( IsLower_v<MT5> )
3686 ?( ( IsUpper_v<MT4> )
3687 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3688 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3689 :( IsUpper_v<MT4> ? i : 0UL ) );
3690 const size_t kend( ( IsUpper_v<MT5> )
3691 ?( ( IsLower_v<MT4> )
3692 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3693 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3694 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
3703 for(
size_t k=kbegin; k<kend; ++k ) {
3717 C.store( i , j , xmm1 );
3719 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3720 C.store( i , j+1UL, xmm4 );
3721 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
3722 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
3727 const size_t kbegin( ( IsLower_v<MT5> )
3728 ?( ( IsUpper_v<MT4> )
3729 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3730 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3731 :( IsUpper_v<MT4> ? i : 0UL ) );
3732 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
3738 for(
size_t k=kbegin; k<kend; ++k ) {
3740 xmm1 -= A.load(i ,k) * b1;
3741 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3742 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3745 C.store( i , j, xmm1 );
3747 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Row panel of 2 SIMD vectors: the column range starts at i when the result is upper (UPP);
// columns are processed in groups of 4, 3, 2 and finally 1.
3754 size_t j(
UPP ? i : 0UL );
3756 for( ; (j+4UL) <= jend; j+=4UL )
3758 const size_t kbegin( ( IsLower_v<MT5> )
3759 ?( ( IsUpper_v<MT4> )
3760 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3761 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3762 :( IsUpper_v<MT4> ? i : 0UL ) );
3763 const size_t kend( ( IsUpper_v<MT5> )
3764 ?( ( IsLower_v<MT4> )
3765 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
3766 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
3767 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
3778 for(
size_t k=kbegin; k<kend; ++k ) {
3795 C.store( i , j , xmm1 );
3797 C.store( i , j+1UL, xmm3 );
3798 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
3799 C.store( i , j+2UL, xmm5 );
3800 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
3801 C.store( i , j+3UL, xmm7 );
3802 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
3805 for( ; (j+3UL) <= jend; j+=3UL )
3807 const size_t kbegin( ( IsLower_v<MT5> )
3808 ?( ( IsUpper_v<MT4> )
3809 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3810 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3811 :( IsUpper_v<MT4> ? i : 0UL ) );
3812 const size_t kend( ( IsUpper_v<MT5> )
3813 ?( ( IsLower_v<MT4> )
3814 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
3815 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
3816 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
3825 for(
size_t k=kbegin; k<kend; ++k ) {
3839 C.store( i , j , xmm1 );
3841 C.store( i , j+1UL, xmm3 );
3842 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
3843 C.store( i , j+2UL, xmm5 );
3844 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
3847 for( ; (j+2UL) <= jend; j+=2UL )
3849 const size_t kbegin( ( IsLower_v<MT5> )
3850 ?( ( IsUpper_v<MT4> )
3851 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3852 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3853 :( IsUpper_v<MT4> ? i : 0UL ) );
3854 const size_t kend( ( IsUpper_v<MT5> )
3855 ?( ( IsLower_v<MT4> )
3856 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3857 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3858 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Two-step unrolled k-loop; an odd trailing k is handled by the one-step loop that follows.
3867 for( ; (k+2UL) <= kend; k+=2UL ) {
3868 const SIMDType a1( A.load(i ,k ) );
3870 const SIMDType a3( A.load(i ,k+1UL) );
3886 for( ; k<kend; ++k ) {
// Combine the primary accumulators with the second accumulator set of the unrolled loop.
3897 C.store( i , j , xmm1+xmm5 );
3898 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
3899 C.store( i , j+1UL, xmm3+xmm7 );
3900 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
3905 const size_t kbegin( ( IsLower_v<MT5> )
3906 ?( ( IsUpper_v<MT4> )
3907 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3908 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3909 :( IsUpper_v<MT4> ? i : 0UL ) );
3910 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
3917 for( ; (k+2UL) <= kend; k+=2UL ) {
3920 xmm1 -= A.load(i ,k ) * b1;
3921 xmm2 -= A.load(i+
SIMDSIZE,k ) * b1;
3922 xmm3 -= A.load(i ,k+1UL) * b2;
3923 xmm4 -= A.load(i+
SIMDSIZE,k+1UL) * b2;
3926 for( ; k<kend; ++k ) {
3928 xmm1 -= A.load(i ,k) * b1;
3932 C.store( i , j, xmm1+xmm3 );
3933 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Row panel of a single SIMD vector: columns in groups of 4, 3, 2 and finally 1,
// each with a two-step unrolled k-loop followed by a one-step remainder.
3940 size_t j(
UPP ? i : 0UL );
3942 for( ; (j+4UL) <= jend; j+=4UL )
3944 const size_t kbegin( ( IsLower_v<MT5> )
3945 ?( ( IsUpper_v<MT4> )
3946 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3947 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3948 :( IsUpper_v<MT4> ? i : 0UL ) );
3949 const size_t kend( ( IsUpper_v<MT5> )
3950 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
3960 for( ; (k+2UL) <= kend; k+=2UL ) {
3962 const SIMDType a2( A.load(i,k+1UL) );
3963 xmm1 -= a1 *
set( B(k ,j ) );
3964 xmm2 -= a1 *
set( B(k ,j+1UL) );
3965 xmm3 -= a1 *
set( B(k ,j+2UL) );
3966 xmm4 -= a1 *
set( B(k ,j+3UL) );
3967 xmm5 -= a2 *
set( B(k+1UL,j ) );
3968 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
3969 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
3970 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
3973 for( ; k<kend; ++k ) {
3975 xmm1 -= a1 *
set( B(k,j ) );
3976 xmm2 -= a1 *
set( B(k,j+1UL) );
3977 xmm3 -= a1 *
set( B(k,j+2UL) );
3978 xmm4 -= a1 *
set( B(k,j+3UL) );
3981 C.store( i, j , xmm1+xmm5 );
3982 C.store( i, j+1UL, xmm2+xmm6 );
3983 C.store( i, j+2UL, xmm3+xmm7 );
3984 C.store( i, j+3UL, xmm4+xmm8 );
3987 for( ; (j+3UL) <= jend; j+=3UL )
3989 const size_t kbegin( ( IsLower_v<MT5> )
3990 ?( ( IsUpper_v<MT4> )
3991 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3992 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3993 :( IsUpper_v<MT4> ? i : 0UL ) );
3994 const size_t kend( ( IsUpper_v<MT5> )
3995 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4004 for( ; (k+2UL) <= kend; k+=2UL ) {
4006 const SIMDType a2( A.load(i,k+1UL) );
4007 xmm1 -= a1 *
set( B(k ,j ) );
4008 xmm2 -= a1 *
set( B(k ,j+1UL) );
4009 xmm3 -= a1 *
set( B(k ,j+2UL) );
4010 xmm4 -= a2 *
set( B(k+1UL,j ) );
4011 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
4012 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
4015 for( ; k<kend; ++k ) {
4017 xmm1 -= a1 *
set( B(k,j ) );
4018 xmm2 -= a1 *
set( B(k,j+1UL) );
4019 xmm3 -= a1 *
set( B(k,j+2UL) );
4022 C.store( i, j , xmm1+xmm4 );
4023 C.store( i, j+1UL, xmm2+xmm5 );
4024 C.store( i, j+2UL, xmm3+xmm6 );
4027 for( ; (j+2UL) <= jend; j+=2UL )
4029 const size_t kbegin( ( IsLower_v<MT5> )
4030 ?( ( IsUpper_v<MT4> )
4031 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4032 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4033 :( IsUpper_v<MT4> ? i : 0UL ) );
4034 const size_t kend( ( IsUpper_v<MT5> )
4035 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4043 for( ; (k+2UL) <= kend; k+=2UL ) {
4045 const SIMDType a2( A.load(i,k+1UL) );
4046 xmm1 -= a1 *
set( B(k ,j ) );
4047 xmm2 -= a1 *
set( B(k ,j+1UL) );
4048 xmm3 -= a2 *
set( B(k+1UL,j ) );
4049 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
4052 for( ; k<kend; ++k ) {
4054 xmm1 -= a1 *
set( B(k,j ) );
4055 xmm2 -= a1 *
set( B(k,j+1UL) );
4058 C.store( i, j , xmm1+xmm3 );
4059 C.store( i, j+1UL, xmm2+xmm4 );
4064 const size_t kbegin( ( IsLower_v<MT5> )
4065 ?( ( IsUpper_v<MT4> )
4066 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4067 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4068 :( IsUpper_v<MT4> ? i : 0UL ) );
4074 for( ; (k+2UL) <= K; k+=2UL ) {
4075 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
4076 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
4080 xmm1 -= A.load(i,k) *
set( B(k,j) );
4083 C.store( i, j, xmm1+xmm2 );
// Scalar tail over the remaining rows, executed only when 'remainder' is set.
4087 for( ; remainder && i<M; ++i )
4089 const size_t jend(
LOW ? i+1UL : N );
4090 size_t j(
UPP ? i : 0UL );
4092 for( ; (j+2UL) <= jend; j+=2UL )
4094 const size_t kbegin( ( IsLower_v<MT5> )
4095 ?( ( IsUpper_v<MT4> )
4096 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4097 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4098 :( IsUpper_v<MT4> ? i : 0UL ) );
4099 const size_t kend( ( IsUpper_v<MT5> )
4100 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4106 for(
size_t k=kbegin; k<kend; ++k ) {
4107 value1 -= A(i,k) * B(k,j );
4108 value2 -= A(i,k) * B(k,j+1UL);
4112 C(i,j+1UL) = value2;
4117 const size_t kbegin( ( IsLower_v<MT5> )
4118 ?( ( IsUpper_v<MT4> )
4119 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4120 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4121 :( IsUpper_v<MT4> ? i : 0UL ) );
4125 for(
size_t k=kbegin; k<K; ++k ) {
4126 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel, non-vectorized overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false; it forwards
// to the default subtraction-assignment kernel.
4150 template<
typename MT3
4153 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4154 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4156 selectDefaultSubAssignKernel( C, A, B );
4176 template<
typename MT3
4179 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4180 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction-assignment kernel, fallback overload.
// Selected when UseBlasKernel_v<MT3,MT4,MT5> is false (DisableIf_t).
4206 template<
typename MT3
4209 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4210 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// BLAS is not applicable here, so hand over to the large-matrix kernel selection.
4212 selectLargeSubAssignKernel( C, A, B );
// BLAS subtraction-assignment kernel, BLAS-backed overload.
// Only compiled when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// both set, and only selected when UseBlasKernel_v<MT3,MT4,MT5> is true (EnableIf_t).
4218 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4232 template<
typename MT3
4235 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4236 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4238 using ET = ElementType_t<MT3>;
// Left operand triangular: copy B (evaluated serially) into a temporary, let trmm
// combine it with A from the left with factor ET(1), then subtract the temporary from C.
// NOTE(review): presumably trmm overwrites tmp with A*tmp -- confirm against the trmm wrapper.
4240 if( IsTriangular_v<MT4> ) {
4241 ResultType_t<MT3> tmp(
serial( B ) );
4242 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4243 subAssign( C, tmp );
// Right operand triangular: same scheme with a copy of A and a right-sided trmm with B.
4245 else if( IsTriangular_v<MT5> ) {
4246 ResultType_t<MT3> tmp(
serial( A ) );
4247 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4248 subAssign( C, tmp );
// General case: a single gemm call with factors ET(-1) and ET(1).
// NOTE(review): presumably computes C = -A*B + C (usual gemm alpha/beta convention) -- confirm.
4251 gemm( C, A, B, ET(-1), ET(1) );
// Restructuring subtraction assignment (the function name line is not part of this excerpt;
// the body forwards to subAssign). Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> holds,
// i.e. the target is row-major and at least one operand is symmetric.
4274 template<
typename MT >
4276 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// fwd re-applies the structure declaration chosen by ForwardFunctor to the rewritten product.
4285 const ForwardFunctor fwd;
// Both operands symmetric: transpose both before multiplying.
4287 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4288 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand is known to be symmetric: transpose just that one.
4289 else if( IsSymmetric_v<MT1> )
4290 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case under the enable condition: the right operand is the symmetric one.
4292 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
4314 template<
typename MT
4316 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
4328 schurAssign( ~lhs, tmp );
4357 template<
typename MT
4360 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4367 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4370 else if( rhs.lhs_.columns() == 0UL ) {
4406 template<
typename MT
4409 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4413 using TmpType = If_t< SO, ResultType, OppositeType >;
4425 const ForwardFunctor fwd;
4427 const TmpType tmp( rhs );
4448 template<
typename MT >
4450 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4459 const ForwardFunctor fwd;
4461 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4463 else if( IsSymmetric_v<MT1> )
4487 template<
typename MT
4490 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4497 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4532 template<
typename MT >
4534 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4543 const ForwardFunctor fwd;
4545 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4547 else if( IsSymmetric_v<MT1> )
4575 template<
typename MT
4578 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4585 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4620 template<
typename MT >
4622 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4631 const ForwardFunctor fwd;
4633 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4635 else if( IsSymmetric_v<MT1> )
4661 template<
typename MT
4721 template<
typename MT1
4728 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4729 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4730 ,
private Computation
4735 using MMM = TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4737 using RES = ResultType_t<MMM>;
4738 using RT1 = ResultType_t<MT1>;
4739 using RT2 = ResultType_t<MT2>;
4740 using ET1 = ElementType_t<RT1>;
4741 using ET2 = ElementType_t<RT2>;
4742 using CT1 = CompositeType_t<MT1>;
4743 using CT2 = CompositeType_t<MT2>;
// Compile-time switch: true when the left operand MT1 is itself a computation or
// reports that it requires evaluation.
4748 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Same criterion for the right operand MT2.
4753 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Normalized structure flags derived from the SF/HF/LF/UF template arguments.
// SYM: SF is set and none of HF, LF, UF is.
4757 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is.
4758 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is combined with SF or HF.
// NOTE(review): presumably because a symmetric/Hermitian upper matrix is diagonal and
// therefore also lower -- confirm.
4759 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: mirror image of LOW (UF, or LF combined with SF or HF).
4760 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// True when the target type T1 is row-major and at least one of the operand
// types T2, T3 is symmetric.
4769 template<
typename T1,
typename T2,
typename T3 >
4770 static constexpr
bool CanExploitSymmetry_v =
4771 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
// True when either operand must be evaluated (evaluateLeft / evaluateRight) and the
// symmetry-based restructuring above does not apply.
4779 template<
typename T1,
typename T2,
typename T3 >
4780 static constexpr
bool IsEvaluationRequired_v =
4781 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
// Compile-time predicate gating the BLAS kernels for target T1, operands T2/T3 and
// scalar type T4. The first line of the condition is not part of this excerpt.
4788 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4789 static constexpr
bool UseBlasKernel_v =
// No structure declaration may be active on the product.
4791 !SYM && !HERM && !LOW && !UPP &&
// All three matrices need contiguous storage with direct data access
// (mutable for the target, const for the operands).
4792 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4793 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4794 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
// Diagonal operands are excluded.
4795 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4796 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All element types must be BLAS compatible and identical.
4797 IsBLASCompatible_v< ElementType_t<T1> > &&
4798 IsBLASCompatible_v< ElementType_t<T2> > &&
4799 IsBLASCompatible_v< ElementType_t<T3> > &&
4800 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4801 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
// A complex scalar together with a builtin element type is rejected.
4802 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time predicate gating the vectorized default kernels for target T1,
// operands T2/T3 and scalar type T4. Requires useOptimizedKernels.
4809 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4810 static constexpr
bool UseVectorizedDefaultKernel_v =
4811 ( useOptimizedKernels &&
// Only the left operand is excluded for being diagonal here (the BLAS predicate
// excludes both operands).
4812 !IsDiagonal_v<T2> &&
4813 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4814 IsSIMDCombinable_v< ElementType_t<T1>
// NOTE(review): the SIMD addition check pairs T2's element type with itself and the
// multiplication check pairs T3's element type with itself, whereas the class-level
// simdEnabled flag checks the mixed pair <ET1,ET2> -- confirm this is intended.
4818 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T2> > &&
4819 HasSIMDMult_v< ElementType_t<T3>, ElementType_t<T3> > );
4826 using ForwardFunctor =
If_t< HERM
4842 using This = DMatScalarMultExpr<MMM,ST,true>;
4845 using BaseType = DenseMatrix<This,true>;
4849 , DeclHermTrait< MultTrait_t<RES,ST> >
4851 , DeclSymTrait< MultTrait_t<RES,ST> >
4854 , DeclDiagTrait< MultTrait_t<RES,ST> >
4855 , DeclLowTrait< MultTrait_t<RES,ST> > >
4857 , DeclUppTrait< MultTrait_t<RES,ST> >
4858 , MultTrait<RES,ST> > > > >::Type;
4863 using SIMDType = SIMDTrait_t<ElementType>;
4868 using LeftOperand =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4874 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4877 using RT = If_t< evaluateRight, const RT2, CT2 >;
4883 ( !IsDiagonal_v<MT1> &&
4884 MT1::simdEnabled && MT2::simdEnabled &&
4885 IsSIMDCombinable_v<ET1,ET2,ST> &&
4886 HasSIMDAdd_v<ET1,ET2> &&
4887 HasSIMDMult_v<ET1,ET2> );
4891 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4937 if( j >=
matrix_.columns() ) {
4940 return (*
this)(i,j);
4949 inline size_t rows()
const {
4959 inline size_t columns()
const {
4990 template<
typename T >
4991 inline bool canAlias(
const T* alias )
const {
4992 return matrix_.canAlias( alias );
5002 template<
typename T >
5003 inline bool isAliased(
const T* alias )
const {
5004 return matrix_.isAliased( alias );
5025 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5027 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
5028 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
// Assignment of the scaled product to a dense matrix (the function name line is not part
// of this excerpt). Disabled when the symmetry-based restructuring applies.
5050 template<
typename MT
5053 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped matrix/matrix multiplication.
5060 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5061 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special-cased: empty target (zero rows or zero columns).
5063 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special-cased: empty inner dimension (the left operand has no columns).
5066 else if( left.columns() == 0UL ) {
// Regular case: dispatch to the kernel selection, passing the scalar along.
// A and B are defined on lines not shown here (presumably the evaluated operands).
5081 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses between the small-matrix kernel and the BLAS kernel chain for C = (A*B)*scalar.
5096 template<
typename MT3
5100 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used when A is diagonal, when (outside debug mode) A has at most
// SIMDSIZE*10 rows, or when the target has fewer than TDMATTDMATMULT_THRESHOLD elements.
5102 if( ( IsDiagonal_v<MT4> ) ||
5103 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
5104 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5105 selectSmallAssignKernel( C, A, B, scalar );
// Otherwise fall through to the BLAS kernel selection.
5107 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-vectorized) assignment kernel for the case that neither operand
// is diagonal. Works column by column on C and restricts the k and i ranges according
// to the triangular structure of A (MT4), B (MT5) and the declared result structure.
5125 template<
typename MT3
5129 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5130 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5132 const size_t M( A.rows() );
5133 const size_t N( B.columns() );
5134 const size_t K( A.columns() );
5138 for(
size_t j=0UL; j<N; ++j )
// k range for column j of B: starts at or below the diagonal for lower B,
// ends at or above the diagonal for upper B.
5140 const size_t kbegin( ( IsLower_v<MT5> )
5141 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5143 const size_t kend( ( IsUpper_v<MT5> )
5144 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Strictly triangular B with an empty k range: the whole column is handled in a
// loop over all rows (loop body not shown in this excerpt).
5148 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
5149 for(
size_t i=0UL; i<M; ++i ) {
// Row range for the first k iteration (k == kbegin), narrowed by the structure of A
// and by the LOW/UPP declaration.
5156 const size_t ibegin( ( IsLower_v<MT4> )
5157 ?( ( IsStrictlyLower_v<MT4> )
5158 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
5159 :( LOW ?
max(j,kbegin) : kbegin ) )
5160 :( LOW ? j : 0UL ) );
5161 const size_t iend( ( IsUpper_v<MT4> )
5162 ?( ( IsStrictlyUpper_v<MT4> )
5163 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
5164 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
5165 :( UPP ? j+1UL : M ) );
// Rows before ibegin are visited separately (loop body not shown in this excerpt).
5167 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5168 for(
size_t i=0UL; i<ibegin; ++i ) {
5172 else if( IsStrictlyLower_v<MT4> ) {
// First k iteration initializes C(i,j) by plain assignment rather than accumulation.
5175 for(
size_t i=ibegin; i<iend; ++i ) {
5176 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on are visited separately (loop body not shown in this excerpt).
5178 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5179 for(
size_t i=iend; i<M; ++i ) {
5183 else if( IsStrictlyUpper_v<MT4> ) {
5184 reset( C(M-1UL,j) );
// Remaining k iterations accumulate into the already initialized column.
5188 for(
size_t k=kbegin+1UL; k<kend; ++k )
5190 const size_t ibegin( ( IsLower_v<MT4> )
5191 ?( ( IsStrictlyLower_v<MT4> )
5192 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
5193 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
5194 :( SYM || HERM || LOW ? j : 0UL ) );
5195 const size_t iend( ( IsUpper_v<MT4> )
5196 ?( ( IsStrictlyUpper_v<MT4> )
5197 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
5198 :( UPP ?
min(j+1UL,k) : k ) )
5199 :( UPP ? j+1UL : M ) );
// With a declared structure the clamped range can be empty; skip this k then.
5201 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5204 for(
size_t i=ibegin; i<iend; ++i ) {
5205 C(i,j) += A(i,k) * B(k,j);
// For upper A, row iend is written for the first time at this k, hence plain assignment.
5207 if( IsUpper_v<MT4> ) {
5208 C(iend,j) = A(iend,k) * B(k,j);
// Final pass over the populated row range of column j (loop body not shown in this
// excerpt; presumably where the scalar factor is applied -- TODO confirm).
5213 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
5214 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
5215 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5216 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5217 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
5218 :( UPP ? j+1UL : M ) );
5220 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5223 for(
size_t i=ibegin; i<iend; ++i ) {
// Fill the upper part from the lower part: a plain copy, or the conjugate for HERM
// (the guard that enables this pass is on a line not shown here).
5230 for(
size_t j=1UL; j<N; ++j ) {
5231 for(
size_t i=0UL; i<j; ++i ) {
5232 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel for a non-diagonal left operand A and a diagonal right
// operand B: only B(j,j) contributes to column j, so C(i,j) = A(i,j) * B(j,j) * scalar.
5253 template<
typename MT3
5257 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5258 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5262 const size_t M( A.rows() );
5263 const size_t N( B.columns() );
5265 for(
size_t j=0UL; j<N; ++j )
// Row range of column j that A's triangular structure leaves populated.
5267 const size_t ibegin( ( IsLower_v<MT4> )
5268 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5270 const size_t iend( ( IsUpper_v<MT4> )
5271 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows above the populated range (loop body not shown in this excerpt).
5275 if( IsLower_v<MT4> ) {
5276 for(
size_t i=0UL; i<ibegin; ++i ) {
5280 for(
size_t i=ibegin; i<iend; ++i ) {
5281 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows below the populated range (loop body not shown in this excerpt).
5283 if( IsUpper_v<MT4> ) {
5284 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for a diagonal left operand A and a non-diagonal right
// operand B: only A(i,i) contributes to row i, so C(i,j) = A(i,i) * B(i,j) * scalar.
5306 template<
typename MT3
5310 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5311 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5315 const size_t M( A.rows() );
5316 const size_t N( B.columns() );
5318 for(
size_t j=0UL; j<N; ++j )
// Row range of column j that B's triangular structure leaves populated.
5320 const size_t ibegin( ( IsLower_v<MT5> )
5321 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5323 const size_t iend( ( IsUpper_v<MT5> )
5324 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// NOTE(review): the guards here and below test MT4 (the diagonal operand per the enable
// condition) although the row bounds come from MT5. Presumably a diagonal type satisfies
// both IsLower_v and IsUpper_v so the guards always hold -- confirm.
5328 if( IsLower_v<MT4> ) {
5329 for(
size_t i=0UL; i<ibegin; ++i ) {
5333 for(
size_t i=ibegin; i<iend; ++i ) {
5334 C(i,j) = A(i,i) * B(i,j) * scalar;
5336 if( IsUpper_v<MT4> ) {
5337 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for two diagonal operands: only the diagonal of C is
// computed in the visible loop, as the scaled product of the two diagonal elements.
5359 template<
typename MT3
5363 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5364 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5370 for(
size_t i=0UL; i<A.rows(); ++i ) {
5371 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel, non-vectorized overload.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
5390 template<
typename MT3
5394 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5395 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Vectorization is not available: forward to the default kernel.
5397 selectDefaultAssignKernel( C, A, B, scalar );
// Small-matrix assignment kernel for a row-major target when vectorization is available.
// Instead of computing the product itself, it copies one operand into its opposite
// storage order and re-issues the scaled multiplication via assign().
5416 template<
typename MT3
5420 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5421 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5428 const ForwardFunctor fwd;
// A resizable, B not: convert B.
5430 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
5431 const OppositeType_t<MT5> tmp(
serial( B ) );
5432 assign( C, fwd( A * tmp ) * scalar );
// B resizable, A not: convert A.
5434 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
5435 const OppositeType_t<MT4> tmp(
serial( A ) );
5436 assign( C, fwd( tmp * B ) * scalar );
// Otherwise convert whichever operand has fewer elements; B wins ties.
5438 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5439 const OppositeType_t<MT5> tmp(
serial( B ) );
5440 assign( C, fwd( A * tmp ) * scalar );
5443 const OppositeType_t<MT4> tmp(
serial( A ) );
5444 assign( C, fwd( tmp * B ) * scalar );
// Small-matrix assignment kernel for a column-major target when vectorization is
// available. This span covers the signature and the setup shared by the SIMD blocks below.
5464 template<
typename MT3
5468 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5469 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder pass over trailing rows is needed unless both C and A are padded.
5471 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5473 const size_t M( A.rows() );
5474 const size_t N( B.columns() );
5475 const size_t K( A.columns() );
// Last row index handled by SIMD code: M masked with -SIMDSIZE when a remainder pass
// exists, otherwise M itself.
// NOTE(review): the mask rounds M down to a multiple of SIMDSIZE only if SIMDSIZE is a
// power of two -- presumably guaranteed, confirm.
5479 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// The scalar broadcast into a SIMD register, multiplied in at store time.
5482 const SIMDType factor(
set( scalar ) );
5486 if( IsIntegral_v<ElementType> )
5488 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*7UL) < ipos; i+=
SIMDSIZE*8UL ) {
5489 for(
size_t j=0UL; j<N; ++j )
5491 const size_t kbegin( ( IsLower_v<MT5> )
5492 ?( ( IsUpper_v<MT4> )
5493 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5494 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5495 :( IsUpper_v<MT4> ? i : 0UL ) );
5496 const size_t kend( ( IsUpper_v<MT5> )
5497 ?( ( IsLower_v<MT4> )
5498 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5499 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5500 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
5502 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5504 for(
size_t k=kbegin; k<kend; ++k ) {
5505 const SIMDType b1(
set( B(k,j) ) );
5506 xmm1 += A.load(i ,k) * b1;
5507 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5508 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5509 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5510 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
5511 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
5512 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
5513 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
5516 C.store( i , j, xmm1 * factor );
5517 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5518 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5519 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5520 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
5521 C.store( i+
SIMDSIZE*5UL, j, xmm6 * factor );
5522 C.store( i+
SIMDSIZE*6UL, j, xmm7 * factor );
5523 C.store( i+
SIMDSIZE*7UL, j, xmm8 * factor );
5528 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*4UL) < ipos; i+=
SIMDSIZE*5UL )
5532 for( ; (j+2UL) <= N; j+=2UL )
5534 const size_t kbegin( ( IsLower_v<MT5> )
5535 ?( ( IsUpper_v<MT4> )
5536 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5537 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5538 :( IsUpper_v<MT4> ? i : 0UL ) );
5539 const size_t kend( ( IsUpper_v<MT5> )
5540 ?( ( IsLower_v<MT4> )
5541 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5542 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5543 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
5545 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5547 for(
size_t k=kbegin; k<kend; ++k ) {
5548 const SIMDType a1( A.load(i ,k) );
5549 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5550 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5551 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
5552 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
5553 const SIMDType b1(
set( B(k,j ) ) );
5554 const SIMDType b2(
set( B(k,j+1UL) ) );
5567 C.store( i , j , xmm1 * factor );
5568 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5569 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5570 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
5571 C.store( i+
SIMDSIZE*4UL, j , xmm5 * factor );
5572 C.store( i , j+1UL, xmm6 * factor );
5573 C.store( i+
SIMDSIZE , j+1UL, xmm7 * factor );
5574 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5575 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5576 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5581 const size_t kbegin( ( IsLower_v<MT5> )
5582 ?( ( IsUpper_v<MT4> )
5583 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5584 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5585 :( IsUpper_v<MT4> ? i : 0UL ) );
5586 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
5588 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5590 for(
size_t k=kbegin; k<kend; ++k ) {
5591 const SIMDType b1(
set( B(k,j) ) );
5592 xmm1 += A.load(i ,k) * b1;
5593 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5594 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5595 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5596 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
5599 C.store( i , j, xmm1 * factor );
5600 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5601 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5602 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5603 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
5609 const size_t jend( LOW ?
min(i+
SIMDSIZE*4UL,N) : N );
5615 for(
size_t ii=i; ii<iiend; ++ii ) {
5616 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
5623 for(
size_t ii=i; ii<iiend; ++ii ) {
5629 for( ; (j+2UL) <= jend; j+=2UL )
5631 const size_t kbegin( ( IsLower_v<MT5> )
5632 ?( ( IsUpper_v<MT4> )
5633 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5634 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5635 :( IsUpper_v<MT4> ? i : 0UL ) );
5636 const size_t kend( ( IsUpper_v<MT5> )
5637 ?( ( IsLower_v<MT4> )
5638 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5639 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5640 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
5642 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5644 for(
size_t k=kbegin; k<kend; ++k ) {
5645 const SIMDType a1( A.load(i ,k) );
5646 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5647 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5648 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
5649 const SIMDType b1(
set( B(k,j ) ) );
5650 const SIMDType b2(
set( B(k,j+1UL) ) );
5661 C.store( i , j , xmm1 * factor );
5662 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5663 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5664 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
5665 C.store( i , j+1UL, xmm5 * factor );
5666 C.store( i+
SIMDSIZE , j+1UL, xmm6 * factor );
5667 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5668 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5673 const size_t kbegin( ( IsLower_v<MT5> )
5674 ?( ( IsUpper_v<MT4> )
5675 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5676 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5677 :( IsUpper_v<MT4> ? i : 0UL ) );
5678 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
5680 SIMDType xmm1, xmm2, xmm3, xmm4;
5682 for(
size_t k=kbegin; k<kend; ++k ) {
5683 const SIMDType b1(
set( B(k,j) ) );
5684 xmm1 += A.load(i ,k) * b1;
5685 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5686 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5687 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5690 C.store( i , j, xmm1 * factor );
5691 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5692 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5693 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5701 for(
size_t ii=i; ii<iiend; ++ii ) {
5710 const size_t jend( LOW ?
min(i+
SIMDSIZE*3UL,N) : N );
5716 for(
size_t ii=i; ii<iiend; ++ii ) {
5717 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
5724 for(
size_t ii=i; ii<iiend; ++ii ) {
5730 for( ; (j+2UL) <= jend; j+=2UL )
5732 const size_t kbegin( ( IsLower_v<MT5> )
5733 ?( ( IsUpper_v<MT4> )
5734 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5735 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5736 :( IsUpper_v<MT4> ? i : 0UL ) );
5737 const size_t kend( ( IsUpper_v<MT5> )
5738 ?( ( IsLower_v<MT4> )
5739 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5740 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5741 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
5743 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5745 for(
size_t k=kbegin; k<kend; ++k ) {
5746 const SIMDType a1( A.load(i ,k) );
5747 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5748 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5749 const SIMDType b1(
set( B(k,j ) ) );
5750 const SIMDType b2(
set( B(k,j+1UL) ) );
5759 C.store( i , j , xmm1 * factor );
5760 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5761 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5762 C.store( i , j+1UL, xmm4 * factor );
5763 C.store( i+
SIMDSIZE , j+1UL, xmm5 * factor );
5764 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5769 const size_t kbegin( ( IsLower_v<MT5> )
5770 ?( ( IsUpper_v<MT4> )
5771 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5772 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5773 :( IsUpper_v<MT4> ? i : 0UL ) );
5774 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
5776 SIMDType xmm1, xmm2, xmm3;
5778 for(
size_t k=kbegin; k<kend; ++k ) {
5779 const SIMDType b1(
set( B(k,j) ) );
5780 xmm1 += A.load(i ,k) * b1;
5781 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5782 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5785 C.store( i , j, xmm1 * factor );
5786 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5787 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5795 for(
size_t ii=i; ii<iiend; ++ii ) {
5804 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
5810 for(
size_t ii=i; ii<iiend; ++ii ) {
5811 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
5818 for(
size_t ii=i; ii<iiend; ++ii ) {
5824 for( ; (j+4UL) <= jend; j+=4UL )
5826 const size_t kbegin( ( IsLower_v<MT5> )
5827 ?( ( IsUpper_v<MT4> )
5828 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5829 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5830 :( IsUpper_v<MT4> ? i : 0UL ) );
5831 const size_t kend( ( IsUpper_v<MT5> )
5832 ?( ( IsLower_v<MT4> )
5833 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
5834 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
5835 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
5837 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5839 for(
size_t k=kbegin; k<kend; ++k ) {
5840 const SIMDType a1( A.load(i ,k) );
5841 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
5842 const SIMDType b1(
set( B(k,j ) ) );
5843 const SIMDType b2(
set( B(k,j+1UL) ) );
5844 const SIMDType b3(
set( B(k,j+2UL) ) );
5845 const SIMDType b4(
set( B(k,j+3UL) ) );
5856 C.store( i , j , xmm1 * factor );
5857 C.store( i+
SIMDSIZE, j , xmm2 * factor );
5858 C.store( i , j+1UL, xmm3 * factor );
5859 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
5860 C.store( i , j+2UL, xmm5 * factor );
5861 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
5862 C.store( i , j+3UL, xmm7 * factor );
5863 C.store( i+
SIMDSIZE, j+3UL, xmm8 * factor );
5866 for( ; (j+3UL) <= jend; j+=3UL )
5868 const size_t kbegin( ( IsLower_v<MT5> )
5869 ?( ( IsUpper_v<MT4> )
5870 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5871 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5872 :( IsUpper_v<MT4> ? i : 0UL ) );
5873 const size_t kend( ( IsUpper_v<MT5> )
5874 ?( ( IsLower_v<MT4> )
5875 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
5876 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
5877 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
5879 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5881 for(
size_t k=kbegin; k<kend; ++k ) {
5882 const SIMDType a1( A.load(i ,k) );
5883 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
5884 const SIMDType b1(
set( B(k,j ) ) );
5885 const SIMDType b2(
set( B(k,j+1UL) ) );
5886 const SIMDType b3(
set( B(k,j+2UL) ) );
5895 C.store( i , j , xmm1 * factor );
5896 C.store( i+
SIMDSIZE, j , xmm2 * factor );
5897 C.store( i , j+1UL, xmm3 * factor );
5898 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
5899 C.store( i , j+2UL, xmm5 * factor );
5900 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
5903 for( ; (j+2UL) <= jend; j+=2UL )
5905 const size_t kbegin( ( IsLower_v<MT5> )
5906 ?( ( IsUpper_v<MT4> )
5907 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5908 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5909 :( IsUpper_v<MT4> ? i : 0UL ) );
5910 const size_t kend( ( IsUpper_v<MT5> )
5911 ?( ( IsLower_v<MT4> )
5912 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5913 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5914 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
5916 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5919 for( ; (k+2UL) <= kend; k+=2UL ) {
5920 const SIMDType a1( A.load(i ,k ) );
5921 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
5922 const SIMDType a3( A.load(i ,k+1UL) );
5923 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
5924 const SIMDType b1(
set( B(k ,j ) ) );
5925 const SIMDType b2(
set( B(k ,j+1UL) ) );
5926 const SIMDType b3(
set( B(k+1UL,j ) ) );
5927 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
5938 for( ; k<kend; ++k ) {
5939 const SIMDType a1( A.load(i ,k) );
5940 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
5941 const SIMDType b1(
set( B(k,j ) ) );
5942 const SIMDType b2(
set( B(k,j+1UL) ) );
5949 C.store( i , j , (xmm1+xmm5) * factor );
5950 C.store( i+
SIMDSIZE, j , (xmm2+xmm6) * factor );
5951 C.store( i , j+1UL, (xmm3+xmm7) * factor );
5952 C.store( i+
SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5957 const size_t kbegin( ( IsLower_v<MT5> )
5958 ?( ( IsUpper_v<MT4> )
5959 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5960 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5961 :( IsUpper_v<MT4> ? i : 0UL ) );
5962 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
5964 SIMDType xmm1, xmm2, xmm3, xmm4;
5967 for( ; (k+2UL) <= kend; k+=2UL ) {
5968 const SIMDType b1(
set( B(k ,j) ) );
5969 const SIMDType b2(
set( B(k+1UL,j) ) );
5970 xmm1 += A.load(i ,k ) * b1;
5971 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
5972 xmm3 += A.load(i ,k+1UL) * b2;
5973 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
5976 for( ; k<kend; ++k ) {
5977 const SIMDType b1(
set( B(k,j) ) );
5978 xmm1 += A.load(i ,k) * b1;
5982 C.store( i , j, (xmm1+xmm3) * factor );
5983 C.store( i+
SIMDSIZE, j, (xmm2+xmm4) * factor );
5991 for(
size_t ii=i; ii<iiend; ++ii ) {
6006 for(
size_t ii=i; ii<iiend; ++ii ) {
6007 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
6014 for(
size_t ii=i; ii<iiend; ++ii ) {
6020 for( ; (j+4UL) <= jend; j+=4UL )
6022 const size_t kbegin( ( IsLower_v<MT5> )
6023 ?( ( IsUpper_v<MT4> )
6024 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6025 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6026 :( IsUpper_v<MT4> ? i : 0UL ) );
6027 const size_t kend( ( IsUpper_v<MT5> )
6028 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6031 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6034 for( ; (k+2UL) <= kend; k+=2UL ) {
6035 const SIMDType a1( A.load(i,k ) );
6036 const SIMDType a2( A.load(i,k+1UL) );
6037 xmm1 += a1 *
set( B(k ,j ) );
6038 xmm2 += a1 *
set( B(k ,j+1UL) );
6039 xmm3 += a1 *
set( B(k ,j+2UL) );
6040 xmm4 += a1 *
set( B(k ,j+3UL) );
6041 xmm5 += a2 *
set( B(k+1UL,j ) );
6042 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
6043 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
6044 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
6047 for( ; k<kend; ++k ) {
6048 const SIMDType a1( A.load(i,k) );
6049 xmm1 += a1 *
set( B(k,j ) );
6050 xmm2 += a1 *
set( B(k,j+1UL) );
6051 xmm3 += a1 *
set( B(k,j+2UL) );
6052 xmm4 += a1 *
set( B(k,j+3UL) );
6055 C.store( i, j , (xmm1+xmm5) * factor );
6056 C.store( i, j+1UL, (xmm2+xmm6) * factor );
6057 C.store( i, j+2UL, (xmm3+xmm7) * factor );
6058 C.store( i, j+3UL, (xmm4+xmm8) * factor );
6061 for( ; (j+3UL) <= jend; j+=3UL )
6063 const size_t kbegin( ( IsLower_v<MT5> )
6064 ?( ( IsUpper_v<MT4> )
6065 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6066 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6067 :( IsUpper_v<MT4> ? i : 0UL ) );
6068 const size_t kend( ( IsUpper_v<MT5> )
6069 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6072 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6075 for( ; (k+2UL) <= kend; k+=2UL ) {
6076 const SIMDType a1( A.load(i,k ) );
6077 const SIMDType a2( A.load(i,k+1UL) );
6078 xmm1 += a1 *
set( B(k ,j ) );
6079 xmm2 += a1 *
set( B(k ,j+1UL) );
6080 xmm3 += a1 *
set( B(k ,j+2UL) );
6081 xmm4 += a2 *
set( B(k+1UL,j ) );
6082 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
6083 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
6086 for( ; k<kend; ++k ) {
6087 const SIMDType a1( A.load(i,k) );
6088 xmm1 += a1 *
set( B(k,j ) );
6089 xmm2 += a1 *
set( B(k,j+1UL) );
6090 xmm3 += a1 *
set( B(k,j+2UL) );
6093 C.store( i, j , (xmm1+xmm4) * factor );
6094 C.store( i, j+1UL, (xmm2+xmm5) * factor );
6095 C.store( i, j+2UL, (xmm3+xmm6) * factor );
// Single-SIMD-width row block, two columns of C per iteration.
6098 for( ; (j+2UL) <= jend; j+=2UL )
// k range narrowed by lower B / upper A (start) and upper B (end).
6100 const size_t kbegin( ( IsLower_v<MT5> )
6101 ?( ( IsUpper_v<MT4> )
6102 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6103 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6104 :( IsUpper_v<MT4> ? i : 0UL ) );
6105 const size_t kend( ( IsUpper_v<MT5> )
6106 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
// xmm1/xmm2 accumulate columns j and j+1; xmm3/xmm4 are the second accumulator pair
// used by the two-step loop and folded in at store time.
6109 SIMDType xmm1, xmm2, xmm3, xmm4;
// NOTE(review): this one-step k loop runs BEFORE the two-step loop and only stops at
// k == kend, so the unrolled loop below can never execute. The sibling j+=4 and j+=3
// nests of this kernel place the unrolled loop first and the one-step loop last as the
// remainder. Looks like the order was swapped by accident -- confirm. The stored result
// should be unaffected provided SIMDType default-initializes to zero (not visible here);
// only the unrolling is lost.
6112 for( ; k<kend; ++k ) {
6113 const SIMDType a1( A.load(i,k) );
6114 xmm1 += a1 *
set( B(k,j ) );
6115 xmm2 += a1 *
set( B(k,j+1UL) );
// Two k steps per iteration, using separate accumulators for the second step.
6118 for( ; (k+2UL) <= kend; k+=2UL ) {
6119 const SIMDType a1( A.load(i,k ) );
6120 const SIMDType a2( A.load(i,k+1UL) );
6121 xmm1 += a1 *
set( B(k ,j ) );
6122 xmm2 += a1 *
set( B(k ,j+1UL) );
6123 xmm3 += a2 *
set( B(k+1UL,j ) );
6124 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
// Combine both accumulator pairs and apply the scalar factor on store.
6127 C.store( i, j , (xmm1+xmm3) * factor );
6128 C.store( i, j+1UL, (xmm2+xmm4) * factor );
6133 const size_t kbegin( ( IsLower_v<MT5> )
6134 ?( ( IsUpper_v<MT4> )
6135 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6136 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6137 :( IsUpper_v<MT4> ? i : 0UL ) );
6139 SIMDType xmm1, xmm2;
6142 for( ; (k+2UL) <= K; k+=2UL ) {
6143 xmm1 += A.load(i,k ) *
set( B(k ,j) );
6144 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
6148 xmm1 += A.load(i,k) *
set( B(k,j) );
6151 C.store( i, j, (xmm1+xmm2) * factor );
6159 for(
size_t ii=i; ii<iiend; ++ii ) {
// Scalar remainder pass: handles the rows the SIMD blocks did not cover, one row at a
// time. Skipped entirely at compile time when 'remainder' is false.
6166 for( ; remainder && i<M; ++i )
// Mirror an already computed element: plain copy, or the conjugate for HERM
// (the enclosing guard and loop are on lines not shown here).
6172 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Two columns of C per iteration.
6181 for( ; (j+2UL) <= N; j+=2UL )
6183 const size_t kbegin( ( IsLower_v<MT5> )
6184 ?( ( IsUpper_v<MT4> )
6185 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6186 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6187 :( IsUpper_v<MT4> ? i : 0UL ) );
6188 const size_t kend( ( IsUpper_v<MT5> )
6189 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
// Plain dot products of row i of A with columns j and j+1 of B.
6195 for(
size_t k=kbegin; k<kend; ++k ) {
6196 value1 += A(i,k) * B(k,j );
6197 value2 += A(i,k) * B(k,j+1UL);
// The scalar is applied once per element at store time.
6200 C(i,j ) = value1 * scalar;
6201 C(i,j+1UL) = value2 * scalar;
// Single leftover column (the enclosing condition is on a line not shown here).
6206 const size_t kbegin( ( IsLower_v<MT5> )
6207 ?( ( IsUpper_v<MT4> )
6208 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6209 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6210 :( IsUpper_v<MT4> ? i : 0UL ) );
6214 for(
size_t k=kbegin; k<K; ++k ) {
6215 value += A(i,k) * B(k,j);
6218 C(i,j) = value * scalar;
// Large-matrix assignment kernel, non-vectorized overload.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
6238 template<
typename MT3
6242 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6243 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// No vectorized large kernel for this case: forward to the default kernel.
6245 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, vectorized overload.
// Selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t).
6264 template<
typename MT3
6268 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6269 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Dispatches to exactly one of the five multiplication routines below. The selecting
// conditions are on lines not shown in this excerpt.
// NOTE(review): presumably keyed on SYM / HERM / LOW / UPP in that order, with mmm as
// the unstructured fallback -- confirm.
6272 smmm( C, A, B, scalar );
6274 hmmm( C, A, B, scalar );
// The last three routines take an additional ST2(0) argument.
// NOTE(review): presumably the factor applied to the existing content of C -- confirm.
6276 lmmm( C, A, B, scalar, ST2(0) );
6278 ummm( C, A, B, scalar, ST2(0) );
6280 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel, fallback overload.
// Chosen when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, passed on unchanged
6298 template<
typename MT3
6302 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6303 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// No BLAS kernel applies to these types: use the large-matrix kernel instead.
6305 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assignment kernel, BLAS-backed overload.
// Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set, and only enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, converted to the element type of C before use
6310 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6324 template<
typename MT3
6328 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6329 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// ET: element type of the target matrix, used to convert the scalar factors.
6331 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm works on C with A applied from the left; the
// lower/upper flag follows IsLower_v<MT4>.
// NOTE(review): trmm operates in place, so C presumably receives a copy of B
// on a line that is not part of this extract - confirm.
6333 if( IsTriangular_v<MT4> ) {
6335 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular right operand: same idea with B applied from the right.
// NOTE(review): C presumably receives a copy of A beforehand - confirm.
6337 else if( IsTriangular_v<MT5> ) {
6339 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm with factors ET(scalar) and ET(0).
// NOTE(review): presumably C = scalar*A*B with the old contents of C discarded - confirm.
6342 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment through an evaluated temporary.
// NOTE(review): the signature line is not part of this extract; judging by the
// body this is an assign() overload taking `lhs` and `rhs` - confirm the
// target type against the full header.
// Disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds (DisableIf_t).
6360 template<
typename MT
6363 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Temporary type: ResultType if SO is true, OppositeType otherwise.
6367 using TmpType = If_t< SO, ResultType, OppositeType >;
// Functor applied to the temporary before it is assigned.
6379 const ForwardFunctor fwd;
// Evaluate the whole expression into the temporary (wrapped in serial()),
// then assign the forwarded temporary to the target.
6381 const TmpType tmp(
serial( rhs ) );
6382 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment for the case in which symmetry can be exploited.
// NOTE(review): the signature line is not part of this extract; the body
// suggests an assign() overload taking `lhs` and `rhs` - confirm.
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> holds, i.e. (see the class
// head) the target is row-major and at least one operand is symmetric.
6400 template<
typename MT >
6402 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Functor applied to each restructured product before scaling.
6411 const ForwardFunctor fwd;
// Left and right operands of the wrapped multiplication expression.
6413 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6414 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Every symmetric operand is wrapped in trans(); the restructured product is
// then scaled by rhs.scalar_ and assigned to the target.
6416 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6417 assign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
6418 else if( IsSymmetric_v<MT1> )
6419 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
// Remaining case: only the right operand is wrapped in trans().
6421 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Addition assignment of the scaled matrix product to a dense matrix.
// Disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds; that case is served by
// a separate overload.
//   lhs : dense target matrix
//   rhs : scaled multiplication expression to be added
6437 template<
typename MT
6439 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6440 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Operands of the wrapped multiplication expression.
6447 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6448 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes (empty target or zero inner dimension) are handled
// separately; the body of this branch is not part of this extract.
6450 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection.
// NOTE(review): A and B are declared on lines missing from this extract;
// presumably the evaluated `left` and `right` operands - confirm.
6464 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += scalar * A * B.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor
6479 template<
typename MT3
6483 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used when any of the following holds:
//  - the left operand is diagonal,
//  - not in debug mode and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
6485 if( ( IsDiagonal_v<MT4> ) ||
6486 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
6487 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6488 selectSmallAddAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection takes over.
6490 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for two non-diagonal operands.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor
6508 template<
typename MT3
6512 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6513 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Add a temporary to C.
// NOTE(review): `tmp` is declared on a line missing from this extract;
// presumably it holds the evaluated scaled product of A and B - confirm.
6516 addAssign( C, tmp );
// Default addition assignment kernel for a general left operand and a
// diagonal right operand: C(i,j) += A(i,j) * B(j,j) * scalar.
//   C      : target matrix
//   A      : non-diagonal left operand
//   B      : diagonal right operand (only B(j,j) is read)
//   scalar : scaling factor
6534 template<
typename MT3
6538 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6539 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6543 const size_t M( A.rows() );
6544 const size_t N( B.columns() );
// Column-wise traversal of the target.
6546 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) for column j, narrowed by the lower/upper traits of
// A. The alternatives for an unstructured A are on lines missing from this
// extract (presumably 0 and M - confirm).
6548 const size_t ibegin( ( IsLower_v<MT4> )
6549 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
6551 const size_t iend( ( IsUpper_v<MT4> )
6552 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of the range that is processed two rows at a
// time: ibegin plus the row count with its lowest bit cleared.
6556 const size_t inum( iend - ibegin );
6557 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6559 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6560 C(i ,j) += A(i ,j) * B(j,j) * scalar;
6561 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update for row ipos.
// NOTE(review): the guard around this statement is not visible; presumably it
// only runs when ipos < iend - confirm.
6564 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition assignment kernel for a diagonal left operand and a
// general right operand: C(i,j) += A(i,i) * B(i,j) * scalar.
//   C      : target matrix
//   A      : diagonal left operand (only A(i,i) is read)
//   B      : non-diagonal right operand
//   scalar : scaling factor
6584 template<
typename MT3
6588 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6589 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6593 const size_t M( A.rows() );
6594 const size_t N( B.columns() );
// Column-wise traversal of the target.
6596 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) for column j, narrowed by the lower/upper traits of
// B. The alternatives for an unstructured B are on lines missing from this
// extract (presumably 0 and M - confirm).
6598 const size_t ibegin( ( IsLower_v<MT5> )
6599 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6601 const size_t iend( ( IsUpper_v<MT5> )
6602 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part of the range that is processed two rows at a
// time: ibegin plus the row count with its lowest bit cleared.
6606 const size_t inum( iend - ibegin );
6607 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
6609 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6610 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6611 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for row ipos.
// NOTE(review): the guard around this statement is not visible; presumably it
// only runs when ipos < iend - confirm.
6614 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition assignment kernel for two diagonal operands.
//   C      : target matrix
//   A, B   : diagonal operands (only their diagonal elements are read)
//   scalar : scaling factor
6634 template<
typename MT3
6638 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6639 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Only the diagonal of C is updated: C(i,i) += A(i,i) * B(i,i) * scalar.
6643 for(
size_t i=0UL; i<A.rows(); ++i ) {
6644 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel, non-vectorized overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, passed on unchanged
6663 template<
typename MT3
6667 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6668 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward to the default addition assignment kernel.
6670 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix addition assignment kernel for row-major targets when the
// vectorized default kernel is usable.
// One operand is first copied into a temporary of its OppositeType, and the
// resulting product is then added to C through addAssign().
//   C      : row-major target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor applied to the product
6689 template<
typename MT3
6693 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6694 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Functor applied to the product before scaling.
6701 const ForwardFunctor fwd;
// If exactly one operand type is resizable, the non-resizable operand is the
// one that gets converted.
6703 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6704 const OppositeType_t<MT5> tmp(
serial( B ) );
6705 addAssign( C, fwd( A * tmp ) * scalar );
6707 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6708 const OppositeType_t<MT4> tmp(
serial( A ) );
6709 addAssign( C, fwd( tmp * B ) * scalar );
// Otherwise the operand with fewer elements is converted; B wins a tie.
6711 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6712 const OppositeType_t<MT5> tmp(
serial( B ) );
6713 addAssign( C, fwd( A * tmp ) * scalar );
6716 const OppositeType_t<MT4> tmp(
serial( A ) );
6717 addAssign( C, fwd( tmp * B ) * scalar );
// Small-matrix addition assignment kernel for column-major targets when the
// vectorized default kernel is usable: C += scalar * A * B.
// The visible code works on panels of rows of A/C that are 8, 5, 4, 3, 2 and 1
// SIMD vectors tall, combined with 1 to 4 columns of B/C, followed by a scalar
// loop over the remaining rows. Each panel accumulates over k in SIMD
// accumulators, multiplies by the broadcast scalar factor and adds the result
// to what is already stored in C.
// NOTE(review): the enclosing row loops (and several statements of each panel)
// are on lines missing from this extract; the panel structure described here is
// read off the SIMDSIZE multipliers that are visible.
//   C      : column-major target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor
6737 template<
typename MT3
6741 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6742 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both C and A are padded.
6744 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
6746 const size_t M( A.rows() );
6747 const size_t N( B.columns() );
6748 const size_t K( A.columns() );
// ipos: end of the rows handled by the SIMD panels. With a remainder loop this
// is M with its low bits masked off (M rounded down to a multiple of SIMDSIZE,
// for power-of-two SIMDSIZE), otherwise M itself.
6752 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Scalar factor broadcast into a SIMD vector.
6755 const SIMDType factor(
set( scalar ) );
// Integral element types get special treatment.
// NOTE(review): the statement controlled by this condition is not fully
// visible; presumably it is the 8-vector panel that follows - confirm.
6759 if( IsIntegral_v<ElementType> )
// --- Panel: 8 SIMD vectors of rows x 1 column ------------------------------
6762 for(
size_t j=0UL; j<N; ++j )
// k range for this panel: kbegin follows the lower structure of B and the
// upper structure of A; kend follows the upper structure of B and, for lower A,
// is capped at i+SIMDSIZE*8 and K.
6764 const size_t kbegin( ( IsLower_v<MT5> )
6765 ?( ( IsUpper_v<MT4> )
6766 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6767 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6768 :( IsUpper_v<MT4> ? i : 0UL ) );
6769 const size_t kend( ( IsUpper_v<MT5> )
6770 ?( ( IsLower_v<MT4> )
6771 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
6772 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
6773 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMD vector of the panel.
6775 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Broadcast B(k,j) once and multiply it with eight consecutive vectors of
// column k of A.
6777 for(
size_t k=kbegin; k<kend; ++k ) {
6778 const SIMDType b1(
set( B(k,j) ) );
6779 xmm1 += A.load(i ,k) * b1;
6780 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6781 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6782 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
6783 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
6784 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
6785 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
6786 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Add the scaled accumulator to the current contents of C (the stores for the
// other accumulators are not part of this extract).
6789 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// --- Panel: 5 SIMD vectors of rows x 2 columns -----------------------------
6805 for( ; (j+2UL) <= N; j+=2UL )
6807 const size_t kbegin( ( IsLower_v<MT5> )
6808 ?( ( IsUpper_v<MT4> )
6809 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6810 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6811 :( IsUpper_v<MT4> ? i : 0UL ) );
// kend covers both columns j and j+1 of an upper B.
6812 const size_t kend( ( IsUpper_v<MT5> )
6813 ?( ( IsLower_v<MT4> )
6814 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6815 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6816 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// Judging by the visible stores, xmm1.. go to column j and xmm6..xmm10 to
// column j+1.
6818 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
// Load five vectors of column k of A and broadcast the two B values; the
// multiply-accumulate statements are not part of this extract.
6820 for(
size_t k=kbegin; k<kend; ++k ) {
6821 const SIMDType a1( A.load(i ,k) );
6822 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
6823 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
6824 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
6825 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
6826 const SIMDType b1(
set( B(k,j ) ) );
6827 const SIMDType b2(
set( B(k,j+1UL) ) );
6840 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6845 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
6847 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6848 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6849 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// --- Panel: 5 SIMD vectors of rows x 1 column ------------------------------
// Uses the same kbegin; kend no longer depends on the structure of B.
6854 const size_t kbegin( ( IsLower_v<MT5> )
6855 ?( ( IsUpper_v<MT4> )
6856 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6857 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6858 :( IsUpper_v<MT4> ? i : 0UL ) );
6859 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
6861 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6863 for(
size_t k=kbegin; k<kend; ++k ) {
6864 const SIMDType b1(
set( B(k,j) ) );
6865 xmm1 += A.load(i ,k) * b1;
6866 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6867 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6868 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
6869 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
6872 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// --- Panel: 4 SIMD vectors of rows x 2 columns -----------------------------
6884 for( ; (j+2UL) <= N; j+=2UL )
6886 const size_t kbegin( ( IsLower_v<MT5> )
6887 ?( ( IsUpper_v<MT4> )
6888 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6889 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6890 :( IsUpper_v<MT4> ? i : 0UL ) );
6891 const size_t kend( ( IsUpper_v<MT5> )
6892 ?( ( IsLower_v<MT4> )
6893 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6894 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6895 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
// Judging by the visible stores, xmm1.. go to column j and xmm5..xmm8 to
// column j+1.
6897 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6899 for(
size_t k=kbegin; k<kend; ++k ) {
6900 const SIMDType a1( A.load(i ,k) );
6901 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
6902 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
6903 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
6904 const SIMDType b1(
set( B(k,j ) ) );
6905 const SIMDType b2(
set( B(k,j+1UL) ) );
6916 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6920 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
6922 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6923 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// --- Panel: 4 SIMD vectors of rows x 1 column ------------------------------
6928 const size_t kbegin( ( IsLower_v<MT5> )
6929 ?( ( IsUpper_v<MT4> )
6930 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6931 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6932 :( IsUpper_v<MT4> ? i : 0UL ) );
6933 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
6935 SIMDType xmm1, xmm2, xmm3, xmm4;
6937 for(
size_t k=kbegin; k<kend; ++k ) {
6938 const SIMDType b1(
set( B(k,j) ) );
6939 xmm1 += A.load(i ,k) * b1;
6940 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6941 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6942 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
6945 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// --- Panel: 3 SIMD vectors of rows x 2 columns -----------------------------
6956 for( ; (j+2UL) <= N; j+=2UL )
6958 const size_t kbegin( ( IsLower_v<MT5> )
6959 ?( ( IsUpper_v<MT4> )
6960 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6961 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6962 :( IsUpper_v<MT4> ? i : 0UL ) );
6963 const size_t kend( ( IsUpper_v<MT5> )
6964 ?( ( IsLower_v<MT4> )
6965 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6966 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6967 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
// Judging by the visible stores, xmm1.. go to column j and xmm4..xmm6 to
// column j+1.
6969 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6971 for(
size_t k=kbegin; k<kend; ++k ) {
6972 const SIMDType a1( A.load(i ,k) );
6973 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
6974 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
6975 const SIMDType b1(
set( B(k,j ) ) );
6976 const SIMDType b2(
set( B(k,j+1UL) ) );
6985 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6988 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
6990 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// --- Panel: 3 SIMD vectors of rows x 1 column ------------------------------
6995 const size_t kbegin( ( IsLower_v<MT5> )
6996 ?( ( IsUpper_v<MT4> )
6997 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6998 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6999 :( IsUpper_v<MT4> ? i : 0UL ) );
7000 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
7002 SIMDType xmm1, xmm2, xmm3;
7004 for(
size_t k=kbegin; k<kend; ++k ) {
7005 const SIMDType b1(
set( B(k,j) ) );
7006 xmm1 += A.load(i ,k) * b1;
7007 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7008 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7011 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// --- Panels: 2 SIMD vectors of rows ----------------------------------------
// Column range for these panels: a LOW result stops at min(i+SIMDSIZE*2,N), an
// UPP result starts at column i.
7019 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
7020 size_t j( UPP ? i : 0UL );
// 2 vectors x 4 columns.
7022 for( ; (j+4UL) <= jend; j+=4UL )
7024 const size_t kbegin( ( IsLower_v<MT5> )
7025 ?( ( IsUpper_v<MT4> )
7026 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7027 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7028 :( IsUpper_v<MT4> ? i : 0UL ) );
7029 const size_t kend( ( IsUpper_v<MT5> )
7030 ?( ( IsLower_v<MT4> )
7031 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
7032 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
7033 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
7035 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7037 for(
size_t k=kbegin; k<kend; ++k ) {
7038 const SIMDType a1( A.load(i ,k) );
7039 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
7040 const SIMDType b1(
set( B(k,j ) ) );
7041 const SIMDType b2(
set( B(k,j+1UL) ) );
7042 const SIMDType b3(
set( B(k,j+2UL) ) );
7043 const SIMDType b4(
set( B(k,j+3UL) ) );
7054 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7056 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7058 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
7060 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
// 2 vectors x 3 columns.
7064 for( ; (j+3UL) <= jend; j+=3UL )
7066 const size_t kbegin( ( IsLower_v<MT5> )
7067 ?( ( IsUpper_v<MT4> )
7068 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7069 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7070 :( IsUpper_v<MT4> ? i : 0UL ) );
7071 const size_t kend( ( IsUpper_v<MT5> )
7072 ?( ( IsLower_v<MT4> )
7073 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
7074 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
7075 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
7077 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7079 for(
size_t k=kbegin; k<kend; ++k ) {
7080 const SIMDType a1( A.load(i ,k) );
7081 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
7082 const SIMDType b1(
set( B(k,j ) ) );
7083 const SIMDType b2(
set( B(k,j+1UL) ) );
7084 const SIMDType b3(
set( B(k,j+2UL) ) );
7093 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7095 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7097 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
// 2 vectors x 2 columns, with the k loop unrolled by two.
7101 for( ; (j+2UL) <= jend; j+=2UL )
7103 const size_t kbegin( ( IsLower_v<MT5> )
7104 ?( ( IsUpper_v<MT4> )
7105 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7106 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7107 :( IsUpper_v<MT4> ? i : 0UL ) );
7108 const size_t kend( ( IsUpper_v<MT5> )
7109 ?( ( IsLower_v<MT4> )
7110 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7111 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7112 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// The visible stores combine the accumulators pairwise (xmm1+xmm5, xmm3+xmm7,
// xmm4+xmm8), one of each pair per unrolled k step.
7114 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Two k steps per iteration (the declaration of k is not part of this extract).
7117 for( ; (k+2UL) <= kend; k+=2UL ) {
7118 const SIMDType a1( A.load(i ,k ) );
7119 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
7120 const SIMDType a3( A.load(i ,k+1UL) );
7121 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
7122 const SIMDType b1(
set( B(k ,j ) ) );
7123 const SIMDType b2(
set( B(k ,j+1UL) ) );
7124 const SIMDType b3(
set( B(k+1UL,j ) ) );
7125 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// Leftover k step.
7136 for( ; k<kend; ++k ) {
7137 const SIMDType a1( A.load(i ,k) );
7138 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
7139 const SIMDType b1(
set( B(k,j ) ) );
7140 const SIMDType b2(
set( B(k,j+1UL) ) );
7147 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
7149 C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
7150 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
// 2 vectors x 1 column, k unrolled by two.
7155 const size_t kbegin( ( IsLower_v<MT5> )
7156 ?( ( IsUpper_v<MT4> )
7157 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7158 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7159 :( IsUpper_v<MT4> ? i : 0UL ) );
7160 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
7162 SIMDType xmm1, xmm2, xmm3, xmm4;
7165 for( ; (k+2UL) <= kend; k+=2UL ) {
7166 const SIMDType b1(
set( B(k ,j) ) );
7167 const SIMDType b2(
set( B(k+1UL,j) ) );
7168 xmm1 += A.load(i ,k ) * b1;
7169 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
7170 xmm3 += A.load(i ,k+1UL) * b2;
7171 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
7174 for( ; k<kend; ++k ) {
7175 const SIMDType b1(
set( B(k,j) ) );
7176 xmm1 += A.load(i ,k) * b1;
7180 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
// --- Panels: 1 SIMD vector of rows -----------------------------------------
// Column range: only a result that is both LOW and UPP stops at
// min(i+SIMDSIZE,N); an UPP result starts at column i.
7187 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
7188 size_t j( UPP ? i : 0UL );
// 1 vector x 4 columns, k unrolled by two.
7190 for( ; (j+4UL) <= jend; j+=4UL )
7192 const size_t kbegin( ( IsLower_v<MT5> )
7193 ?( ( IsUpper_v<MT4> )
7194 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7195 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7196 :( IsUpper_v<MT4> ? i : 0UL ) );
// kend for an upper B; the alternative branch is not part of this extract.
7197 const size_t kend( ( IsUpper_v<MT5> )
7198 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
// xmm1..xmm4 collect the even k steps, xmm5..xmm8 the odd ones.
7201 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7204 for( ; (k+2UL) <= kend; k+=2UL ) {
7205 const SIMDType a1( A.load(i,k ) );
7206 const SIMDType a2( A.load(i,k+1UL) );
7207 xmm1 += a1 *
set( B(k ,j ) );
7208 xmm2 += a1 *
set( B(k ,j+1UL) );
7209 xmm3 += a1 *
set( B(k ,j+2UL) );
7210 xmm4 += a1 *
set( B(k ,j+3UL) );
7211 xmm5 += a2 *
set( B(k+1UL,j ) );
7212 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
7213 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
7214 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
// Leftover k step.
7217 for( ; k<kend; ++k ) {
7218 const SIMDType a1( A.load(i,k) );
7219 xmm1 += a1 *
set( B(k,j ) );
7220 xmm2 += a1 *
set( B(k,j+1UL) );
7221 xmm3 += a1 *
set( B(k,j+2UL) );
7222 xmm4 += a1 *
set( B(k,j+3UL) );
// Merge the two partial sums per column, scale, and add to C.
7225 C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
7226 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
7227 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
7228 C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
// 1 vector x 3 columns, k unrolled by two.
7231 for( ; (j+3UL) <= jend; j+=3UL )
7233 const size_t kbegin( ( IsLower_v<MT5> )
7234 ?( ( IsUpper_v<MT4> )
7235 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7236 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7237 :( IsUpper_v<MT4> ? i : 0UL ) );
7238 const size_t kend( ( IsUpper_v<MT5> )
7239 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
7242 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7245 for( ; (k+2UL) <= kend; k+=2UL ) {
7246 const SIMDType a1( A.load(i,k ) );
7247 const SIMDType a2( A.load(i,k+1UL) );
7248 xmm1 += a1 *
set( B(k ,j ) );
7249 xmm2 += a1 *
set( B(k ,j+1UL) );
7250 xmm3 += a1 *
set( B(k ,j+2UL) );
7251 xmm4 += a2 *
set( B(k+1UL,j ) );
7252 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
7253 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
7256 for( ; k<kend; ++k ) {
7257 const SIMDType a1( A.load(i,k) );
7258 xmm1 += a1 *
set( B(k,j ) );
7259 xmm2 += a1 *
set( B(k,j+1UL) );
7260 xmm3 += a1 *
set( B(k,j+2UL) );
7263 C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
7264 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
7265 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
// 1 vector x 2 columns, k unrolled by two.
7268 for( ; (j+2UL) <= jend; j+=2UL )
7270 const size_t kbegin( ( IsLower_v<MT5> )
7271 ?( ( IsUpper_v<MT4> )
7272 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7273 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7274 :( IsUpper_v<MT4> ? i : 0UL ) );
7275 const size_t kend( ( IsUpper_v<MT5> )
7276 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7279 SIMDType xmm1, xmm2, xmm3, xmm4;
7282 for( ; (k+2UL) <= kend; k+=2UL ) {
7283 const SIMDType a1( A.load(i,k ) );
7284 const SIMDType a2( A.load(i,k+1UL) );
7285 xmm1 += a1 *
set( B(k ,j ) );
7286 xmm2 += a1 *
set( B(k ,j+1UL) );
7287 xmm3 += a2 *
set( B(k+1UL,j ) );
7288 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
7291 for( ; k<kend; ++k ) {
7292 const SIMDType a1( A.load(i,k) );
7293 xmm1 += a1 *
set( B(k,j ) );
7294 xmm2 += a1 *
set( B(k,j+1UL) );
7297 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
7298 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
// 1 vector x 1 column; here k runs up to K rather than a structure-dependent
// kend.
7303 const size_t kbegin( ( IsLower_v<MT5> )
7304 ?( ( IsUpper_v<MT4> )
7305 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7306 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7307 :( IsUpper_v<MT4> ? i : 0UL ) );
7309 SIMDType xmm1, xmm2;
7312 for( ; (k+2UL) <= K; k+=2UL ) {
7313 xmm1 += A.load(i,k ) *
set( B(k ,j) );
7314 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
// Leftover k step (its guard is not part of this extract).
7318 xmm1 += A.load(i,k) *
set( B(k,j) );
7321 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// --- Scalar remainder: rows not covered by the SIMD panels -----------------
// Runs only when `remainder` is true; works on individual elements.
7325 for( ; remainder && i<M; ++i )
// Column range for row i: a LOW result stops after the diagonal, an UPP result
// starts at the diagonal.
7327 const size_t jend( LOW ? i+1UL : N );
7328 size_t j( UPP ? i : 0UL );
// Two columns at a time.
7330 for( ; (j+2UL) <= jend; j+=2UL )
7332 const size_t kbegin( ( IsLower_v<MT5> )
7333 ?( ( IsUpper_v<MT4> )
7334 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7335 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7336 :( IsUpper_v<MT4> ? i : 0UL ) );
7337 const size_t kend( ( IsUpper_v<MT5> )
7338 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
// Plain dot products for C(i,j) and C(i,j+1) (the declarations of value1 and
// value2 are not part of this extract).
7344 for(
size_t k=kbegin; k<kend; ++k ) {
7345 value1 += A(i,k) * B(k,j );
7346 value2 += A(i,k) * B(k,j+1UL);
7349 C(i,j ) += value1 * scalar;
7350 C(i,j+1UL) += value2 * scalar;
// Single column.
7355 const size_t kbegin( ( IsLower_v<MT5> )
7356 ?( ( IsUpper_v<MT4> )
7357 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7358 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7359 :( IsUpper_v<MT4> ? i : 0UL ) );
7363 for(
size_t k=kbegin; k<K; ++k ) {
7364 value += A(i,k) * B(k,j);
7367 C(i,j) += value * scalar;
// Large-matrix addition assignment kernel, non-vectorized overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, passed on unchanged
7387 template<
typename MT3
7391 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7392 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward to the default addition assignment kernel.
7394 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel, vectorized overload.
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor handed to the selected routine
7413 template<
typename MT3
7417 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7418 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Exactly one of the following routines is meant to run; the conditions
// choosing between them are not part of this extract.
// NOTE(review): presumably keyed on LOW / UPP - confirm. The trailing ST2(1)
// looks like the weight of the existing contents of C (1 = accumulate); the
// plain assignment variant passes ST2(0) in the same position.
7421 lmmm( C, A, B, scalar, ST2(1) );
7423 ummm( C, A, B, scalar, ST2(1) );
7425 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition assignment kernel, fallback overload.
// Chosen when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, passed on unchanged
7444 template<
typename MT3
7448 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7449 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// No BLAS kernel applies: use the large-matrix addition assignment kernel.
7451 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS addition assignment kernel, BLAS-backed overload.
// Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set, and only enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, converted to the element type of C before use
7456 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7470 template<
typename MT3
7474 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7475 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// ET: element type of the target matrix, used to convert the scalar factors.
7477 using ET = ElementType_t<MT3>;
// Triangular left operand: copy B into a temporary, run trmm on the temporary
// with A applied from the left, then add the temporary to C.
7479 if( IsTriangular_v<MT4> ) {
7480 ResultType_t<MT3> tmp(
serial( B ) );
7481 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7482 addAssign( C, tmp );
// Triangular right operand: copy A into a temporary, run trmm on the temporary
// with B applied from the right, then add the temporary to C.
7484 else if( IsTriangular_v<MT5> ) {
7485 ResultType_t<MT3> tmp(
serial( A ) );
7486 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7487 addAssign( C, tmp );
// General case: gemm directly on C with factors ET(scalar) and ET(1).
// NOTE(review): presumably C = scalar*A*B + C - confirm against the gemm docs.
7490 gemm( C, A, B,
ET(scalar),
ET(1) );
// Restructuring addition assignment for the case in which symmetry can be
// exploited.
// NOTE(review): the signature line is not part of this extract; the body
// suggests an addAssign() overload taking `lhs` and `rhs` - confirm.
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> holds, i.e. (see the class
// head) the target is row-major and at least one operand is symmetric.
7511 template<
typename MT >
7513 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Functor applied to each restructured product before scaling.
7522 const ForwardFunctor fwd;
// Left and right operands of the wrapped multiplication expression.
7524 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7525 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Every symmetric operand is wrapped in trans(); the restructured product is
// then scaled by rhs.scalar_ and added to the target.
7527 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7528 addAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
7529 else if( IsSymmetric_v<MT1> )
7530 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
// Remaining case: only the right operand is wrapped in trans().
7532 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Subtraction assignment of the scaled matrix product to a dense matrix.
// Disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
//   lhs : dense target matrix
//   rhs : scaled multiplication expression to be subtracted
7552 template<
typename MT
7554 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7555 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Operands of the wrapped multiplication expression.
7562 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7563 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes (empty target or zero inner dimension) are handled
// separately; the body of this branch is not part of this extract.
7565 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection.
// NOTE(review): A and B are declared on lines missing from this extract;
// presumably the evaluated `left` and `right` operands - confirm.
7579 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= scalar * A * B.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor
7594 template<
typename MT3
7598 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used when any of the following holds:
//  - the left operand is diagonal,
//  - not in debug mode and A has at most SIMDSIZE*10 rows,
//  - the number of elements of C is below TDMATTDMATMULT_THRESHOLD.
7600 if( ( IsDiagonal_v<MT4> ) ||
7601 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
7602 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7603 selectSmallSubAssignKernel( C, A, B, scalar );
// Otherwise the BLAS kernel selection takes over.
7605 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for two non-diagonal operands.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor
7623 template<
typename MT3
7627 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7628 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Subtract a temporary from C.
// NOTE(review): `tmp` is declared on a line missing from this extract;
// presumably it holds the evaluated scaled product of A and B - confirm.
7631 subAssign( C, tmp );
// Default subtraction assignment kernel for a general left operand and a
// diagonal right operand: C(i,j) -= A(i,j) * B(j,j) * scalar.
//   C      : target matrix
//   A      : non-diagonal left operand
//   B      : diagonal right operand (only B(j,j) is read)
//   scalar : scaling factor
7649 template<
typename MT3
7653 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7654 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7658 const size_t M( A.rows() );
7659 const size_t N( B.columns() );
// Column-wise traversal of the target.
7661 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) for column j, narrowed by the lower/upper traits of
// A. The alternatives for an unstructured A are on lines missing from this
// extract (presumably 0 and M - confirm).
7663 const size_t ibegin( ( IsLower_v<MT4> )
7664 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7666 const size_t iend( ( IsUpper_v<MT4> )
7667 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of the range that is processed two rows at a
// time: ibegin plus the row count with its lowest bit cleared.
7671 const size_t inum( iend - ibegin );
7672 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
7674 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7675 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7676 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update for row ipos.
// NOTE(review): the guard around this statement is not visible; presumably it
// only runs when ipos < iend - confirm.
7679 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel for a diagonal left operand and a
// general right operand: C(i,j) -= A(i,i) * B(i,j) * scalar.
//   C      : target matrix
//   A      : diagonal left operand (only A(i,i) is read)
//   B      : non-diagonal right operand
//   scalar : scaling factor
7699 template<
typename MT3
7703 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7704 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7708 const size_t M( A.rows() );
7709 const size_t N( B.columns() );
// Column-wise traversal of the target.
7711 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) for column j, narrowed by the lower/upper traits of
// B. The alternatives for an unstructured B are on lines missing from this
// extract (presumably 0 and M - confirm).
7713 const size_t ibegin( ( IsLower_v<MT5> )
7714 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7716 const size_t iend( ( IsUpper_v<MT5> )
7717 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part of the range that is processed two rows at a
// time: ibegin plus the row count with its lowest bit cleared.
7721 const size_t inum( iend - ibegin );
7722 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
7724 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7725 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7726 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for row ipos.
// NOTE(review): the guard around this statement is not visible; presumably it
// only runs when ipos < iend - confirm.
7729 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction assignment kernel for two diagonal operands.
//   C      : target matrix
//   A, B   : diagonal operands (only their diagonal elements are read)
//   scalar : scaling factor
7749 template<
typename MT3
7753 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7754 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Only the diagonal of C is updated: C(i,i) -= A(i,i) * B(i,i) * scalar.
7758 for(
size_t i=0UL; i<A.rows(); ++i ) {
7759 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel, non-vectorized overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
//   C      : target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor, passed on unchanged
7778 template<
typename MT3
7782 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7783 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward to the default subtraction assignment kernel.
7785 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small-matrix subtraction assignment kernel for row-major targets when the
// vectorized default kernel is usable.
// One operand is first copied into a temporary of its OppositeType, and the
// resulting product is then subtracted from C through subAssign().
//   C      : row-major target matrix
//   A, B   : left and right multiplication operands
//   scalar : scaling factor applied to the product
7804 template<
typename MT3
7808 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7809 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Functor applied to the product before scaling.
7816 const ForwardFunctor fwd;
// If exactly one operand type is resizable, the non-resizable operand is the
// one that gets converted.
7818 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7819 const OppositeType_t<MT5> tmp(
serial( B ) );
7820 subAssign( C, fwd( A * tmp ) * scalar );
7822 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7823 const OppositeType_t<MT4> tmp(
serial( A ) );
7824 subAssign( C, fwd( tmp * B ) * scalar );
// Otherwise the operand with fewer elements is converted; B wins a tie.
7826 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7827 const OppositeType_t<MT5> tmp(
serial( B ) );
7828 subAssign( C, fwd( A * tmp ) * scalar );
7831 const OppositeType_t<MT4> tmp(
serial( A ) );
7832 subAssign( C, fwd( tmp * B ) * scalar );
// Small-matrix kernel for C -= A * B * scalar, overload for a column-major target C when the
// vectorized default kernel is usable.
//   C       target matrix, read and written with C.load()/C.store()
//   A       left operand, read with A.load(i,k) (SIMD vectors along the row index i)
//   B       right operand, read element-wise as B(k,j) and expanded with set()
//   scalar  scaling factor, applied to the accumulated product before the subtraction
// The visible sections process 8, 5, 4, 3, 2 and 1 SIMD vectors of rows at a time, followed
// by a scalar loop for the remaining rows.
// NOTE(review): this listing elides many lines (the outer loops over i, braces, most of the
// accumulate/store statements); the section comments below describe only what is visible.
7852 template<
typename MT3
7856 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7857 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is required unless both C and A are padded.
7859 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
7861 const size_t M( A.rows() );
7862 const size_t N( B.columns() );
7863 const size_t K( A.columns() );
// With remainder handling, ipos is M with the low bits masked off, i.e. M rounded down to a
// multiple of SIMDSIZE (assumes SIMDSIZE is a power of two - TODO confirm); otherwise M.
7867 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// SIMD vector built from the scalar via set(); multiplied onto every accumulator on store.
7870 const SIMDType factor(
set( scalar ) );
// The section that follows this check handles eight SIMD vectors of rows per column j.
7874 if( IsIntegral_v<ElementType> )
7877 for(
size_t j=0UL; j<N; ++j )
// kbegin/kend (repeated in every section below) narrow the k range using the compile-time
// triangular structure of A (MT4) and B (MT5): lower/upper traits of B bound k by j,
// upper/lower traits of A bound k by the current row block starting at i.
7879 const size_t kbegin( ( IsLower_v<MT5> )
7880 ?( ( IsUpper_v<MT4> )
7881 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7882 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7883 :( IsUpper_v<MT4> ? i : 0UL ) );
7884 const size_t kend( ( IsUpper_v<MT5> )
7885 ?( ( IsLower_v<MT4> )
7886 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7887 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
7888 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
7890 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7892 for(
size_t k=kbegin; k<kend; ++k ) {
7893 const SIMDType b1(
set( B(k,j) ) );
7894 xmm1 += A.load(i ,k) * b1;
7895 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7896 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7897 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7898 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
7899 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
7900 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
7901 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
7904 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Section with five SIMD vectors of rows: columns are handled in pairs (j, j+1) first.
7920 for( ; (j+2UL) <= N; j+=2UL )
7922 const size_t kbegin( ( IsLower_v<MT5> )
7923 ?( ( IsUpper_v<MT4> )
7924 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7925 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7926 :( IsUpper_v<MT4> ? i : 0UL ) );
7927 const size_t kend( ( IsUpper_v<MT5> )
7928 ?( ( IsLower_v<MT4> )
7929 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7930 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7931 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
7933 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7935 for(
size_t k=kbegin; k<kend; ++k ) {
7936 const SIMDType a1( A.load(i ,k) );
7937 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7938 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7939 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
7940 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
7941 const SIMDType b1(
set( B(k,j ) ) );
7942 const SIMDType b2(
set( B(k,j+1UL) ) );
7955 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7960 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
7962 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7963 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7964 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
// Five SIMD vectors of rows, single leftover column j (kend no longer depends on j).
7969 const size_t kbegin( ( IsLower_v<MT5> )
7970 ?( ( IsUpper_v<MT4> )
7971 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7972 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7973 :( IsUpper_v<MT4> ? i : 0UL ) );
7974 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
7976 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7978 for(
size_t k=kbegin; k<kend; ++k ) {
7979 const SIMDType b1(
set( B(k,j) ) );
7980 xmm1 += A.load(i ,k) * b1;
7981 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7982 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7983 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7984 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
7987 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Section with four SIMD vectors of rows: column pairs first, then a single leftover column.
7999 for( ; (j+2UL) <= N; j+=2UL )
8001 const size_t kbegin( ( IsLower_v<MT5> )
8002 ?( ( IsUpper_v<MT4> )
8003 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8004 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8005 :( IsUpper_v<MT4> ? i : 0UL ) );
8006 const size_t kend( ( IsUpper_v<MT5> )
8007 ?( ( IsLower_v<MT4> )
8008 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8009 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8010 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
8012 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8014 for(
size_t k=kbegin; k<kend; ++k ) {
8015 const SIMDType a1( A.load(i ,k) );
8016 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8017 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8018 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8019 const SIMDType b1(
set( B(k,j ) ) );
8020 const SIMDType b2(
set( B(k,j+1UL) ) );
8031 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8035 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
8037 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
8038 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
8043 const size_t kbegin( ( IsLower_v<MT5> )
8044 ?( ( IsUpper_v<MT4> )
8045 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8046 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8047 :( IsUpper_v<MT4> ? i : 0UL ) );
8048 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
8050 SIMDType xmm1, xmm2, xmm3, xmm4;
8052 for(
size_t k=kbegin; k<kend; ++k ) {
8053 const SIMDType b1(
set( B(k,j) ) );
8054 xmm1 += A.load(i ,k) * b1;
8055 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8056 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8057 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8060 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Section with three SIMD vectors of rows: column pairs first, then a single leftover column.
8071 for( ; (j+2UL) <= N; j+=2UL )
8073 const size_t kbegin( ( IsLower_v<MT5> )
8074 ?( ( IsUpper_v<MT4> )
8075 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8076 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8077 :( IsUpper_v<MT4> ? i : 0UL ) );
8078 const size_t kend( ( IsUpper_v<MT5> )
8079 ?( ( IsLower_v<MT4> )
8080 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8081 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8082 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
8084 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8086 for(
size_t k=kbegin; k<kend; ++k ) {
8087 const SIMDType a1( A.load(i ,k) );
8088 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8089 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8090 const SIMDType b1(
set( B(k,j ) ) );
8091 const SIMDType b2(
set( B(k,j+1UL) ) );
8100 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8103 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
8105 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
8110 const size_t kbegin( ( IsLower_v<MT5> )
8111 ?( ( IsUpper_v<MT4> )
8112 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8113 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8114 :( IsUpper_v<MT4> ? i : 0UL ) );
8115 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
8117 SIMDType xmm1, xmm2, xmm3;
8119 for(
size_t k=kbegin; k<kend; ++k ) {
8120 const SIMDType b1(
set( B(k,j) ) );
8121 xmm1 += A.load(i ,k) * b1;
8122 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8123 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8126 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// Section with two SIMD vectors of rows. From here on the column range honours the LOW/UPP
// flags of the expression: with LOW, columns from i+SIMDSIZE*2 onwards are skipped; with
// UPP, columns before i are skipped. Columns are handled in groups of 4, 3, 2 and then 1.
8134 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
8135 size_t j( UPP ? i : 0UL );
8137 for( ; (j+4UL) <= jend; j+=4UL )
8139 const size_t kbegin( ( IsLower_v<MT5> )
8140 ?( ( IsUpper_v<MT4> )
8141 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8142 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8143 :( IsUpper_v<MT4> ? i : 0UL ) );
8144 const size_t kend( ( IsUpper_v<MT5> )
8145 ?( ( IsLower_v<MT4> )
8146 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
8147 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
8148 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8150 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8152 for(
size_t k=kbegin; k<kend; ++k ) {
8153 const SIMDType a1( A.load(i ,k) );
8154 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8155 const SIMDType b1(
set( B(k,j ) ) );
8156 const SIMDType b2(
set( B(k,j+1UL) ) );
8157 const SIMDType b3(
set( B(k,j+2UL) ) );
8158 const SIMDType b4(
set( B(k,j+3UL) ) );
8169 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8171 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8173 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8175 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
8179 for( ; (j+3UL) <= jend; j+=3UL )
8181 const size_t kbegin( ( IsLower_v<MT5> )
8182 ?( ( IsUpper_v<MT4> )
8183 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8184 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8185 :( IsUpper_v<MT4> ? i : 0UL ) );
8186 const size_t kend( ( IsUpper_v<MT5> )
8187 ?( ( IsLower_v<MT4> )
8188 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8189 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8190 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8192 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8194 for(
size_t k=kbegin; k<kend; ++k ) {
8195 const SIMDType a1( A.load(i ,k) );
8196 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8197 const SIMDType b1(
set( B(k,j ) ) );
8198 const SIMDType b2(
set( B(k,j+1UL) ) );
8199 const SIMDType b3(
set( B(k,j+2UL) ) );
8208 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8210 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8212 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8216 for( ; (j+2UL) <= jend; j+=2UL )
8218 const size_t kbegin( ( IsLower_v<MT5> )
8219 ?( ( IsUpper_v<MT4> )
8220 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8221 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8222 :( IsUpper_v<MT4> ? i : 0UL ) );
8223 const size_t kend( ( IsUpper_v<MT5> )
8224 ?( ( IsLower_v<MT4> )
8225 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8226 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8227 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8229 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// The k loop is unrolled by two here: even and odd k feed separate accumulators, which are
// summed pairwise when the result is stored; a single-step loop handles a leftover k.
8232 for( ; (k+2UL) <= kend; k+=2UL ) {
8233 const SIMDType a1( A.load(i ,k ) );
8234 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
8235 const SIMDType a3( A.load(i ,k+1UL) );
8236 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
8237 const SIMDType b1(
set( B(k ,j ) ) );
8238 const SIMDType b2(
set( B(k ,j+1UL) ) );
8239 const SIMDType b3(
set( B(k+1UL,j ) ) );
8240 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
8251 for( ; k<kend; ++k ) {
8252 const SIMDType a1( A.load(i ,k) );
8253 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8254 const SIMDType b1(
set( B(k,j ) ) );
8255 const SIMDType b2(
set( B(k,j+1UL) ) );
8262 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8264 C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
8265 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
8270 const size_t kbegin( ( IsLower_v<MT5> )
8271 ?( ( IsUpper_v<MT4> )
8272 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8273 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8274 :( IsUpper_v<MT4> ? i : 0UL ) );
8275 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
8277 SIMDType xmm1, xmm2, xmm3, xmm4;
8280 for( ; (k+2UL) <= kend; k+=2UL ) {
8281 const SIMDType b1(
set( B(k ,j) ) );
8282 const SIMDType b2(
set( B(k+1UL,j) ) );
8283 xmm1 += A.load(i ,k ) * b1;
8284 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
8285 xmm3 += A.load(i ,k+1UL) * b2;
8286 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
8289 for( ; k<kend; ++k ) {
8290 const SIMDType b1(
set( B(k,j) ) );
8291 xmm1 += A.load(i ,k) * b1;
8295 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
// Section with a single SIMD vector of rows. The column range is cut at i+SIMDSIZE only if
// the result is both LOW and UPP; with UPP, columns before i are skipped.
8302 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
8303 size_t j( UPP ? i : 0UL );
8305 for( ; (j+4UL) <= jend; j+=4UL )
8307 const size_t kbegin( ( IsLower_v<MT5> )
8308 ?( ( IsUpper_v<MT4> )
8309 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8310 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8311 :( IsUpper_v<MT4> ? i : 0UL ) );
8312 const size_t kend( ( IsUpper_v<MT5> )
8313 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8316 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8319 for( ; (k+2UL) <= kend; k+=2UL ) {
8320 const SIMDType a1( A.load(i,k ) );
8321 const SIMDType a2( A.load(i,k+1UL) );
8322 xmm1 += a1 *
set( B(k ,j ) );
8323 xmm2 += a1 *
set( B(k ,j+1UL) );
8324 xmm3 += a1 *
set( B(k ,j+2UL) );
8325 xmm4 += a1 *
set( B(k ,j+3UL) );
8326 xmm5 += a2 *
set( B(k+1UL,j ) );
8327 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8328 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8329 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
8332 for( ; k<kend; ++k ) {
8333 const SIMDType a1( A.load(i,k) );
8334 xmm1 += a1 *
set( B(k,j ) );
8335 xmm2 += a1 *
set( B(k,j+1UL) );
8336 xmm3 += a1 *
set( B(k,j+2UL) );
8337 xmm4 += a1 *
set( B(k,j+3UL) );
8340 C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
8341 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
8342 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
8343 C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
8346 for( ; (j+3UL) <= jend; j+=3UL )
8348 const size_t kbegin( ( IsLower_v<MT5> )
8349 ?( ( IsUpper_v<MT4> )
8350 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8351 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8352 :( IsUpper_v<MT4> ? i : 0UL ) );
8353 const size_t kend( ( IsUpper_v<MT5> )
8354 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
8357 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8360 for( ; (k+2UL) <= kend; k+=2UL ) {
8361 const SIMDType a1( A.load(i,k ) );
8362 const SIMDType a2( A.load(i,k+1UL) );
8363 xmm1 += a1 *
set( B(k ,j ) );
8364 xmm2 += a1 *
set( B(k ,j+1UL) );
8365 xmm3 += a1 *
set( B(k ,j+2UL) );
8366 xmm4 += a2 *
set( B(k+1UL,j ) );
8367 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
8368 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
8371 for( ; k<kend; ++k ) {
8372 const SIMDType a1( A.load(i,k) );
8373 xmm1 += a1 *
set( B(k,j ) );
8374 xmm2 += a1 *
set( B(k,j+1UL) );
8375 xmm3 += a1 *
set( B(k,j+2UL) );
8378 C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
8379 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
8380 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
8383 for( ; (j+2UL) <= jend; j+=2UL )
8385 const size_t kbegin( ( IsLower_v<MT5> )
8386 ?( ( IsUpper_v<MT4> )
8387 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8388 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8389 :( IsUpper_v<MT4> ? i : 0UL ) );
8390 const size_t kend( ( IsUpper_v<MT5> )
8391 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8394 SIMDType xmm1, xmm2, xmm3, xmm4;
8397 for( ; (k+2UL) <= kend; k+=2UL ) {
8398 const SIMDType a1( A.load(i,k ) );
8399 const SIMDType a2( A.load(i,k+1UL) );
8400 xmm1 += a1 *
set( B(k ,j ) );
8401 xmm2 += a1 *
set( B(k ,j+1UL) );
8402 xmm3 += a2 *
set( B(k+1UL,j ) );
8403 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
8406 for( ; k<kend; ++k ) {
8407 const SIMDType a1( A.load(i,k) );
8408 xmm1 += a1 *
set( B(k,j ) );
8409 xmm2 += a1 *
set( B(k,j+1UL) );
8412 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
8413 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
// Single leftover column: no kend is computed here, the k loop runs up to K directly.
8418 const size_t kbegin( ( IsLower_v<MT5> )
8419 ?( ( IsUpper_v<MT4> )
8420 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8421 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8422 :( IsUpper_v<MT4> ? i : 0UL ) );
8424 SIMDType xmm1, xmm2;
8427 for( ; (k+2UL) <= K; k+=2UL ) {
8428 xmm1 += A.load(i,k ) *
set( B(k ,j) );
8429 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
8433 xmm1 += A.load(i,k) *
set( B(k,j) );
8436 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
// Scalar loop over the remaining rows (only when remainder is true): element-wise access
// instead of SIMD loads/stores, columns in pairs and then a single leftover column.
8440 for( ; remainder && i<M; ++i )
8442 const size_t jend( LOW ? i+1UL : N );
8443 size_t j( UPP ? i : 0UL );
8445 for( ; (j+2UL) <= jend; j+=2UL )
8447 const size_t kbegin( ( IsLower_v<MT5> )
8448 ?( ( IsUpper_v<MT4> )
8449 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8450 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8451 :( IsUpper_v<MT4> ? i : 0UL ) );
8452 const size_t kend( ( IsUpper_v<MT5> )
8453 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8459 for(
size_t k=kbegin; k<kend; ++k ) {
8460 value1 += A(i,k) * B(k,j );
8461 value2 += A(i,k) * B(k,j+1UL);
8464 C(i,j ) -= value1 * scalar;
8465 C(i,j+1UL) -= value2 * scalar;
8470 const size_t kbegin( ( IsLower_v<MT5> )
8471 ?( ( IsUpper_v<MT4> )
8472 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8473 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8474 :( IsUpper_v<MT4> ? i : 0UL ) );
8478 for(
size_t k=kbegin; k<K; ++k ) {
8479 value += A(i,k) * B(k,j);
8482 C(i,j) -= value * scalar;
// Large-matrix kernel for C -= A * B * scalar, fallback overload.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t); it
// simply forwards all arguments to selectDefaultSubAssignKernel().
8502 template<
typename MT3
8506 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8507 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8509 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix kernel for C -= A * B * scalar, overload enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// Delegates to one of the lmmm()/ummm()/mmm() kernels. The subtraction is expressed by
// passing the negated scalar as the product factor together with ST2(1) as the last argument
// (presumably the factor applied to the existing content of C - confirm against the kernel
// documentation).
// NOTE(review): the conditions selecting between the three calls are elided from this
// listing; going by the names they probably depend on the lower/upper flags - confirm.
8528 template<
typename MT3
8532 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8533 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8536 lmmm( C, A, B, -scalar, ST2(1) );
8538 ummm( C, A, B, -scalar, ST2(1) );
8540 mmm( C, A, B, -scalar, ST2(1) );
// BLAS-based kernel for C -= A * B * scalar, fallback overload.
// Enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t); it forwards all
// arguments to selectLargeSubAssignKernel().
8559 template<
typename MT3
8563 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8564 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8566 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-based kernel for C -= A * B * scalar, overload enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. Only compiled if both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set.
// Three paths are visible:
//   - A triangular: B is copied into a temporary, multiplied in place from the left via
//     trmm() with factor ET(scalar), and the temporary is subtracted with subAssign().
//   - B triangular: same scheme with A copied and B applied from the right.
//   - otherwise: gemm() on C directly, with ET(-scalar) as product factor and ET(1) as the
//     last argument, so no temporary is created.
// Note the sign convention: the trmm paths scale by +scalar because the subtraction happens
// in subAssign(); the gemm path folds the subtraction into the negated factor.
8571 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8585 template<
typename MT3
8589 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8590 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8592 using ET = ElementType_t<MT3>;
8594 if( IsTriangular_v<MT4> ) {
8595 ResultType_t<MT3> tmp(
serial( B ) );
8596 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8597 subAssign( C, tmp );
8599 else if( IsTriangular_v<MT5> ) {
8600 ResultType_t<MT3> tmp(
serial( A ) );
8601 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8602 subAssign( C, tmp );
8605 gemm( C, A, B,
ET(-scalar),
ET(1) );
8625 template<
typename MT >
8627 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8636 const ForwardFunctor fwd;
8638 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8639 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8641 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8642 subAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
8643 else if( IsSymmetric_v<MT1> )
8644 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
8646 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Schur product assignment of a scaled matrix product (DMatScalarMultExpr) to the dense
// matrix lhs. The only visible statement forwards to schurAssign( ~lhs, tmp ).
// NOTE(review): the declaration of tmp is elided from this listing; presumably it is an
// evaluated copy of rhs - confirm against the complete header.
8666 template<
typename MT
8668 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8680 schurAssign( ~lhs, tmp );
8711 template<
typename MT
8714 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8721 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8722 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8724 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8727 else if( left.columns() == 0UL ) {
8761 template<
typename MT
8764 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8768 using TmpType = If_t< SO, ResultType, OppositeType >;
8780 const ForwardFunctor fwd;
8782 const TmpType tmp( rhs );
8801 template<
typename MT >
8803 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8812 const ForwardFunctor fwd;
8814 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8815 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8817 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8819 else if( IsSymmetric_v<MT1> )
8841 template<
typename MT
8844 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8851 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8852 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8854 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8887 template<
typename MT >
8889 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8898 const ForwardFunctor fwd;
8900 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8901 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8903 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8905 else if( IsSymmetric_v<MT1> )
8931 template<
typename MT
8934 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8941 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8942 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8944 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8977 template<
typename MT >
8979 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8988 const ForwardFunctor fwd;
8990 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8991 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8993 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8995 else if( IsSymmetric_v<MT1> )
9018 template<
typename MT
9098 template<
typename MT1
9100 inline decltype(
auto)
9110 return ReturnType( ~lhs, ~rhs );
// declsym() overload for a TDMatTDMatMultExpr: returns a new multiplication expression over
// the same two operands whose SF template flag is forced to true; HF, LF and UF are carried
// over unchanged.
// NOTE(review): lines between the signature and the ReturnType alias are elided from this
// listing; any checks performed there are not documented here.
9146 template<
typename MT1
9152 inline decltype(
auto)
declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9160 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9161 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declherm() overload for a TDMatTDMatMultExpr: returns a new multiplication expression over
// the same two operands whose HF template flag is forced to true; SF, LF and UF are carried
// over unchanged.
// NOTE(review): lines between the signature and the ReturnType alias are elided from this
// listing; any checks performed there are not documented here.
9190 template<
typename MT1
9196 inline decltype(
auto)
declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9204 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9205 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decllow() overload for a TDMatTDMatMultExpr: returns a new multiplication expression over
// the same two operands whose LF template flag is forced to true; SF, HF and UF are carried
// over unchanged.
// NOTE(review): lines between the signature and the ReturnType alias are elided from this
// listing; any checks performed there are not documented here.
9234 template<
typename MT1
9240 inline decltype(
auto)
decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9248 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9249 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declupp() overload for a TDMatTDMatMultExpr: returns a new multiplication expression over
// the same two operands whose UF template flag is forced to true; SF, HF and LF are carried
// over unchanged.
// NOTE(review): lines between the signature and the ReturnType alias are elided from this
// listing; any checks performed there are not documented here.
9278 template<
typename MT1
9284 inline decltype(
auto)
declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9292 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9293 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decldiag() overload for a TDMatTDMatMultExpr: returns a new multiplication expression over
// the same two operands with both the LF and the UF template flag forced to true (lower and
// upper at the same time); SF and HF are carried over unchanged.
// NOTE(review): lines between the signature and the ReturnType alias are elided from this
// listing; any checks performed there are not documented here.
9322 template<
typename MT1
9328 inline decltype(
auto)
decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9336 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9337 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for TDMatTDMatMultExpr, index 0UL: inherits the value of
// Size<MT1,0UL>, i.e. it is taken from the left-hand side operand type.
// (Index 0UL is presumably the row dimension - confirm against the Size trait.)
9353 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9354 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9355 :
public Size<MT1,0UL>
// Size specialization for TDMatTDMatMultExpr, index 1UL: inherits the value of
// Size<MT2,1UL>, i.e. it is taken from the right-hand side operand type.
// (Index 1UL is presumably the column dimension - confirm against the Size trait.)
9358 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9359 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9360 :
public Size<MT2,1UL>
// IsAligned specialization for TDMatTDMatMultExpr: the expression counts as aligned only if
// both operand types are aligned (IsAligned_v<MT1> && IsAligned_v<MT2>).
9376 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9377 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9378 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:495
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:483
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatTDMatMultExpr.h:307
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:299
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:287
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
Header file for the IsIntegral type trait.
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:329
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Expression object for transpose dense matrix-transpose dense matrix multiplications....
Definition: Forward.h:173
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:429
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:285
Header file for the IsComplexDouble type trait.
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:177
Constraint on the data type.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatTDMatMultExpr.h:320
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:170
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:439
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:463
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:496
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:293
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:419
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type,...
Definition: Symmetric.h:79
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:302
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:283
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:176
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatTDMatMultExpr.h:314
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:159
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:174
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:165
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:393
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:286
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:175
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:765
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:409
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:451
Header file for BLAS general matrix/matrix multiplication functions (gemm)
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:288
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:344
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
typename DisableIf< Condition, T >::Type DisableIf_t
Auxiliary type for the DisableIf class template. The DisableIf_t alias declaration provides a convenie...
Definition: DisableIf.h:138
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:473
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:296
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
Header file for the IsResizable type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:290
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
Header file for the DeclSym functor.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:160
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:289
Header file for the IsExpression type trait class.
Header file for the function trace functionality.