35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_ 144 template<
typename MT1
150 class TDMatTDMatMultExpr
151 :
public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
152 ,
private Computation
// Compile-time switch: true when MT1 (presumably the type of the left-hand
// operand lhs_) satisfies IsComputation_v or RequiresEvaluation_v.
166 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Same test applied to MT2 (presumably the type of the right-hand operand rhs_).
171 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the class template flags SF, HF, LF and UF.
// NOTE(review): the template parameter list is not visible here; the flags
// presumably stand for symmetric / Hermitian / lower / upper - confirm.
//
// SYM: SF is set and none of HF, LF, UF is set.
175 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set (SF is not consulted).
176 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is set together with SF or HF.
177 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is set together with SF or HF.
// Consequence of the two definitions above: combining SF or HF with either
// LF or UF makes both LOW and UPP true.
178 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Compile-time predicate over three matrix types: true when T1 is a row-major
// matrix and at least one of T2, T3 is symmetric.
// Elsewhere in this file it is instantiated as CanExploitSymmetry_v<MT,MT1,MT2>
// inside EnableIf_t / DisableIf_t return types to choose between overloads.
188 template<
typename T1,
typename T2,
typename T3 >
189 static constexpr
bool CanExploitSymmetry_v =
190 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
200 template<
typename T1,
typename T2,
typename T3 >
201 static constexpr
bool IsEvaluationRequired_v =
211 template<
typename T1,
typename T2,
typename T3 >
212 static constexpr
bool UseBlasKernel_v =
215 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
216 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
217 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
218 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
219 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
220 IsBLASCompatible_v< ElementType_t<T1> > &&
221 IsBLASCompatible_v< ElementType_t<T2> > &&
222 IsBLASCompatible_v< ElementType_t<T3> > &&
233 template<
typename T1,
typename T2,
typename T3 >
234 static constexpr
bool UseVectorizedDefaultKernel_v =
235 ( useOptimizedKernels &&
236 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
237 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
238 IsSIMDCombinable_v< ElementType_t<T1>
309 ( !IsDiagonal_v<MT1> &&
310 MT1::simdEnabled && MT2::simdEnabled &&
311 HasSIMDAdd_v<ET1,ET2> &&
312 HasSIMDMult_v<ET1,ET2> );
349 if( IsDiagonal_v<MT1> ) {
352 else if( IsDiagonal_v<MT2> ) {
355 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
356 const size_t begin( ( IsUpper_v<MT1> )
357 ?( ( IsLower_v<MT2> )
358 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
359 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
360 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
361 :( ( IsLower_v<MT2> )
362 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
364 const size_t end( ( IsLower_v<MT1> )
365 ?( ( IsUpper_v<MT2> )
366 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
367 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
368 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
369 :( ( IsUpper_v<MT2> )
370 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
371 :(
lhs_.columns() ) ) );
395 if( i >=
lhs_.rows() ) {
398 if( j >=
rhs_.columns() ) {
410 inline size_t rows() const noexcept {
421 return rhs_.columns();
451 template<
typename T >
452 inline bool canAlias(
const T* alias )
const noexcept {
453 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
463 template<
typename T >
464 inline bool isAliased(
const T* alias )
const noexcept {
465 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
475 return lhs_.isAligned() &&
rhs_.isAligned();
486 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
488 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
489 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
490 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
513 template<
typename MT
523 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
526 else if( rhs.lhs_.columns() == 0UL ) {
541 TDMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
557 template<
typename MT3
560 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
562 if( ( IsDiagonal_v<MT4> ) ||
563 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
564 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
565 selectSmallAssignKernel( C, A, B );
567 selectBlasAssignKernel( C, A, B );
586 template<
typename MT3
589 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
590 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
592 const size_t M( A.rows() );
593 const size_t N( B.columns() );
594 const size_t K( A.columns() );
598 for(
size_t j=0UL; j<N; ++j )
600 const size_t kbegin( ( IsLower_v<MT5> )
601 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
603 const size_t kend( ( IsUpper_v<MT5> )
604 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
608 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
609 for(
size_t i=0UL; i<M; ++i ) {
616 const size_t ibegin( ( IsLower_v<MT4> )
617 ?( ( IsStrictlyLower_v<MT4> )
618 ?(
LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
619 :(
LOW ?
max(j,kbegin) : kbegin ) )
620 :(
LOW ? j : 0UL ) );
621 const size_t iend( ( IsUpper_v<MT4> )
622 ?( ( IsStrictlyUpper_v<MT4> )
623 ?(
UPP ?
min(j+1UL,kbegin) : kbegin )
624 :(
UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
625 :(
UPP ? j+1UL : M ) );
627 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
628 for(
size_t i=0UL; i<ibegin; ++i ) {
632 else if( IsStrictlyLower_v<MT4> ) {
635 for(
size_t i=ibegin; i<iend; ++i ) {
636 C(i,j) = A(i,kbegin) * B(kbegin,j);
638 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
639 for(
size_t i=iend; i<M; ++i ) {
643 else if( IsStrictlyUpper_v<MT4> ) {
648 for(
size_t k=kbegin+1UL; k<kend; ++k )
650 const size_t ibegin( ( IsLower_v<MT4> )
651 ?( ( IsStrictlyLower_v<MT4> )
655 const size_t iend( ( IsUpper_v<MT4> )
656 ?( ( IsStrictlyUpper_v<MT4> )
657 ?(
UPP ?
min(j+1UL,k-1UL) : k-1UL )
658 :(
UPP ?
min(j+1UL,k) : k ) )
659 :(
UPP ? j+1UL : M ) );
661 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
664 for(
size_t i=ibegin; i<iend; ++i ) {
665 C(i,j) += A(i,k) * B(k,j);
667 if( IsUpper_v<MT4> ) {
668 C(iend,j) = A(iend,k) * B(k,j);
674 for(
size_t j=1UL; j<N; ++j ) {
675 for(
size_t i=0UL; i<j; ++i ) {
676 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
698 template<
typename MT3
701 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
702 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
706 const size_t M( A.rows() );
707 const size_t N( B.columns() );
709 for(
size_t j=0UL; j<N; ++j )
711 const size_t ibegin( ( IsLower_v<MT4> )
712 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
714 const size_t iend( ( IsUpper_v<MT4> )
715 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
719 if( IsLower_v<MT4> ) {
720 for(
size_t i=0UL; i<ibegin; ++i ) {
724 for(
size_t i=ibegin; i<iend; ++i ) {
725 C(i,j) = A(i,j) * B(j,j);
727 if( IsUpper_v<MT4> ) {
728 for(
size_t i=iend; i<M; ++i ) {
751 template<
typename MT3
754 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
755 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
759 const size_t M( A.rows() );
760 const size_t N( B.columns() );
762 for(
size_t j=0UL; j<N; ++j )
764 const size_t ibegin( ( IsLower_v<MT5> )
765 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
767 const size_t iend( ( IsUpper_v<MT5> )
768 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
772 if( IsLower_v<MT4> ) {
773 for(
size_t i=0UL; i<ibegin; ++i ) {
777 for(
size_t i=ibegin; i<iend; ++i ) {
778 C(i,j) = A(i,i) * B(i,j);
780 if( IsUpper_v<MT4> ) {
781 for(
size_t i=iend; i<M; ++i ) {
804 template<
typename MT3
807 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
808 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
814 for(
size_t i=0UL; i<A.rows(); ++i ) {
815 C(i,i) = A(i,i) * B(i,i);
835 template<
typename MT3
838 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
839 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
841 selectDefaultAssignKernel( C, A, B );
861 template<
typename MT3
864 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
865 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
872 const ForwardFunctor fwd;
874 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
875 const OppositeType_t<MT5> tmp(
serial( B ) );
876 assign( C, fwd( A * tmp ) );
878 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
879 const OppositeType_t<MT4> tmp(
serial( A ) );
880 assign( C, fwd( tmp * B ) );
882 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
883 const OppositeType_t<MT5> tmp(
serial( B ) );
884 assign( C, fwd( A * tmp ) );
887 const OppositeType_t<MT4> tmp(
serial( A ) );
888 assign( C, fwd( tmp * B ) );
909 template<
typename MT3
912 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
913 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
915 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
917 const size_t M( A.rows() );
918 const size_t N( B.columns() );
919 const size_t K( A.columns() );
923 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
933 if( IsIntegral_v<ElementType> )
936 for(
size_t j=0UL; j<N; ++j )
938 const size_t kbegin( ( IsLower_v<MT5> )
939 ?( ( IsUpper_v<MT4> )
940 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
941 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
942 :( IsUpper_v<MT4> ? i : 0UL ) );
943 const size_t kend( ( IsUpper_v<MT5> )
944 ?( ( IsLower_v<MT4> )
945 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
946 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
947 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
949 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
951 for(
size_t k=kbegin; k<kend; ++k ) {
953 xmm1 += A.load(i ,k) * b1;
955 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
956 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
957 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
958 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
959 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
960 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
963 C.store( i , j, xmm1 );
979 for( ; (j+2UL) <= N; j+=2UL )
981 const size_t kbegin( ( IsLower_v<MT5> )
982 ?( ( IsUpper_v<MT4> )
983 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
984 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
985 :( IsUpper_v<MT4> ? i : 0UL ) );
986 const size_t kend( ( IsUpper_v<MT5> )
987 ?( ( IsLower_v<MT4> )
988 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
989 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
990 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
992 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
994 for(
size_t k=kbegin; k<kend; ++k ) {
1000 const SIMDType b1(
set( B(k,j ) ) );
1001 const SIMDType b2(
set( B(k,j+1UL) ) );
1014 C.store( i , j , xmm1 );
1016 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1017 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1018 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
1019 C.store( i , j+1UL, xmm6 );
1020 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
1021 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
1022 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
1023 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
1028 const size_t kbegin( ( IsLower_v<MT5> )
1029 ?( ( IsUpper_v<MT4> )
1030 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1031 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1032 :( IsUpper_v<MT4> ? i : 0UL ) );
1033 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
1035 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
1037 for(
size_t k=kbegin; k<kend; ++k ) {
1038 const SIMDType b1(
set( B(k,j) ) );
1039 xmm1 += A.load(i ,k) * b1;
1040 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1041 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1042 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1043 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1046 C.store( i , j, xmm1 );
1048 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1049 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1050 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
1057 size_t j(
UPP ? i : 0UL );
1059 for( ; (j+2UL) <= jend; j+=2UL )
1061 const size_t kbegin( ( IsLower_v<MT5> )
1062 ?( ( IsUpper_v<MT4> )
1063 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1064 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1065 :( IsUpper_v<MT4> ? i : 0UL ) );
1066 const size_t kend( ( IsUpper_v<MT5> )
1067 ?( ( IsLower_v<MT4> )
1068 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1069 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1070 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
1072 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1074 for(
size_t k=kbegin; k<kend; ++k ) {
1079 const SIMDType b1(
set( B(k,j ) ) );
1080 const SIMDType b2(
set( B(k,j+1UL) ) );
1091 C.store( i , j , xmm1 );
1093 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1094 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1095 C.store( i , j+1UL, xmm5 );
1096 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
1097 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
1098 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
1103 const size_t kbegin( ( IsLower_v<MT5> )
1104 ?( ( IsUpper_v<MT4> )
1105 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1106 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1107 :( IsUpper_v<MT4> ? i : 0UL ) );
1108 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
1112 for(
size_t k=kbegin; k<kend; ++k ) {
1113 const SIMDType b1(
set( B(k,j) ) );
1114 xmm1 += A.load(i ,k) * b1;
1115 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1116 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1117 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1120 C.store( i , j, xmm1 );
1122 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1123 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1130 size_t j(
UPP ? i : 0UL );
1132 for( ; (j+2UL) <= jend; j+=2UL )
1134 const size_t kbegin( ( IsLower_v<MT5> )
1135 ?( ( IsUpper_v<MT4> )
1136 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1137 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1138 :( IsUpper_v<MT4> ? i : 0UL ) );
1139 const size_t kend( ( IsUpper_v<MT5> )
1140 ?( ( IsLower_v<MT4> )
1141 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1142 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1143 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
1145 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1147 for(
size_t k=kbegin; k<kend; ++k ) {
1151 const SIMDType b1(
set( B(k,j ) ) );
1152 const SIMDType b2(
set( B(k,j+1UL) ) );
1161 C.store( i , j , xmm1 );
1163 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1164 C.store( i , j+1UL, xmm4 );
1165 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
1166 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
1171 const size_t kbegin( ( IsLower_v<MT5> )
1172 ?( ( IsUpper_v<MT4> )
1173 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1174 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1175 :( IsUpper_v<MT4> ? i : 0UL ) );
1176 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
1180 for(
size_t k=kbegin; k<kend; ++k ) {
1181 const SIMDType b1(
set( B(k,j) ) );
1182 xmm1 += A.load(i ,k) * b1;
1183 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1184 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1187 C.store( i , j, xmm1 );
1189 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1196 size_t j(
UPP ? i : 0UL );
1198 for( ; (j+4UL) <= jend; j+=4UL )
1200 const size_t kbegin( ( IsLower_v<MT5> )
1201 ?( ( IsUpper_v<MT4> )
1202 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1203 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1204 :( IsUpper_v<MT4> ? i : 0UL ) );
1205 const size_t kend( ( IsUpper_v<MT5> )
1206 ?( ( IsLower_v<MT4> )
1207 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
1208 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
1209 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1211 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1213 for(
size_t k=kbegin; k<kend; ++k ) {
1216 const SIMDType b1(
set( B(k,j ) ) );
1217 const SIMDType b2(
set( B(k,j+1UL) ) );
1218 const SIMDType b3(
set( B(k,j+2UL) ) );
1219 const SIMDType b4(
set( B(k,j+3UL) ) );
1230 C.store( i , j , xmm1 );
1232 C.store( i , j+1UL, xmm3 );
1233 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1234 C.store( i , j+2UL, xmm5 );
1235 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
1236 C.store( i , j+3UL, xmm7 );
1237 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
1240 for( ; (j+3UL) <= jend; j+=3UL )
1242 const size_t kbegin( ( IsLower_v<MT5> )
1243 ?( ( IsUpper_v<MT4> )
1244 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1245 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1246 :( IsUpper_v<MT4> ? i : 0UL ) );
1247 const size_t kend( ( IsUpper_v<MT5> )
1248 ?( ( IsLower_v<MT4> )
1249 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
1250 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
1251 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1253 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1255 for(
size_t k=kbegin; k<kend; ++k ) {
1258 const SIMDType b1(
set( B(k,j ) ) );
1259 const SIMDType b2(
set( B(k,j+1UL) ) );
1260 const SIMDType b3(
set( B(k,j+2UL) ) );
1269 C.store( i , j , xmm1 );
1271 C.store( i , j+1UL, xmm3 );
1272 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1273 C.store( i , j+2UL, xmm5 );
1274 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
1277 for( ; (j+2UL) <= jend; j+=2UL )
1279 const size_t kbegin( ( IsLower_v<MT5> )
1280 ?( ( IsUpper_v<MT4> )
1281 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1282 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1283 :( IsUpper_v<MT4> ? i : 0UL ) );
1284 const size_t kend( ( IsUpper_v<MT5> )
1285 ?( ( IsLower_v<MT4> )
1286 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1287 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1288 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1290 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1293 for( ; (k+2UL) <= kend; k+=2UL ) {
1294 const SIMDType a1( A.load(i ,k ) );
1296 const SIMDType a3( A.load(i ,k+1UL) );
1298 const SIMDType b1(
set( B(k ,j ) ) );
1299 const SIMDType b2(
set( B(k ,j+1UL) ) );
1300 const SIMDType b3(
set( B(k+1UL,j ) ) );
1301 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
1312 for( ; k<kend; ++k ) {
1315 const SIMDType b1(
set( B(k,j ) ) );
1316 const SIMDType b2(
set( B(k,j+1UL) ) );
1323 C.store( i , j , xmm1+xmm5 );
1324 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
1325 C.store( i , j+1UL, xmm3+xmm7 );
1326 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
1331 const size_t kbegin( ( IsLower_v<MT5> )
1332 ?( ( IsUpper_v<MT4> )
1333 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1334 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1335 :( IsUpper_v<MT4> ? i : 0UL ) );
1336 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
1341 for( ; (k+2UL) <= kend; k+=2UL ) {
1342 const SIMDType b1(
set( B(k ,j) ) );
1343 const SIMDType b2(
set( B(k+1UL,j) ) );
1344 xmm1 += A.load(i ,k ) * b1;
1345 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
1346 xmm3 += A.load(i ,k+1UL) * b2;
1347 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
1350 for( ; k<kend; ++k ) {
1351 const SIMDType b1(
set( B(k,j) ) );
1352 xmm1 += A.load(i ,k) * b1;
1356 C.store( i , j, xmm1+xmm3 );
1357 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
1364 size_t j(
UPP ? i : 0UL );
1366 for( ; (j+4UL) <= jend; j+=4UL )
1368 const size_t kbegin( ( IsLower_v<MT5> )
1369 ?( ( IsUpper_v<MT4> )
1370 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1371 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1372 :( IsUpper_v<MT4> ? i : 0UL ) );
1373 const size_t kend( ( IsUpper_v<MT5> )
1374 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
1377 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1380 for( ; (k+2UL) <= kend; k+=2UL ) {
1382 const SIMDType a2( A.load(i,k+1UL) );
1383 xmm1 += a1 *
set( B(k ,j ) );
1384 xmm2 += a1 *
set( B(k ,j+1UL) );
1385 xmm3 += a1 *
set( B(k ,j+2UL) );
1386 xmm4 += a1 *
set( B(k ,j+3UL) );
1387 xmm5 += a2 *
set( B(k+1UL,j ) );
1388 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
1389 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
1390 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
1393 for( ; k<kend; ++k ) {
1395 xmm1 += a1 *
set( B(k,j ) );
1396 xmm2 += a1 *
set( B(k,j+1UL) );
1397 xmm3 += a1 *
set( B(k,j+2UL) );
1398 xmm4 += a1 *
set( B(k,j+3UL) );
1401 C.store( i, j , xmm1+xmm5 );
1402 C.store( i, j+1UL, xmm2+xmm6 );
1403 C.store( i, j+2UL, xmm3+xmm7 );
1404 C.store( i, j+3UL, xmm4+xmm8 );
1407 for( ; (j+3UL) <= jend; j+=3UL )
1409 const size_t kbegin( ( IsLower_v<MT5> )
1410 ?( ( IsUpper_v<MT4> )
1411 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1412 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1413 :( IsUpper_v<MT4> ? i : 0UL ) );
1414 const size_t kend( ( IsUpper_v<MT5> )
1415 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
1418 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1421 for( ; (k+2UL) <= kend; k+=2UL ) {
1423 const SIMDType a2( A.load(i,k+1UL) );
1424 xmm1 += a1 *
set( B(k ,j ) );
1425 xmm2 += a1 *
set( B(k ,j+1UL) );
1426 xmm3 += a1 *
set( B(k ,j+2UL) );
1427 xmm4 += a2 *
set( B(k+1UL,j ) );
1428 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
1429 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
1432 for( ; k<kend; ++k ) {
1434 xmm1 += a1 *
set( B(k,j ) );
1435 xmm2 += a1 *
set( B(k,j+1UL) );
1436 xmm3 += a1 *
set( B(k,j+2UL) );
1439 C.store( i, j , xmm1+xmm4 );
1440 C.store( i, j+1UL, xmm2+xmm5 );
1441 C.store( i, j+2UL, xmm3+xmm6 );
1444 for( ; (j+2UL) <= jend; j+=2UL )
1446 const size_t kbegin( ( IsLower_v<MT5> )
1447 ?( ( IsUpper_v<MT4> )
1448 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1449 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1450 :( IsUpper_v<MT4> ? i : 0UL ) );
1451 const size_t kend( ( IsUpper_v<MT5> )
1452 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1458 for( ; (k+2UL) <= kend; k+=2UL ) {
1460 const SIMDType a2( A.load(i,k+1UL) );
1461 xmm1 += a1 *
set( B(k ,j ) );
1462 xmm2 += a1 *
set( B(k ,j+1UL) );
1463 xmm3 += a2 *
set( B(k+1UL,j ) );
1464 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
1467 for( ; k<kend; ++k ) {
1469 xmm1 += a1 *
set( B(k,j ) );
1470 xmm2 += a1 *
set( B(k,j+1UL) );
1473 C.store( i, j , xmm1+xmm3 );
1474 C.store( i, j+1UL, xmm2+xmm4 );
1479 const size_t kbegin( ( IsLower_v<MT5> )
1480 ?( ( IsUpper_v<MT4> )
1481 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1482 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1483 :( IsUpper_v<MT4> ? i : 0UL ) );
1488 for( ; (k+2UL) <= K; k+=2UL ) {
1489 xmm1 += A.load(i,k ) *
set( B(k ,j) );
1490 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
1494 xmm1 += A.load(i,k) *
set( B(k,j) );
1497 C.store( i, j, xmm1+xmm2 );
1501 for( ; remainder && i<M; ++i )
1503 size_t j(
LOW &&
UPP ? i : 0UL );
1505 for( ; (j+2UL) <= N; j+=2UL )
1507 const size_t kbegin( ( IsLower_v<MT5> )
1508 ?( ( IsUpper_v<MT4> )
1509 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1510 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1511 :( IsUpper_v<MT4> ? i : 0UL ) );
1512 const size_t kend( ( IsUpper_v<MT5> )
1513 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1519 for(
size_t k=kbegin; k<kend; ++k ) {
1520 value1 += A(i,k) * B(k,j );
1521 value2 += A(i,k) * B(k,j+1UL);
1525 C(i,j+1UL) = value2;
1530 const size_t kbegin( ( IsLower_v<MT5> )
1531 ?( ( IsUpper_v<MT4> )
1532 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1533 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1534 :( IsUpper_v<MT4> ? i : 0UL ) );
1538 for(
size_t k=kbegin; k<K; ++k ) {
1539 value += A(i,k) * B(k,j);
1548 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
1550 for(
size_t i=0UL; i<iend; ++i ) {
1551 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1556 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
1558 for(
size_t i=0UL; i<iend; ++i ) {
1564 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
1566 for(
size_t j=0UL; j<jend; ++j ) {
1589 template<
typename MT3
1592 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1593 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1595 selectDefaultAssignKernel( C, A, B );
1615 template<
typename MT3
1618 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1619 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1649 template<
typename MT3
1652 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1653 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1655 selectLargeAssignKernel( C, A, B );
1661 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1675 template<
typename MT3
1678 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1679 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1681 using ET = ElementType_t<MT3>;
1683 if( IsTriangular_v<MT4> ) {
1685 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1687 else if( IsTriangular_v<MT5> ) {
1689 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1692 gemm( C, A, B, ET(1), ET(0) );
1712 template<
typename MT
1715 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1719 using TmpType = If_t< SO, ResultType, OppositeType >;
1731 const ForwardFunctor fwd;
1733 const TmpType tmp(
serial( rhs ) );
1734 assign( ~lhs, fwd( tmp ) );
1754 template<
typename MT >
1756 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1765 const ForwardFunctor fwd;
1767 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1768 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
1769 else if( IsSymmetric_v<MT1> )
1770 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
1772 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
1790 template<
typename MT
1792 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
1793 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1800 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1814 TDMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
1830 template<
typename MT3
1833 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1835 if( ( IsDiagonal_v<MT4> ) ||
1836 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
1837 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
1838 selectSmallAddAssignKernel( C, A, B );
1840 selectBlasAddAssignKernel( C, A, B );
1859 template<
typename MT3
1862 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1863 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1865 const size_t M( A.rows() );
1866 const size_t N( B.columns() );
1867 const size_t K( A.columns() );
1871 for(
size_t j=0UL; j<N; ++j )
1873 const size_t kbegin( ( IsLower_v<MT5> )
1874 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1876 const size_t kend( ( IsUpper_v<MT5> )
1877 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1881 for(
size_t k=kbegin; k<kend; ++k )
1883 const size_t ibegin( ( IsLower_v<MT4> )
1884 ?( ( IsStrictlyLower_v<MT4> )
1885 ?(
LOW ?
max(j,k+1UL) : k+1UL )
1886 :(
LOW ?
max(j,k) : k ) )
1887 :(
LOW ? j : 0UL ) );
1888 const size_t iend( ( IsUpper_v<MT4> )
1889 ?( ( IsStrictlyUpper_v<MT4> )
1890 ?(
UPP ?
min(j+1UL,k) : k )
1891 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
1892 :(
UPP ? j+1UL : M ) );
1894 if( (
LOW ||
UPP ) && ibegin >= iend )
continue;
1897 const size_t inum( iend - ibegin );
1898 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1900 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1901 C(i ,j) += A(i ,k) * B(k,j);
1902 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1905 C(ipos,j) += A(ipos,k) * B(k,j);
1927 template<
typename MT3
1930 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1931 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1935 const size_t M( A.rows() );
1936 const size_t N( B.columns() );
1938 for(
size_t j=0UL; j<N; ++j )
1940 const size_t ibegin( ( IsLower_v<MT4> )
1941 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
1943 const size_t iend( ( IsUpper_v<MT4> )
1944 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
1948 const size_t inum( iend - ibegin );
1949 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1951 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1952 C(i ,j) += A(i ,j) * B(j,j);
1953 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
1956 C(ipos,j) += A(ipos,j) * B(j,j);
1977 template<
typename MT3
1980 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1981 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1985 const size_t M( A.rows() );
1986 const size_t N( B.columns() );
1988 for(
size_t j=0UL; j<N; ++j )
1990 const size_t ibegin( ( IsLower_v<MT5> )
1991 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1993 const size_t iend( ( IsUpper_v<MT5> )
1994 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
1998 const size_t inum( iend - ibegin );
1999 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2001 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2002 C(i ,j) += A(i ,i ) * B(i ,j);
2003 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2006 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2027 template<
typename MT3
2030 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2031 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2035 for(
size_t i=0UL; i<A.rows(); ++i ) {
2036 C(i,i) += A(i,i) * B(i,i);
2056 template<
typename MT3
2059 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2060 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2062 selectDefaultAddAssignKernel( C, A, B );
2082 template<
typename MT3
2085 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2086 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2093 const ForwardFunctor fwd;
2095 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2096 const OppositeType_t<MT5> tmp(
serial( B ) );
2097 addAssign( C, fwd( A * tmp ) );
2099 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2100 const OppositeType_t<MT4> tmp(
serial( A ) );
2101 addAssign( C, fwd( tmp * B ) );
2103 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2104 const OppositeType_t<MT5> tmp(
serial( B ) );
2105 addAssign( C, fwd( A * tmp ) );
2108 const OppositeType_t<MT4> tmp(
serial( A ) );
2109 addAssign( C, fwd( tmp * B ) );
// Vectorized small-matrix addition-assignment kernel for a column-major
// target C (C += A * B).
//   C : target matrix; accumulators are loaded from and stored back to C
//   A : left-hand side operand, read with SIMD loads along its rows index
//   B : right-hand side operand, read element-wise and broadcast via set()
// The rows of C are processed in strips of several SIMD vectors (8, 5, 4, 3,
// 2 and 1 times SIMDSIZE in the visible code), each strip handling one, two,
// three or four columns at a time. The k ranges are narrowed with the
// IsLower_v/IsUpper_v traits of A (MT4) and B (MT5).
2130 template<
typename MT3
2133 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2134 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// True unless both C (MT3) and A (MT4) are padded.
2136 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2138 const size_t M( A.rows() );
2139 const size_t N( B.columns() );
2140 const size_t K( A.columns() );
// End of the SIMD-processed row range: M masked with -SIMDSIZE when a remainder is
// possible (rounds down to a SIMDSIZE multiple, assuming SIMDSIZE is a power of two).
2144 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Guards the strip of eight SIMD vectors that follows.
2149 if( IsIntegral_v<ElementType> )
2152 for(
size_t j=0UL; j<N; ++j )
// k range for column j, clipped by the triangular structure of A and B.
2154 const size_t kbegin( ( IsLower_v<MT5> )
2155 ?( ( IsUpper_v<MT4> )
2156 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2157 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2158 :( IsUpper_v<MT4> ? i : 0UL ) );
2159 const size_t kend( ( IsUpper_v<MT5> )
2160 ?( ( IsLower_v<MT4> )
2161 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2162 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2163 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Eight accumulators, one per SIMD vector of the strip; B(k,j) is broadcast once per k.
2174 for(
size_t k=kbegin; k<kend; ++k ) {
2175 const SIMDType b1(
set( B(k,j) ) );
2176 xmm1 += A.load(i ,k) * b1;
2177 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2178 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2179 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2180 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2181 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
2182 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
2183 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the accumulated strip back to column j of C.
2186 C.store( i , j, xmm1 );
2188 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2189 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2190 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
2191 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
2192 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
2193 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Strip of five SIMD vectors, two columns (j, j+1) per iteration.
2202 for( ; (j+2UL) <= N; j+=2UL )
2204 const size_t kbegin( ( IsLower_v<MT5> )
2205 ?( ( IsUpper_v<MT4> )
2206 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2207 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2208 :( IsUpper_v<MT4> ? i : 0UL ) );
2209 const size_t kend( ( IsUpper_v<MT5> )
2210 ?( ( IsLower_v<MT4> )
2211 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2212 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2213 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
2220 SIMDType xmm6 ( C.load(i ,j+1UL) );
2226 for(
size_t k=kbegin; k<kend; ++k ) {
2232 const SIMDType b1(
set( B(k,j ) ) );
2233 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 go to column j, xmm6..xmm10 to column j+1.
2246 C.store( i , j , xmm1 );
2248 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2249 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2250 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
2251 C.store( i , j+1UL, xmm6 );
2252 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
2253 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
2254 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
2255 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Strip of five SIMD vectors, single column j.
2260 const size_t kbegin( ( IsLower_v<MT5> )
2261 ?( ( IsUpper_v<MT4> )
2262 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2263 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2264 :( IsUpper_v<MT4> ? i : 0UL ) );
2265 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
2273 for(
size_t k=kbegin; k<kend; ++k ) {
2274 const SIMDType b1(
set( B(k,j) ) );
2275 xmm1 += A.load(i ,k) * b1;
2276 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2277 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2278 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2279 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2282 C.store( i , j, xmm1 );
2284 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2285 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2286 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Strip of four SIMD vectors, two columns per iteration.
2294 for( ; (j+2UL) <= N; j+=2UL )
2296 const size_t kbegin( ( IsLower_v<MT5> )
2297 ?( ( IsUpper_v<MT4> )
2298 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2299 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2300 :( IsUpper_v<MT4> ? i : 0UL ) );
2301 const size_t kend( ( IsUpper_v<MT5> )
2302 ?( ( IsLower_v<MT4> )
2303 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2304 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2305 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
2316 for(
size_t k=kbegin; k<kend; ++k ) {
2321 const SIMDType b1(
set( B(k,j ) ) );
2322 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm4 go to column j, xmm5..xmm8 to column j+1.
2333 C.store( i , j , xmm1 );
2335 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2336 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2337 C.store( i , j+1UL, xmm5 );
2338 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
2339 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
2340 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Strip of four SIMD vectors, single column j.
2345 const size_t kbegin( ( IsLower_v<MT5> )
2346 ?( ( IsUpper_v<MT4> )
2347 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2348 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2349 :( IsUpper_v<MT4> ? i : 0UL ) );
2350 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
2357 for(
size_t k=kbegin; k<kend; ++k ) {
2358 const SIMDType b1(
set( B(k,j) ) );
2359 xmm1 += A.load(i ,k) * b1;
2360 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2361 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2362 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2365 C.store( i , j, xmm1 );
2367 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2368 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Strip of three SIMD vectors, two columns per iteration.
2376 for( ; (j+2UL) <= N; j+=2UL )
2378 const size_t kbegin( ( IsLower_v<MT5> )
2379 ?( ( IsUpper_v<MT4> )
2380 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2381 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2382 :( IsUpper_v<MT4> ? i : 0UL ) );
2383 const size_t kend( ( IsUpper_v<MT5> )
2384 ?( ( IsLower_v<MT4> )
2385 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2386 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2387 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
2396 for(
size_t k=kbegin; k<kend; ++k ) {
2400 const SIMDType b1(
set( B(k,j ) ) );
2401 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm3 go to column j, xmm4..xmm6 to column j+1.
2410 C.store( i , j , xmm1 );
2412 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2413 C.store( i , j+1UL, xmm4 );
2414 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
2415 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Strip of three SIMD vectors, single column j.
2420 const size_t kbegin( ( IsLower_v<MT5> )
2421 ?( ( IsUpper_v<MT4> )
2422 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2423 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2424 :( IsUpper_v<MT4> ? i : 0UL ) );
2425 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
2431 for(
size_t k=kbegin; k<kend; ++k ) {
2432 const SIMDType b1(
set( B(k,j) ) );
2433 xmm1 += A.load(i ,k) * b1;
2434 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2435 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2438 C.store( i , j, xmm1 );
2440 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Strip of two SIMD vectors. The column range starts at i when UPP is set.
2447 size_t j(
UPP ? i : 0UL );
// Four columns per iteration.
2449 for( ; (j+4UL) <= jend; j+=4UL )
2451 const size_t kbegin( ( IsLower_v<MT5> )
2452 ?( ( IsUpper_v<MT4> )
2453 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2454 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2455 :( IsUpper_v<MT4> ? i : 0UL ) );
2456 const size_t kend( ( IsUpper_v<MT5> )
2457 ?( ( IsLower_v<MT4> )
2458 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2459 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2460 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2471 for(
size_t k=kbegin; k<kend; ++k ) {
2474 const SIMDType b1(
set( B(k,j ) ) );
2475 const SIMDType b2(
set( B(k,j+1UL) ) );
2476 const SIMDType b3(
set( B(k,j+2UL) ) );
2477 const SIMDType b4(
set( B(k,j+3UL) ) );
2488 C.store( i , j , xmm1 );
2490 C.store( i , j+1UL, xmm3 );
2491 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2492 C.store( i , j+2UL, xmm5 );
2493 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2494 C.store( i , j+3UL, xmm7 );
2495 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
2498 for( ; (j+3UL) <= jend; j+=3UL )
2500 const size_t kbegin( ( IsLower_v<MT5> )
2501 ?( ( IsUpper_v<MT4> )
2502 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2503 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2504 :( IsUpper_v<MT4> ? i : 0UL ) );
2505 const size_t kend( ( IsUpper_v<MT5> )
2506 ?( ( IsLower_v<MT4> )
2507 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2508 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2509 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2518 for(
size_t k=kbegin; k<kend; ++k ) {
2521 const SIMDType b1(
set( B(k,j ) ) );
2522 const SIMDType b2(
set( B(k,j+1UL) ) );
2523 const SIMDType b3(
set( B(k,j+2UL) ) );
2532 C.store( i , j , xmm1 );
2534 C.store( i , j+1UL, xmm3 );
2535 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2536 C.store( i , j+2UL, xmm5 );
2537 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration, with the k loop additionally unrolled by two.
2540 for( ; (j+2UL) <= jend; j+=2UL )
2542 const size_t kbegin( ( IsLower_v<MT5> )
2543 ?( ( IsUpper_v<MT4> )
2544 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2545 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2546 :( IsUpper_v<MT4> ? i : 0UL ) );
2547 const size_t kend( ( IsUpper_v<MT5> )
2548 ?( ( IsLower_v<MT4> )
2549 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2550 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2551 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Unrolled by two in k. The bound is `<=` so that a trailing pair (k, k+1) with
// k+2 == kend still takes the unrolled path instead of the one-step loop below.
2560 for( ; (k+2UL) <= kend; k+=2UL ) {
2561 const SIMDType a1( A.load(i ,k ) );
2563 const SIMDType a3( A.load(i ,k+1UL) );
2565 const SIMDType b1(
set( B(k ,j ) ) );
2566 const SIMDType b2(
set( B(k ,j+1UL) ) );
2567 const SIMDType b3(
set( B(k+1UL,j ) ) );
2568 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// One-step loop for a leftover k.
2579 for( ; k<kend; ++k ) {
2582 const SIMDType b1(
set( B(k,j ) ) );
2583 const SIMDType b2(
set( B(k,j+1UL) ) );
// The two accumulator sets of the unrolled loop are summed on store.
2590 C.store( i , j , xmm1+xmm5 );
2591 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
2592 C.store( i , j+1UL, xmm3+xmm7 );
2593 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// Strip of two SIMD vectors, single column j, k unrolled by two.
2598 const size_t kbegin( ( IsLower_v<MT5> )
2599 ?( ( IsUpper_v<MT4> )
2600 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2601 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2602 :( IsUpper_v<MT4> ? i : 0UL ) );
2603 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
2610 for( ; (k+2UL) <= kend; k+=2UL ) {
2611 const SIMDType b1(
set( B(k ,j) ) );
2612 const SIMDType b2(
set( B(k+1UL,j) ) );
2613 xmm1 += A.load(i ,k ) * b1;
2614 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
2615 xmm3 += A.load(i ,k+1UL) * b2;
2616 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
2619 for( ; k<kend; ++k ) {
2620 const SIMDType b1(
set( B(k,j) ) );
2621 xmm1 += A.load(i ,k) * b1;
2625 C.store( i , j, xmm1+xmm3 );
2626 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Strip of a single SIMD vector. The column range starts at i when UPP is set.
2633 size_t j(
UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two.
2635 for( ; (j+4UL) <= jend; j+=4UL )
2637 const size_t kbegin( ( IsLower_v<MT5> )
2638 ?( ( IsUpper_v<MT4> )
2639 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2640 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2641 :( IsUpper_v<MT4> ? i : 0UL ) );
2642 const size_t kend( ( IsUpper_v<MT5> )
2643 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2653 for( ; (k+2UL) <= kend; k+=2UL ) {
2655 const SIMDType a2( A.load(i,k+1UL) );
2656 xmm1 += a1 *
set( B(k ,j ) );
2657 xmm2 += a1 *
set( B(k ,j+1UL) );
2658 xmm3 += a1 *
set( B(k ,j+2UL) );
2659 xmm4 += a1 *
set( B(k ,j+3UL) );
2660 xmm5 += a2 *
set( B(k+1UL,j ) );
2661 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
2662 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
2663 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
2666 for( ; k<kend; ++k ) {
2668 xmm1 += a1 *
set( B(k,j ) );
2669 xmm2 += a1 *
set( B(k,j+1UL) );
2670 xmm3 += a1 *
set( B(k,j+2UL) );
2671 xmm4 += a1 *
set( B(k,j+3UL) );
2674 C.store( i, j , xmm1+xmm5 );
2675 C.store( i, j+1UL, xmm2+xmm6 );
2676 C.store( i, j+2UL, xmm3+xmm7 );
2677 C.store( i, j+3UL, xmm4+xmm8 );
// Three columns per iteration, k unrolled by two.
2680 for( ; (j+3UL) <= jend; j+=3UL )
2682 const size_t kbegin( ( IsLower_v<MT5> )
2683 ?( ( IsUpper_v<MT4> )
2684 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2685 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2686 :( IsUpper_v<MT4> ? i : 0UL ) );
2687 const size_t kend( ( IsUpper_v<MT5> )
2688 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2697 for( ; (k+2UL) <= kend; k+=2UL ) {
2699 const SIMDType a2( A.load(i,k+1UL) );
2700 xmm1 += a1 *
set( B(k ,j ) );
2701 xmm2 += a1 *
set( B(k ,j+1UL) );
2702 xmm3 += a1 *
set( B(k ,j+2UL) );
2703 xmm4 += a2 *
set( B(k+1UL,j ) );
2704 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
2705 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
2708 for( ; k<kend; ++k ) {
2710 xmm1 += a1 *
set( B(k,j ) );
2711 xmm2 += a1 *
set( B(k,j+1UL) );
2712 xmm3 += a1 *
set( B(k,j+2UL) );
2715 C.store( i, j , xmm1+xmm4 );
2716 C.store( i, j+1UL, xmm2+xmm5 );
2717 C.store( i, j+2UL, xmm3+xmm6 );
// Two columns per iteration, k unrolled by two.
2720 for( ; (j+2UL) <= jend; j+=2UL )
2722 const size_t kbegin( ( IsLower_v<MT5> )
2723 ?( ( IsUpper_v<MT4> )
2724 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2725 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2726 :( IsUpper_v<MT4> ? i : 0UL ) );
2727 const size_t kend( ( IsUpper_v<MT5> )
2728 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2736 for( ; (k+2UL) <= kend; k+=2UL ) {
2738 const SIMDType a2( A.load(i,k+1UL) );
2739 xmm1 += a1 *
set( B(k ,j ) );
2740 xmm2 += a1 *
set( B(k ,j+1UL) );
2741 xmm3 += a2 *
set( B(k+1UL,j ) );
2742 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
2745 for( ; k<kend; ++k ) {
2747 xmm1 += a1 *
set( B(k,j ) );
2748 xmm2 += a1 *
set( B(k,j+1UL) );
2751 C.store( i, j , xmm1+xmm3 );
2752 C.store( i, j+1UL, xmm2+xmm4 );
// Single column j; the k loop runs up to K here.
2757 const size_t kbegin( ( IsLower_v<MT5> )
2758 ?( ( IsUpper_v<MT4> )
2759 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2760 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2761 :( IsUpper_v<MT4> ? i : 0UL ) );
2767 for( ; (k+2UL) <= K; k+=2UL ) {
2768 xmm1 += A.load(i,k ) *
set( B(k ,j) );
2769 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
2773 xmm1 += A.load(i,k) *
set( B(k,j) );
2776 C.store( i, j, xmm1+xmm2 );
// Scalar handling of the rows that remain when `remainder` is true.
2780 for( ; remainder && i<M; ++i )
2782 const size_t jend(
LOW ? i+1UL : N );
2783 size_t j(
UPP ? i : 0UL );
// Two columns per iteration with scalar accumulators.
2785 for( ; (j+2UL) <= jend; j+=2UL )
2787 const size_t kbegin( ( IsLower_v<MT5> )
2788 ?( ( IsUpper_v<MT4> )
2789 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2790 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2791 :( IsUpper_v<MT4> ? i : 0UL ) );
2792 const size_t kend( ( IsUpper_v<MT5> )
2793 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
2799 for(
size_t k=kbegin; k<kend; ++k ) {
2800 value1 += A(i,k) * B(k,j );
2801 value2 += A(i,k) * B(k,j+1UL);
2805 C(i,j+1UL) = value2;
// Single leftover column.
2810 const size_t kbegin( ( IsLower_v<MT5> )
2811 ?( ( IsUpper_v<MT4> )
2812 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2813 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2814 :( IsUpper_v<MT4> ? i : 0UL ) );
2818 for(
size_t k=kbegin; k<K; ++k ) {
2819 value += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel that is selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
2843 template<
typename MT3
2846 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2847 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// No vectorized variant applies: forward to the default add-assign kernel.
2849 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel that is selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the body of this overload is not visible in this excerpt.
2869 template<
typename MT3
2872 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2873 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS addition-assignment kernel fallback that is selected when
// UseBlasKernel_v<MT3,MT4,MT5> is false.
2899 template<
typename MT3
2902 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2903 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// BLAS is not applicable for these types: use the large-matrix kernel instead.
2905 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel (C += A * B); only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set and
// selected when UseBlasKernel_v<MT3,MT4,MT5> is true.
2911 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2925 template<
typename MT3
2928 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2929 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2931 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, multiply it by A from the left with
// trmm(), then add the temporary to C.
2933 if( IsTriangular_v<MT4> ) {
2934 ResultType_t<MT3> tmp(
serial( B ) );
2935 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2936 addAssign( C, tmp );
// Triangular B: copy A into a temporary, multiply it by B from the right with
// trmm(), then add the temporary to C.
2938 else if( IsTriangular_v<MT5> ) {
2939 ResultType_t<MT3> tmp(
serial( A ) );
2940 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2941 addAssign( C, tmp );
// General case: gemm() with both scaling factors set to ET(1)
// (presumably alpha and beta, i.e. C = A*B + C - confirm against the gemm wrapper).
2944 gemm( C, A, B, ET(1), ET(1) );
// Overload enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds. The product is
// rewritten with trans() applied to the symmetric operand(s) and the rewritten
// expression is passed on to addAssign().
// NOTE(review): the signature line of this overload is not visible in this excerpt.
2966 template<
typename MT >
2968 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Functor applied to each rewritten product before it is handed to addAssign().
2977 const ForwardFunctor fwd;
// Both operands symmetric: transpose both.
2979 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
2980 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand is symmetric: transpose the left operand.
2981 else if( IsSymmetric_v<MT1> )
2982 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: transpose the right operand.
2984 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Subtraction assignment of the matrix product to a dense matrix; this
// overload is disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
//   lhs : target dense matrix
//   rhs : the multiplication expression to be subtracted
3006 template<
typename MT
3008 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
3009 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Check for an empty target or an empty inner dimension
// (the body of this branch is not visible in this excerpt).
3016 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch to the kernel selection; A and B are set up in lines not shown here.
3030 TDMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Chooses between the small and the BLAS subtraction-assignment kernel.
//   C : target matrix
//   A : left-hand side operand
//   B : right-hand side operand
3046 template<
typename MT3
3049 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is used when A is diagonal, when A has at most
// SIMDSIZE*10UL rows (outside of debug mode), or when the element count of C
// is below TDMATTDMATMULT_THRESHOLD.
3051 if( ( IsDiagonal_v<MT4> ) ||
3052 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
3053 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3054 selectSmallSubAssignKernel( C, A, B );
// Otherwise the BLAS kernel selection is used.
3056 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel (C -= A * B) for the case in which
// neither operand is diagonal. For each column j and each k, a slice of
// column j of C is updated with A(i,k) * B(k,j).
3075 template<
typename MT3
3078 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3079 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3081 const size_t M( A.rows() );
3082 const size_t N( B.columns() );
3083 const size_t K( A.columns() );
3087 for(
size_t j=0UL; j<N; ++j )
// k range for column j, clipped by the triangular structure of B.
3089 const size_t kbegin( ( IsLower_v<MT5> )
3090 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3092 const size_t kend( ( IsUpper_v<MT5> )
3093 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3097 for(
size_t k=kbegin; k<kend; ++k )
// Row range for this (j,k) pair, clipped by the triangular structure of A
// and by the LOW/UPP flags of the expression.
3099 const size_t ibegin( ( IsLower_v<MT4> )
3100 ?( ( IsStrictlyLower_v<MT4> )
3101 ?(
LOW ?
max(j,k+1UL) : k+1UL )
3102 :(
LOW ?
max(j,k) : k ) )
3103 :(
LOW ? j : 0UL ) );
3104 const size_t iend( ( IsUpper_v<MT4> )
3105 ?( ( IsStrictlyUpper_v<MT4> )
3106 ?(
UPP ?
min(j+1UL,k) : k )
3107 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
3108 :(
UPP ? j+1UL : M ) );
// With LOW or UPP set the row range can be empty; skip it so that the
// unsigned subtraction below cannot wrap around.
3110 if( (
LOW ||
UPP ) && ( ibegin >= iend ) )
continue;
// ipos is ibegin plus the row count rounded down to an even number.
3113 const size_t inum( iend - ibegin );
3114 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3116 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3117 C(i ,j) -= A(i ,k) * B(k,j);
3118 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Row at index ipos (the condition guarding this statement is not visible here).
3121 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel for a non-diagonal A and a diagonal B:
// each column j of C is reduced by column j of A scaled with B(j,j).
3143 template<
typename MT3
3146 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3147 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3151 const size_t M( A.rows() );
3152 const size_t N( B.columns() );
3154 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, clipped by the triangular structure of A.
3156 const size_t ibegin( ( IsLower_v<MT4> )
3157 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3159 const size_t iend( ( IsUpper_v<MT4> )
3160 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos is ibegin plus the row count rounded down to an even number.
3164 const size_t inum( iend - ibegin );
3165 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3167 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3168 C(i ,j) -= A(i ,j) * B(j,j);
3169 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Row at index ipos (the condition guarding this statement is not visible here).
3172 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel for a diagonal A and a non-diagonal B:
// each element C(i,j) is reduced by A(i,i) * B(i,j).
3193 template<
typename MT3
3196 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3197 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3201 const size_t M( A.rows() );
3202 const size_t N( B.columns() );
3204 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, clipped by the triangular structure of B.
3206 const size_t ibegin( ( IsLower_v<MT5> )
3207 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3209 const size_t iend( ( IsUpper_v<MT5> )
3210 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos is ibegin plus the row count rounded down to an even number.
3214 const size_t inum( iend - ibegin );
3215 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3217 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3218 C(i ,j) -= A(i ,i ) * B(i ,j);
3219 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Row at index ipos (the condition guarding this statement is not visible here).
3222 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel for the case in which both operands
// are diagonal (see the EnableIf_t constraint below).
3243 template<
typename MT3
3246 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3247 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3251 for(
size_t i=0UL; i<A.rows(); ++i ) {
// Only the diagonal products are subtracted; off-diagonal elements of C are not touched.
3252 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel that is selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
3272 template<
typename MT3
3275 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3276 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// No vectorized variant applies: forward to the default sub-assign kernel.
3278 selectDefaultSubAssignKernel( C, A, B );
// Small-matrix subtraction-assignment kernel for a row-major target C when the
// vectorized default kernel is enabled. Instead of computing the product
// directly, one operand is copied into an OppositeType_t temporary and the
// resulting product expression is passed to subAssign().
3298 template<
typename MT3
3301 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3302 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Functor applied to each product expression before it is handed to subAssign().
3309 const ForwardFunctor fwd;
// Only A is resizable: the temporary is built from B.
3311 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3312 const OppositeType_t<MT5> tmp(
serial( B ) );
3313 subAssign( C, fwd( A * tmp ) );
// Only B is resizable: the temporary is built from A.
3315 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3316 const OppositeType_t<MT4> tmp(
serial( A ) );
3317 subAssign( C, fwd( tmp * B ) );
// Otherwise the operand with the smaller (or equal) element count is copied.
3319 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3320 const OppositeType_t<MT5> tmp(
serial( B ) );
3321 subAssign( C, fwd( A * tmp ) );
3324 const OppositeType_t<MT4> tmp(
serial( A ) );
3325 subAssign( C, fwd( tmp * B ) );
// Vectorized small-matrix subtraction-assignment kernel for a column-major
// target C (C -= A * B).
//   C : target matrix; accumulators are loaded from and stored back to C
//   A : left-hand side operand, read with SIMD loads along its rows index
//   B : right-hand side operand, read element-wise and broadcast via set()
// The rows of C are processed in strips of several SIMD vectors (8, 5, 4, 3,
// 2 and 1 times SIMDSIZE in the visible code), each strip handling one, two,
// three or four columns at a time. The k ranges are narrowed with the
// IsLower_v/IsUpper_v traits of A (MT4) and B (MT5).
3346 template<
typename MT3
3349 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3350 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// True unless both C (MT3) and A (MT4) are padded.
3352 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3354 const size_t M( A.rows() );
3355 const size_t N( B.columns() );
3356 const size_t K( A.columns() );
// End of the SIMD-processed row range: M masked with -SIMDSIZE when a remainder is
// possible (rounds down to a SIMDSIZE multiple, assuming SIMDSIZE is a power of two).
3360 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// Guards the strip of eight SIMD vectors that follows.
3365 if( IsIntegral_v<ElementType> )
3368 for(
size_t j=0UL; j<N; ++j )
// k range for column j, clipped by the triangular structure of A and B.
3370 const size_t kbegin( ( IsLower_v<MT5> )
3371 ?( ( IsUpper_v<MT4> )
3372 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3373 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3374 :( IsUpper_v<MT4> ? i : 0UL ) );
3375 const size_t kend( ( IsUpper_v<MT5> )
3376 ?( ( IsLower_v<MT4> )
3377 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3378 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3379 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Eight accumulators, one per SIMD vector of the strip; B(k,j) is broadcast once per k.
3390 for(
size_t k=kbegin; k<kend; ++k ) {
3391 const SIMDType b1(
set( B(k,j) ) );
3392 xmm1 -= A.load(i ,k) * b1;
3393 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3394 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3395 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3396 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
3397 xmm6 -= A.load(i+
SIMDSIZE*5UL,k) * b1;
3398 xmm7 -= A.load(i+
SIMDSIZE*6UL,k) * b1;
3399 xmm8 -= A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the updated strip back to column j of C.
3402 C.store( i , j, xmm1 );
3404 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3405 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3406 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
3407 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
3408 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
3409 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// Strip of five SIMD vectors, two columns (j, j+1) per iteration.
3418 for( ; (j+2UL) <= N; j+=2UL )
3420 const size_t kbegin( ( IsLower_v<MT5> )
3421 ?( ( IsUpper_v<MT4> )
3422 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3423 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3424 :( IsUpper_v<MT4> ? i : 0UL ) );
3425 const size_t kend( ( IsUpper_v<MT5> )
3426 ?( ( IsLower_v<MT4> )
3427 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3428 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3429 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
3436 SIMDType xmm6 ( C.load(i ,j+1UL) );
3442 for(
size_t k=kbegin; k<kend; ++k ) {
3448 const SIMDType b1(
set( B(k,j ) ) );
3449 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 go to column j, xmm6..xmm10 to column j+1.
3462 C.store( i , j , xmm1 );
3464 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3465 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3466 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
3467 C.store( i , j+1UL, xmm6 );
3468 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
3469 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
3470 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
3471 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Strip of five SIMD vectors, single column j.
3476 const size_t kbegin( ( IsLower_v<MT5> )
3477 ?( ( IsUpper_v<MT4> )
3478 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3479 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3480 :( IsUpper_v<MT4> ? i : 0UL ) );
3481 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
3489 for(
size_t k=kbegin; k<kend; ++k ) {
3490 const SIMDType b1(
set( B(k,j) ) );
3491 xmm1 -= A.load(i ,k) * b1;
3492 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3493 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3494 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3495 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
3498 C.store( i , j, xmm1 );
3500 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3501 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3502 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// Strip of four SIMD vectors, two columns per iteration.
3510 for( ; (j+2UL) <= N; j+=2UL )
3512 const size_t kbegin( ( IsLower_v<MT5> )
3513 ?( ( IsUpper_v<MT4> )
3514 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3515 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3516 :( IsUpper_v<MT4> ? i : 0UL ) );
3517 const size_t kend( ( IsUpper_v<MT5> )
3518 ?( ( IsLower_v<MT4> )
3519 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3520 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3521 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
3532 for(
size_t k=kbegin; k<kend; ++k ) {
3537 const SIMDType b1(
set( B(k,j ) ) );
3538 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm4 go to column j, xmm5..xmm8 to column j+1.
3549 C.store( i , j , xmm1 );
3551 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3552 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3553 C.store( i , j+1UL, xmm5 );
3554 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
3555 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
3556 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Strip of four SIMD vectors, single column j.
3561 const size_t kbegin( ( IsLower_v<MT5> )
3562 ?( ( IsUpper_v<MT4> )
3563 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3564 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3565 :( IsUpper_v<MT4> ? i : 0UL ) );
3566 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
3573 for(
size_t k=kbegin; k<kend; ++k ) {
3574 const SIMDType b1(
set( B(k,j) ) );
3575 xmm1 -= A.load(i ,k) * b1;
3576 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3577 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3578 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3581 C.store( i , j, xmm1 );
3583 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3584 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// Strip of three SIMD vectors, two columns per iteration.
3592 for( ; (j+2UL) <= N; j+=2UL )
3594 const size_t kbegin( ( IsLower_v<MT5> )
3595 ?( ( IsUpper_v<MT4> )
3596 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3597 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3598 :( IsUpper_v<MT4> ? i : 0UL ) );
3599 const size_t kend( ( IsUpper_v<MT5> )
3600 ?( ( IsLower_v<MT4> )
3601 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3602 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3603 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
3612 for(
size_t k=kbegin; k<kend; ++k ) {
3616 const SIMDType b1(
set( B(k,j ) ) );
3617 const SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm3 go to column j, xmm4..xmm6 to column j+1.
3626 C.store( i , j , xmm1 );
3628 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3629 C.store( i , j+1UL, xmm4 );
3630 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
3631 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Strip of three SIMD vectors, single column j.
3636 const size_t kbegin( ( IsLower_v<MT5> )
3637 ?( ( IsUpper_v<MT4> )
3638 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3639 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3640 :( IsUpper_v<MT4> ? i : 0UL ) );
3641 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
3647 for(
size_t k=kbegin; k<kend; ++k ) {
3648 const SIMDType b1(
set( B(k,j) ) );
3649 xmm1 -= A.load(i ,k) * b1;
3650 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3651 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3654 C.store( i , j, xmm1 );
3656 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// Strip of two SIMD vectors. The column range starts at i when UPP is set.
3663 size_t j(
UPP ? i : 0UL );
// Four columns per iteration.
3665 for( ; (j+4UL) <= jend; j+=4UL )
3667 const size_t kbegin( ( IsLower_v<MT5> )
3668 ?( ( IsUpper_v<MT4> )
3669 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3670 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3671 :( IsUpper_v<MT4> ? i : 0UL ) );
3672 const size_t kend( ( IsUpper_v<MT5> )
3673 ?( ( IsLower_v<MT4> )
3674 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
3675 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
3676 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
3687 for(
size_t k=kbegin; k<kend; ++k ) {
3690 const SIMDType b1(
set( B(k,j ) ) );
3691 const SIMDType b2(
set( B(k,j+1UL) ) );
3692 const SIMDType b3(
set( B(k,j+2UL) ) );
3693 const SIMDType b4(
set( B(k,j+3UL) ) );
3704 C.store( i , j , xmm1 );
3706 C.store( i , j+1UL, xmm3 );
3707 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
3708 C.store( i , j+2UL, xmm5 );
3709 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
3710 C.store( i , j+3UL, xmm7 );
3711 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns per iteration.
3714 for( ; (j+3UL) <= jend; j+=3UL )
3716 const size_t kbegin( ( IsLower_v<MT5> )
3717 ?( ( IsUpper_v<MT4> )
3718 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3719 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3720 :( IsUpper_v<MT4> ? i : 0UL ) );
3721 const size_t kend( ( IsUpper_v<MT5> )
3722 ?( ( IsLower_v<MT4> )
3723 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
3724 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
3725 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
3734 for(
size_t k=kbegin; k<kend; ++k ) {
3737 const SIMDType b1(
set( B(k,j ) ) );
3738 const SIMDType b2(
set( B(k,j+1UL) ) );
3739 const SIMDType b3(
set( B(k,j+2UL) ) );
3748 C.store( i , j , xmm1 );
3750 C.store( i , j+1UL, xmm3 );
3751 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
3752 C.store( i , j+2UL, xmm5 );
3753 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns per iteration, with the k loop additionally unrolled by two.
3756 for( ; (j+2UL) <= jend; j+=2UL )
3758 const size_t kbegin( ( IsLower_v<MT5> )
3759 ?( ( IsUpper_v<MT4> )
3760 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3761 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3762 :( IsUpper_v<MT4> ? i : 0UL ) );
3763 const size_t kend( ( IsUpper_v<MT5> )
3764 ?( ( IsLower_v<MT4> )
3765 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3766 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3767 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
// Unrolled by two in k: processes the pair (k, k+1) per iteration.
3776 for( ; (k+2UL) <= kend; k+=2UL ) {
3777 const SIMDType a1( A.load(i ,k ) );
3779 const SIMDType a3( A.load(i ,k+1UL) );
3781 const SIMDType b1(
set( B(k ,j ) ) );
3782 const SIMDType b2(
set( B(k ,j+1UL) ) );
3783 const SIMDType b3(
set( B(k+1UL,j ) ) );
3784 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
// One-step loop for a leftover k.
3795 for( ; k<kend; ++k ) {
3798 const SIMDType b1(
set( B(k,j ) ) );
3799 const SIMDType b2(
set( B(k,j+1UL) ) );
// The two accumulator sets of the unrolled loop are summed on store.
3806 C.store( i , j , xmm1+xmm5 );
3807 C.store( i+
SIMDSIZE, j , xmm2+xmm6 );
3808 C.store( i , j+1UL, xmm3+xmm7 );
3809 C.store( i+
SIMDSIZE, j+1UL, xmm4+xmm8 );
// Strip of two SIMD vectors, single column j, k unrolled by two.
3814 const size_t kbegin( ( IsLower_v<MT5> )
3815 ?( ( IsUpper_v<MT4> )
3816 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3817 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3818 :( IsUpper_v<MT4> ? i : 0UL ) );
3819 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
3826 for( ; (k+2UL) <= kend; k+=2UL ) {
3827 const SIMDType b1(
set( B(k ,j) ) );
3828 const SIMDType b2(
set( B(k+1UL,j) ) );
3829 xmm1 -= A.load(i ,k ) * b1;
3830 xmm2 -= A.load(i+
SIMDSIZE,k ) * b1;
3831 xmm3 -= A.load(i ,k+1UL) * b2;
3832 xmm4 -= A.load(i+
SIMDSIZE,k+1UL) * b2;
3835 for( ; k<kend; ++k ) {
3836 const SIMDType b1(
set( B(k,j) ) );
3837 xmm1 -= A.load(i ,k) * b1;
3841 C.store( i , j, xmm1+xmm3 );
3842 C.store( i+
SIMDSIZE, j, xmm2+xmm4 );
// Strip of a single SIMD vector. The column range starts at i when UPP is set.
3849 size_t j(
UPP ? i : 0UL );
// Four columns per iteration, k unrolled by two.
3851 for( ; (j+4UL) <= jend; j+=4UL )
3853 const size_t kbegin( ( IsLower_v<MT5> )
3854 ?( ( IsUpper_v<MT4> )
3855 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3856 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3857 :( IsUpper_v<MT4> ? i : 0UL ) );
3858 const size_t kend( ( IsUpper_v<MT5> )
3859 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
3869 for( ; (k+2UL) <= kend; k+=2UL ) {
3871 const SIMDType a2( A.load(i,k+1UL) );
3872 xmm1 -= a1 *
set( B(k ,j ) );
3873 xmm2 -= a1 *
set( B(k ,j+1UL) );
3874 xmm3 -= a1 *
set( B(k ,j+2UL) );
3875 xmm4 -= a1 *
set( B(k ,j+3UL) );
3876 xmm5 -= a2 *
set( B(k+1UL,j ) );
3877 xmm6 -= a2 *
set( B(k+1UL,j+1UL) );
3878 xmm7 -= a2 *
set( B(k+1UL,j+2UL) );
3879 xmm8 -= a2 *
set( B(k+1UL,j+3UL) );
3882 for( ; k<kend; ++k ) {
3884 xmm1 -= a1 *
set( B(k,j ) );
3885 xmm2 -= a1 *
set( B(k,j+1UL) );
3886 xmm3 -= a1 *
set( B(k,j+2UL) );
3887 xmm4 -= a1 *
set( B(k,j+3UL) );
3890 C.store( i, j , xmm1+xmm5 );
3891 C.store( i, j+1UL, xmm2+xmm6 );
3892 C.store( i, j+2UL, xmm3+xmm7 );
3893 C.store( i, j+3UL, xmm4+xmm8 );
// Three columns per iteration, k unrolled by two.
3896 for( ; (j+3UL) <= jend; j+=3UL )
3898 const size_t kbegin( ( IsLower_v<MT5> )
3899 ?( ( IsUpper_v<MT4> )
3900 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3901 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3902 :( IsUpper_v<MT4> ? i : 0UL ) );
3903 const size_t kend( ( IsUpper_v<MT5> )
3904 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
3913 for( ; (k+2UL) <= kend; k+=2UL ) {
3915 const SIMDType a2( A.load(i,k+1UL) );
3916 xmm1 -= a1 *
set( B(k ,j ) );
3917 xmm2 -= a1 *
set( B(k ,j+1UL) );
3918 xmm3 -= a1 *
set( B(k ,j+2UL) );
3919 xmm4 -= a2 *
set( B(k+1UL,j ) );
3920 xmm5 -= a2 *
set( B(k+1UL,j+1UL) );
3921 xmm6 -= a2 *
set( B(k+1UL,j+2UL) );
3924 for( ; k<kend; ++k ) {
3926 xmm1 -= a1 *
set( B(k,j ) );
3927 xmm2 -= a1 *
set( B(k,j+1UL) );
3928 xmm3 -= a1 *
set( B(k,j+2UL) );
3931 C.store( i, j , xmm1+xmm4 );
3932 C.store( i, j+1UL, xmm2+xmm5 );
3933 C.store( i, j+2UL, xmm3+xmm6 );
// Two columns per iteration, k unrolled by two.
3936 for( ; (j+2UL) <= jend; j+=2UL )
3938 const size_t kbegin( ( IsLower_v<MT5> )
3939 ?( ( IsUpper_v<MT4> )
3940 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3941 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3942 :( IsUpper_v<MT4> ? i : 0UL ) );
3943 const size_t kend( ( IsUpper_v<MT5> )
3944 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3952 for( ; (k+2UL) <= kend; k+=2UL ) {
3954 const SIMDType a2( A.load(i,k+1UL) );
3955 xmm1 -= a1 *
set( B(k ,j ) );
3956 xmm2 -= a1 *
set( B(k ,j+1UL) );
3957 xmm3 -= a2 *
set( B(k+1UL,j ) );
3958 xmm4 -= a2 *
set( B(k+1UL,j+1UL) );
3961 for( ; k<kend; ++k ) {
3963 xmm1 -= a1 *
set( B(k,j ) );
3964 xmm2 -= a1 *
set( B(k,j+1UL) );
3967 C.store( i, j , xmm1+xmm3 );
3968 C.store( i, j+1UL, xmm2+xmm4 );
// Single column j; the k loop runs up to K here.
3973 const size_t kbegin( ( IsLower_v<MT5> )
3974 ?( ( IsUpper_v<MT4> )
3975 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3976 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3977 :( IsUpper_v<MT4> ? i : 0UL ) );
3983 for( ; (k+2UL) <= K; k+=2UL ) {
3984 xmm1 -= A.load(i,k ) *
set( B(k ,j) );
3985 xmm2 -= A.load(i,k+1UL) *
set( B(k+1UL,j) );
3989 xmm1 -= A.load(i,k) *
set( B(k,j) );
3992 C.store( i, j, xmm1+xmm2 );
// Scalar handling of the rows that remain when `remainder` is true.
3996 for( ; remainder && i<M; ++i )
3998 const size_t jend(
LOW ? i+1UL : N );
3999 size_t j(
UPP ? i : 0UL );
// Two columns per iteration with scalar accumulators.
4001 for( ; (j+2UL) <= jend; j+=2UL )
4003 const size_t kbegin( ( IsLower_v<MT5> )
4004 ?( ( IsUpper_v<MT4> )
4005 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4006 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4007 :( IsUpper_v<MT4> ? i : 0UL ) );
4008 const size_t kend( ( IsUpper_v<MT5> )
4009 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4015 for(
size_t k=kbegin; k<kend; ++k ) {
4016 value1 -= A(i,k) * B(k,j );
4017 value2 -= A(i,k) * B(k,j+1UL);
4021 C(i,j+1UL) = value2;
// Single leftover column.
4026 const size_t kbegin( ( IsLower_v<MT5> )
4027 ?( ( IsUpper_v<MT4> )
4028 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4029 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4030 :( IsUpper_v<MT4> ? i : 0UL ) );
4034 for(
size_t k=kbegin; k<K; ++k ) {
4035 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel that is selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false.
4059 template<
typename MT3
4062 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4063 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// No vectorized variant applies: forward to the default sub-assign kernel.
4065 selectDefaultSubAssignKernel( C, A, B );
4085 template<
typename MT3
4088 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4089 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS stage of the subAssign kernel chain, fallback overload.
// DisableIf_t removes this overload whenever UseBlasKernel_v<MT3,MT4,MT5> is true,
// so it is chosen only when the BLAS kernel cannot be used for these operand types.
4115 template<
typename MT3
4118 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4119 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Forward C, A and B unchanged to the large-matrix kernel.
4121 selectLargeSubAssignKernel( C, A, B );
// BLAS stage of the subAssign kernel chain, BLAS overload.
// Compiled only under the preprocessor guard below and enabled only when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
4127 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4141 template<
typename MT3
4144 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4145 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4147 using ET = ElementType_t<MT3>;
// Triangular left operand: copy B into a temporary, let trmm combine it with A from
// the left (scale factor ET(1)), then subtraction-assign the temporary to C.
4149 if( IsTriangular_v<MT4> ) {
4150 ResultType_t<MT3> tmp(
serial( B ) );
4151 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4152 subAssign( C, tmp );
// Triangular right operand: same scheme with the roles swapped (copy of A, trmm from
// the right with B).
4154 else if( IsTriangular_v<MT5> ) {
4155 ResultType_t<MT3> tmp(
serial( A ) );
4156 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4157 subAssign( C, tmp );
// General case: one gemm call with the scaling arguments ET(-1) and ET(1).
// NOTE(review): presumably alpha=-1 and beta=1, i.e. C = C - A*B -- confirm against the
// gemm wrapper's parameter order.
4160 gemm( C, A, B, ET(-1), ET(1) );
// Symmetry-exploiting overload for subtracting the product from lhs (the line carrying
// the function name is not part of this excerpt). It is enabled only when
// CanExploitSymmetry_v<MT,MT1,MT2> is true.
4183 template<
typename MT >
4185 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Functor applied to the restructured product before it is handed to subAssign.
4194 const ForwardFunctor fwd;
// Both operands symmetric: transpose both.
4196 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4197 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand symmetric: transpose just the left operand.
4198 else if( IsSymmetric_v<MT1> )
4199 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: transpose just the right operand (given the enabling condition, the
// right operand is presumably the symmetric one here).
4201 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
4223 template<
typename MT
4225 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
4237 schurAssign( ~lhs, tmp );
4266 template<
typename MT
4269 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4276 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4279 else if( rhs.lhs_.columns() == 0UL ) {
4315 template<
typename MT
4318 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4322 using TmpType = If_t< SO, ResultType, OppositeType >;
4334 const ForwardFunctor fwd;
4336 const TmpType tmp( rhs );
4357 template<
typename MT >
4359 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4368 const ForwardFunctor fwd;
4370 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4372 else if( IsSymmetric_v<MT1> )
4396 template<
typename MT
4399 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4406 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4441 template<
typename MT >
4443 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4452 const ForwardFunctor fwd;
4454 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4456 else if( IsSymmetric_v<MT1> )
4484 template<
typename MT
4487 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4494 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4529 template<
typename MT >
4531 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4540 const ForwardFunctor fwd;
4542 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4544 else if( IsSymmetric_v<MT1> )
4570 template<
typename MT
4630 template<
typename MT1
4637 class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4638 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4639 ,
private Computation
4644 using MMM = TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4646 using RES = ResultType_t<MMM>;
4647 using RT1 = ResultType_t<MT1>;
4648 using RT2 = ResultType_t<MT2>;
4649 using ET1 = ElementType_t<RT1>;
4650 using ET2 = ElementType_t<RT2>;
4651 using CT1 = CompositeType_t<MT1>;
4652 using CT2 = CompositeType_t<MT2>;
4657 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4662 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
4666 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
4667 static constexpr
bool HERM = ( HF && !( LF || UF ) );
4668 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4669 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
4678 template<
typename T1,
typename T2,
typename T3 >
4679 static constexpr
bool CanExploitSymmetry_v =
4680 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
4688 template<
typename T1,
typename T2,
typename T3 >
4689 static constexpr
bool IsEvaluationRequired_v =
4690 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
4697 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4698 static constexpr
bool UseBlasKernel_v =
4700 !SYM && !HERM && !LOW && !UPP &&
4701 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4702 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4703 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4704 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4705 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4706 IsBLASCompatible_v< ElementType_t<T1> > &&
4707 IsBLASCompatible_v< ElementType_t<T2> > &&
4708 IsBLASCompatible_v< ElementType_t<T3> > &&
4709 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4710 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4711 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
4718 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4719 static constexpr
bool UseVectorizedDefaultKernel_v =
4720 ( useOptimizedKernels &&
4721 !IsDiagonal_v<T2> &&
4722 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4723 IsSIMDCombinable_v< ElementType_t<T1>
4727 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T2> > &&
4728 HasSIMDMult_v< ElementType_t<T3>, ElementType_t<T3> > );
4735 using ForwardFunctor =
If_t< HERM
4751 using This = DMatScalarMultExpr<MMM,ST,true>;
4754 using BaseType = DenseMatrix<This,true>;
4758 , DeclHermTrait< MultTrait_t<RES,ST> >
4760 , DeclSymTrait< MultTrait_t<RES,ST> >
4763 , DeclDiagTrait< MultTrait_t<RES,ST> >
4764 , DeclLowTrait< MultTrait_t<RES,ST> > >
4766 , DeclUppTrait< MultTrait_t<RES,ST> >
4767 , MultTrait<RES,ST> > > > >::Type;
4772 using SIMDType = SIMDTrait_t<ElementType>;
4777 using LeftOperand =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4783 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4786 using RT = If_t< evaluateRight, const RT2, CT2 >;
4792 ( !IsDiagonal_v<MT1> &&
4793 MT1::simdEnabled && MT2::simdEnabled &&
4794 IsSIMDCombinable_v<ET1,ET2,ST> &&
4795 HasSIMDAdd_v<ET1,ET2> &&
4796 HasSIMDMult_v<ET1,ET2> );
4846 if( j >=
matrix_.columns() ) {
4849 return (*
this)(i,j);
4858 inline size_t rows()
const {
4868 inline size_t columns()
const {
4899 template<
typename T >
4900 inline bool canAlias(
const T* alias )
const {
4901 return matrix_.canAlias( alias );
4911 template<
typename T >
4912 inline bool isAliased(
const T* alias )
const {
4913 return matrix_.isAliased( alias );
4934 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4936 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
4937 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
4959 template<
typename MT
4962 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4969 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4970 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
4972 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4975 else if( left.columns() == 0UL ) {
4990 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Entry point of the kernel chain for assigning the scaled product: decides between
// the small-matrix kernel and the BLAS kernel stage.
5005 template<
typename MT3
5009 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is used when A is diagonal, when (outside debug mode) A has at most
// SIMDSIZE*10 rows, or when C holds fewer than TDMATTDMATMULT_THRESHOLD elements.
5011 if( ( IsDiagonal_v<MT4> ) ||
5012 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
5013 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5014 selectSmallAssignKernel( C, A, B, scalar );
// Alternative path (the `else` line is not part of this excerpt): the BLAS kernel stage.
5016 selectBlasAssignKernel( C, A, B, scalar );
5034 template<
typename MT3
5038 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5039 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5041 const size_t M( A.rows() );
5042 const size_t N( B.columns() );
5043 const size_t K( A.columns() );
5047 for(
size_t j=0UL; j<N; ++j )
5049 const size_t kbegin( ( IsLower_v<MT5> )
5050 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5052 const size_t kend( ( IsUpper_v<MT5> )
5053 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
5057 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
5058 for(
size_t i=0UL; i<M; ++i ) {
5065 const size_t ibegin( ( IsLower_v<MT4> )
5066 ?( ( IsStrictlyLower_v<MT4> )
5067 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
5068 :( LOW ?
max(j,kbegin) : kbegin ) )
5069 :( LOW ? j : 0UL ) );
5070 const size_t iend( ( IsUpper_v<MT4> )
5071 ?( ( IsStrictlyUpper_v<MT4> )
5072 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
5073 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
5074 :( UPP ? j+1UL : M ) );
5076 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5077 for(
size_t i=0UL; i<ibegin; ++i ) {
5081 else if( IsStrictlyLower_v<MT4> ) {
5084 for(
size_t i=ibegin; i<iend; ++i ) {
5085 C(i,j) = A(i,kbegin) * B(kbegin,j);
5087 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5088 for(
size_t i=iend; i<M; ++i ) {
5092 else if( IsStrictlyUpper_v<MT4> ) {
5093 reset( C(M-1UL,j) );
5097 for(
size_t k=kbegin+1UL; k<kend; ++k )
5099 const size_t ibegin( ( IsLower_v<MT4> )
5100 ?( ( IsStrictlyLower_v<MT4> )
5101 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
5102 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
5103 :( SYM || HERM || LOW ? j : 0UL ) );
5104 const size_t iend( ( IsUpper_v<MT4> )
5105 ?( ( IsStrictlyUpper_v<MT4> )
5106 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
5107 :( UPP ?
min(j+1UL,k) : k ) )
5108 :( UPP ? j+1UL : M ) );
5110 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5113 for(
size_t i=ibegin; i<iend; ++i ) {
5114 C(i,j) += A(i,k) * B(k,j);
5116 if( IsUpper_v<MT4> ) {
5117 C(iend,j) = A(iend,k) * B(k,j);
5122 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
5123 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
5124 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5125 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5126 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
5127 :( UPP ? j+1UL : M ) );
5129 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5132 for(
size_t i=ibegin; i<iend; ++i ) {
5139 for(
size_t j=1UL; j<N; ++j ) {
5140 for(
size_t i=0UL; i<j; ++i ) {
5141 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for the scaled product of a non-diagonal A and a diagonal B
// (selected by the EnableIf_t condition below). Each computed element needs a single
// product: C(i,j) = A(i,j) * B(j,j) * scalar.
5162 template<
typename MT3
5166 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5167 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5171 const size_t M( A.rows() );
5172 const size_t N( B.columns() );
// Column-wise traversal of C.
5174 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j, narrowed when A is (strictly) lower or upper
// triangular; the fallback bounds for a non-triangular A are not part of this excerpt.
5176 const size_t ibegin( ( IsLower_v<MT4> )
5177 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5179 const size_t iend( ( IsUpper_v<MT4> )
5180 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows above the range for lower A (loop body not in this excerpt; presumably resets C(i,j)).
5184 if( IsLower_v<MT4> ) {
5185 for(
size_t i=0UL; i<ibegin; ++i ) {
5189 for(
size_t i=ibegin; i<iend; ++i ) {
5190 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows below the range for upper A (loop body not in this excerpt; presumably resets C(i,j)).
5192 if( IsUpper_v<MT4> ) {
5193 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for the scaled product of a diagonal A and a non-diagonal B
// (selected by the EnableIf_t condition below). Each computed element needs a single
// product: C(i,j) = A(i,i) * B(i,j) * scalar.
5215 template<
typename MT3
5219 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5220 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5224 const size_t M( A.rows() );
5225 const size_t N( B.columns() );
// Column-wise traversal of C.
5227 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j, narrowed when B is (strictly) lower or upper
// triangular; the fallback bounds for a non-triangular B are not part of this excerpt.
5229 const size_t ibegin( ( IsLower_v<MT5> )
5230 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5232 const size_t iend( ( IsUpper_v<MT5> )
5233 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// NOTE(review): the two guards below test MT4 (the diagonal operand) although the
// bounds above are derived from MT5 -- confirm this is intended.
5237 if( IsLower_v<MT4> ) {
5238 for(
size_t i=0UL; i<ibegin; ++i ) {
5242 for(
size_t i=ibegin; i<iend; ++i ) {
5243 C(i,j) = A(i,i) * B(i,j) * scalar;
5245 if( IsUpper_v<MT4> ) {
5246 for(
size_t i=iend; i<M; ++i ) {
// Default kernel for the scaled product of two diagonal matrices (selected by the
// EnableIf_t condition below).
5268 template<
typename MT3
5272 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5273 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Only diagonal elements are written in the visible lines; any handling of the
// off-diagonal elements of C is not part of this excerpt.
5279 for(
size_t i=0UL; i<A.rows(); ++i ) {
5280 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix kernel for the scaled product assignment, non-vectorized overload.
// DisableIf_t removes this overload whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
5299 template<
typename MT3
5303 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5304 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward all arguments unchanged to the default kernel.
5306 selectDefaultAssignKernel( C, A, B, scalar );
// Small-matrix kernel for the scaled product assignment, overload for a row-major
// target C when the vectorized default kernel is usable (see the EnableIf_t condition).
// One operand is converted to its opposite storage order and the product is
// re-dispatched through assign().
5325 template<
typename MT3
5329 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5330 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Functor applied to the restructured product before scaling and assignment.
5337 const ForwardFunctor fwd;
// A resizable, B not: convert B.
5339 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
5340 const OppositeType_t<MT5> tmp(
serial( B ) );
5341 assign( C, fwd( A * tmp ) * scalar );
// B resizable, A not: convert A.
5343 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
5344 const OppositeType_t<MT4> tmp(
serial( A ) );
5345 assign( C, fwd( tmp * B ) * scalar );
// Otherwise convert B when it has no more elements than A ...
5347 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5348 const OppositeType_t<MT5> tmp(
serial( B ) );
5349 assign( C, fwd( A * tmp ) * scalar );
// ... and convert A in the remaining case (the `else` line is not part of this excerpt).
5352 const OppositeType_t<MT4> tmp(
serial( A ) );
5353 assign( C, fwd( tmp * B ) * scalar );
5373 template<
typename MT3
5377 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5378 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5380 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5382 const size_t M( A.rows() );
5383 const size_t N( B.columns() );
5384 const size_t K( A.columns() );
5388 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
5391 const SIMDType factor(
set( scalar ) );
5393 if( LOW && UPP && M >
SIMDSIZE*3UL ) {
5400 if( IsIntegral_v<ElementType> )
5402 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*7UL) < ipos; i+=
SIMDSIZE*8UL ) {
5403 for(
size_t j=0UL; j<N; ++j )
5405 const size_t kbegin( ( IsLower_v<MT5> )
5406 ?( ( IsUpper_v<MT4> )
5407 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5408 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5409 :( IsUpper_v<MT4> ? i : 0UL ) );
5410 const size_t kend( ( IsUpper_v<MT5> )
5411 ?( ( IsLower_v<MT4> )
5412 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5413 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5414 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
5416 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5418 for(
size_t k=kbegin; k<kend; ++k ) {
5419 const SIMDType b1(
set( B(k,j) ) );
5420 xmm1 += A.load(i ,k) * b1;
5421 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5422 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5423 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5424 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
5425 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
5426 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
5427 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
5430 C.store( i , j, xmm1 * factor );
5431 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5432 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5433 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5434 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
5435 C.store( i+
SIMDSIZE*5UL, j, xmm6 * factor );
5436 C.store( i+
SIMDSIZE*6UL, j, xmm7 * factor );
5437 C.store( i+
SIMDSIZE*7UL, j, xmm8 * factor );
5442 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*4UL) < ipos; i+=
SIMDSIZE*5UL )
5446 for( ; (j+2UL) <= N; j+=2UL )
5448 const size_t kbegin( ( IsLower_v<MT5> )
5449 ?( ( IsUpper_v<MT4> )
5450 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5451 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5452 :( IsUpper_v<MT4> ? i : 0UL ) );
5453 const size_t kend( ( IsUpper_v<MT5> )
5454 ?( ( IsLower_v<MT4> )
5455 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5456 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5457 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
5459 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5461 for(
size_t k=kbegin; k<kend; ++k ) {
5462 const SIMDType a1( A.load(i ,k) );
5463 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5464 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5465 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
5466 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
5467 const SIMDType b1(
set( B(k,j ) ) );
5468 const SIMDType b2(
set( B(k,j+1UL) ) );
5481 C.store( i , j , xmm1 * factor );
5482 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5483 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5484 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
5485 C.store( i+
SIMDSIZE*4UL, j , xmm5 * factor );
5486 C.store( i , j+1UL, xmm6 * factor );
5487 C.store( i+
SIMDSIZE , j+1UL, xmm7 * factor );
5488 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5489 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5490 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5495 const size_t kbegin( ( IsLower_v<MT5> )
5496 ?( ( IsUpper_v<MT4> )
5497 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5498 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5499 :( IsUpper_v<MT4> ? i : 0UL ) );
5500 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
5502 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5504 for(
size_t k=kbegin; k<kend; ++k ) {
5505 const SIMDType b1(
set( B(k,j) ) );
5506 xmm1 += A.load(i ,k) * b1;
5507 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5508 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5509 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5510 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
5513 C.store( i , j, xmm1 * factor );
5514 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5515 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5516 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5517 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
5523 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE*4UL,N) : N );
5524 size_t j( UPP ? i : 0UL );
5526 for( ; (j+2UL) <= jend; j+=2UL )
5528 const size_t kbegin( ( IsLower_v<MT5> )
5529 ?( ( IsUpper_v<MT4> )
5530 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5531 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5532 :( IsUpper_v<MT4> ? i : 0UL ) );
5533 const size_t kend( ( IsUpper_v<MT5> )
5534 ?( ( IsLower_v<MT4> )
5535 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5536 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5537 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
5539 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5541 for(
size_t k=kbegin; k<kend; ++k ) {
5542 const SIMDType a1( A.load(i ,k) );
5543 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5544 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5545 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
5546 const SIMDType b1(
set( B(k,j ) ) );
5547 const SIMDType b2(
set( B(k,j+1UL) ) );
5558 C.store( i , j , xmm1 * factor );
5559 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5560 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5561 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
5562 C.store( i , j+1UL, xmm5 * factor );
5563 C.store( i+
SIMDSIZE , j+1UL, xmm6 * factor );
5564 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5565 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5570 const size_t kbegin( ( IsLower_v<MT5> )
5571 ?( ( IsUpper_v<MT4> )
5572 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5573 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5574 :( IsUpper_v<MT4> ? i : 0UL ) );
5575 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
5577 SIMDType xmm1, xmm2, xmm3, xmm4;
5579 for(
size_t k=kbegin; k<kend; ++k ) {
5580 const SIMDType b1(
set( B(k,j) ) );
5581 xmm1 += A.load(i ,k) * b1;
5582 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5583 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5584 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5587 C.store( i , j, xmm1 * factor );
5588 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5589 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5590 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5596 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE*3UL,N) : N );
5597 size_t j( UPP ? i : 0UL );
5599 for( ; (j+2UL) <= jend; j+=2UL )
5601 const size_t kbegin( ( IsLower_v<MT5> )
5602 ?( ( IsUpper_v<MT4> )
5603 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5604 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5605 :( IsUpper_v<MT4> ? i : 0UL ) );
5606 const size_t kend( ( IsUpper_v<MT5> )
5607 ?( ( IsLower_v<MT4> )
5608 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5609 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5610 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
5612 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5614 for(
size_t k=kbegin; k<kend; ++k ) {
5615 const SIMDType a1( A.load(i ,k) );
5616 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5617 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5618 const SIMDType b1(
set( B(k,j ) ) );
5619 const SIMDType b2(
set( B(k,j+1UL) ) );
5628 C.store( i , j , xmm1 * factor );
5629 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5630 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5631 C.store( i , j+1UL, xmm4 * factor );
5632 C.store( i+
SIMDSIZE , j+1UL, xmm5 * factor );
5633 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 * factor );
5638 const size_t kbegin( ( IsLower_v<MT5> )
5639 ?( ( IsUpper_v<MT4> )
5640 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5641 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5642 :( IsUpper_v<MT4> ? i : 0UL ) );
5643 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
5645 SIMDType xmm1, xmm2, xmm3;
5647 for(
size_t k=kbegin; k<kend; ++k ) {
5648 const SIMDType b1(
set( B(k,j) ) );
5649 xmm1 += A.load(i ,k) * b1;
5650 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5651 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5654 C.store( i , j, xmm1 * factor );
5655 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5656 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5662 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
5663 size_t j( UPP ? i : 0UL );
5665 for( ; (j+4UL) <= jend; j+=4UL )
5667 const size_t kbegin( ( IsLower_v<MT5> )
5668 ?( ( IsUpper_v<MT4> )
5669 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5670 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5671 :( IsUpper_v<MT4> ? i : 0UL ) );
5672 const size_t kend( ( IsUpper_v<MT5> )
5673 ?( ( IsLower_v<MT4> )
5674 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
5675 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
5676 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
5678 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5680 for(
size_t k=kbegin; k<kend; ++k ) {
5681 const SIMDType a1( A.load(i ,k) );
5682 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
5683 const SIMDType b1(
set( B(k,j ) ) );
5684 const SIMDType b2(
set( B(k,j+1UL) ) );
5685 const SIMDType b3(
set( B(k,j+2UL) ) );
5686 const SIMDType b4(
set( B(k,j+3UL) ) );
5697 C.store( i , j , xmm1 * factor );
5698 C.store( i+
SIMDSIZE, j , xmm2 * factor );
5699 C.store( i , j+1UL, xmm3 * factor );
5700 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
5701 C.store( i , j+2UL, xmm5 * factor );
5702 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
5703 C.store( i , j+3UL, xmm7 * factor );
5704 C.store( i+
SIMDSIZE, j+3UL, xmm8 * factor );
5707 for( ; (j+3UL) <= jend; j+=3UL )
5709 const size_t kbegin( ( IsLower_v<MT5> )
5710 ?( ( IsUpper_v<MT4> )
5711 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5712 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5713 :( IsUpper_v<MT4> ? i : 0UL ) );
5714 const size_t kend( ( IsUpper_v<MT5> )
5715 ?( ( IsLower_v<MT4> )
5716 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
5717 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
5718 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
5720 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5722 for(
size_t k=kbegin; k<kend; ++k ) {
5723 const SIMDType a1( A.load(i ,k) );
5724 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
5725 const SIMDType b1(
set( B(k,j ) ) );
5726 const SIMDType b2(
set( B(k,j+1UL) ) );
5727 const SIMDType b3(
set( B(k,j+2UL) ) );
5736 C.store( i , j , xmm1 * factor );
5737 C.store( i+
SIMDSIZE, j , xmm2 * factor );
5738 C.store( i , j+1UL, xmm3 * factor );
5739 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
5740 C.store( i , j+2UL, xmm5 * factor );
5741 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
5744 for( ; (j+2UL) <= jend; j+=2UL )
5746 const size_t kbegin( ( IsLower_v<MT5> )
5747 ?( ( IsUpper_v<MT4> )
5748 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5749 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5750 :( IsUpper_v<MT4> ? i : 0UL ) );
5751 const size_t kend( ( IsUpper_v<MT5> )
5752 ?( ( IsLower_v<MT4> )
5753 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5754 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5755 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
5757 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5760 for( ; (k+2UL) <= kend; k+=2UL ) {
5761 const SIMDType a1( A.load(i ,k ) );
5762 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
5763 const SIMDType a3( A.load(i ,k+1UL) );
5764 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
5765 const SIMDType b1(
set( B(k ,j ) ) );
5766 const SIMDType b2(
set( B(k ,j+1UL) ) );
5767 const SIMDType b3(
set( B(k+1UL,j ) ) );
5768 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
5779 for( ; k<kend; ++k ) {
5780 const SIMDType a1( A.load(i ,k) );
5781 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
5782 const SIMDType b1(
set( B(k,j ) ) );
5783 const SIMDType b2(
set( B(k,j+1UL) ) );
5790 C.store( i , j , (xmm1+xmm5) * factor );
5791 C.store( i+
SIMDSIZE, j , (xmm2+xmm6) * factor );
5792 C.store( i , j+1UL, (xmm3+xmm7) * factor );
5793 C.store( i+
SIMDSIZE, j+1UL, (xmm4+xmm8) * factor );
5798 const size_t kbegin( ( IsLower_v<MT5> )
5799 ?( ( IsUpper_v<MT4> )
5800 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5801 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5802 :( IsUpper_v<MT4> ? i : 0UL ) );
5803 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
5805 SIMDType xmm1, xmm2, xmm3, xmm4;
5808 for( ; (k+2UL) <= kend; k+=2UL ) {
5809 const SIMDType b1(
set( B(k ,j) ) );
5810 const SIMDType b2(
set( B(k+1UL,j) ) );
5811 xmm1 += A.load(i ,k ) * b1;
5812 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
5813 xmm3 += A.load(i ,k+1UL) * b2;
5814 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
5817 for( ; k<kend; ++k ) {
5818 const SIMDType b1(
set( B(k,j) ) );
5819 xmm1 += A.load(i ,k) * b1;
5823 C.store( i , j, (xmm1+xmm3) * factor );
5824 C.store( i+
SIMDSIZE, j, (xmm2+xmm4) * factor );
5830 const size_t jend( SYM || HERM || LOW ?
min(i+
SIMDSIZE,N) : N );
5831 size_t j( UPP ? i : 0UL );
5833 for( ; (j+4UL) <= jend; j+=4UL )
5835 const size_t kbegin( ( IsLower_v<MT5> )
5836 ?( ( IsUpper_v<MT4> )
5837 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5838 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5839 :( IsUpper_v<MT4> ? i : 0UL ) );
5840 const size_t kend( ( IsUpper_v<MT5> )
5841 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
5844 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5847 for( ; (k+2UL) <= kend; k+=2UL ) {
5848 const SIMDType a1( A.load(i,k ) );
5849 const SIMDType a2( A.load(i,k+1UL) );
5850 xmm1 += a1 *
set( B(k ,j ) );
5851 xmm2 += a1 *
set( B(k ,j+1UL) );
5852 xmm3 += a1 *
set( B(k ,j+2UL) );
5853 xmm4 += a1 *
set( B(k ,j+3UL) );
5854 xmm5 += a2 *
set( B(k+1UL,j ) );
5855 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
5856 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
5857 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
5860 for( ; k<kend; ++k ) {
5861 const SIMDType a1( A.load(i,k) );
5862 xmm1 += a1 *
set( B(k,j ) );
5863 xmm2 += a1 *
set( B(k,j+1UL) );
5864 xmm3 += a1 *
set( B(k,j+2UL) );
5865 xmm4 += a1 *
set( B(k,j+3UL) );
5868 C.store( i, j , (xmm1+xmm5) * factor );
5869 C.store( i, j+1UL, (xmm2+xmm6) * factor );
5870 C.store( i, j+2UL, (xmm3+xmm7) * factor );
5871 C.store( i, j+3UL, (xmm4+xmm8) * factor );
5874 for( ; (j+3UL) <= jend; j+=3UL )
5876 const size_t kbegin( ( IsLower_v<MT5> )
5877 ?( ( IsUpper_v<MT4> )
5878 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5879 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5880 :( IsUpper_v<MT4> ? i : 0UL ) );
5881 const size_t kend( ( IsUpper_v<MT5> )
5882 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
5885 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5888 for( ; (k+2UL) <= kend; k+=2UL ) {
5889 const SIMDType a1( A.load(i,k ) );
5890 const SIMDType a2( A.load(i,k+1UL) );
5891 xmm1 += a1 *
set( B(k ,j ) );
5892 xmm2 += a1 *
set( B(k ,j+1UL) );
5893 xmm3 += a1 *
set( B(k ,j+2UL) );
5894 xmm4 += a2 *
set( B(k+1UL,j ) );
5895 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
5896 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
5899 for( ; k<kend; ++k ) {
5900 const SIMDType a1( A.load(i,k) );
5901 xmm1 += a1 *
set( B(k,j ) );
5902 xmm2 += a1 *
set( B(k,j+1UL) );
5903 xmm3 += a1 *
set( B(k,j+2UL) );
5906 C.store( i, j , (xmm1+xmm4) * factor );
5907 C.store( i, j+1UL, (xmm2+xmm5) * factor );
5908 C.store( i, j+2UL, (xmm3+xmm6) * factor );
5911 for( ; (j+2UL) <= jend; j+=2UL )
5913 const size_t kbegin( ( IsLower_v<MT5> )
5914 ?( ( IsUpper_v<MT4> )
5915 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5916 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5917 :( IsUpper_v<MT4> ? i : 0UL ) );
5918 const size_t kend( ( IsUpper_v<MT5> )
5919 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
5922 SIMDType xmm1, xmm2, xmm3, xmm4;
5925 for( ; k<kend; ++k ) {
5926 const SIMDType a1( A.load(i,k) );
5927 xmm1 += a1 *
set( B(k,j ) );
5928 xmm2 += a1 *
set( B(k,j+1UL) );
5931 for( ; (k+2UL) <= kend; k+=2UL ) {
5932 const SIMDType a1( A.load(i,k ) );
5933 const SIMDType a2( A.load(i,k+1UL) );
5934 xmm1 += a1 *
set( B(k ,j ) );
5935 xmm2 += a1 *
set( B(k ,j+1UL) );
5936 xmm3 += a2 *
set( B(k+1UL,j ) );
5937 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
5940 C.store( i, j , (xmm1+xmm3) * factor );
5941 C.store( i, j+1UL, (xmm2+xmm4) * factor );
5946 const size_t kbegin( ( IsLower_v<MT5> )
5947 ?( ( IsUpper_v<MT4> )
5948 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5949 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5950 :( IsUpper_v<MT4> ? i : 0UL ) );
5952 SIMDType xmm1, xmm2;
5955 for( ; (k+2UL) <= K; k+=2UL ) {
5956 xmm1 += A.load(i,k ) *
set( B(k ,j) );
5957 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
5961 xmm1 += A.load(i,k) *
set( B(k,j) );
5964 C.store( i, j, (xmm1+xmm2) * factor );
5968 for( ; remainder && i<M; ++i )
5970 size_t j( LOW && UPP ? i : 0UL );
5972 for( ; (j+2UL) <= N; j+=2UL )
5974 const size_t kbegin( ( IsLower_v<MT5> )
5975 ?( ( IsUpper_v<MT4> )
5976 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5977 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5978 :( IsUpper_v<MT4> ? i : 0UL ) );
5979 const size_t kend( ( IsUpper_v<MT5> )
5980 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
5986 for(
size_t k=kbegin; k<kend; ++k ) {
5987 value1 += A(i,k) * B(k,j );
5988 value2 += A(i,k) * B(k,j+1UL);
5991 C(i,j ) = value1 * scalar;
5992 C(i,j+1UL) = value2 * scalar;
5997 const size_t kbegin( ( IsLower_v<MT5> )
5998 ?( ( IsUpper_v<MT4> )
5999 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6000 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6001 :( IsUpper_v<MT4> ? i : 0UL ) );
6005 for(
size_t k=kbegin; k<K; ++k ) {
6006 value += A(i,k) * B(k,j);
6009 C(i,j) = value * scalar;
6014 if( ( SYM || HERM ) && ( M >
SIMDSIZE*4UL ) ) {
6015 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
6017 for(
size_t i=0UL; i<iend; ++i ) {
6018 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
6022 else if( LOW && !UPP && M >
SIMDSIZE*4UL ) {
6023 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
6025 for(
size_t i=0UL; i<iend; ++i ) {
6030 else if( !LOW && UPP && M >
SIMDSIZE*4UL ) {
6031 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
6033 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix kernel for the scaled product assignment, non-vectorized overload.
// DisableIf_t removes this overload whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
6055 template<
typename MT3
6059 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6060 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward all arguments unchanged to the default kernel.
6062 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix kernel for the scaled product assignment, overload enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. Exactly which of the five
// routines below runs is decided by conditions that are not part of this excerpt
// (presumably the SYM / HERM / LOW / UPP flags, in that order -- confirm).
6081 template<
typename MT3
6085 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6086 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6089 smmm( C, A, B, scalar );
6091 hmmm( C, A, B, scalar );
// The remaining three routines take an additional ST2(0) argument after the scalar.
6093 lmmm( C, A, B, scalar, ST2(0) );
6095 ummm( C, A, B, scalar, ST2(0) );
6097 mmm( C, A, B, scalar, ST2(0) );
// BLAS-stage kernel for the scaled assignment (fallback overload).
// Participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is
// false (DisableIf_t); in that case the work is handed to selectLargeAssignKernel().
//   C      - target matrix (written)
//   A, B   - left and right operand of the product (read-only)
//   scalar - scaling factor passed through unchanged
6115 template<
typename MT3
6119 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6120 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6122 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based kernel for the scaled assignment C = scalar * A * B.
// Only compiled when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// both non-zero, and only enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// The scalar is converted to the element type of C (ET) before being passed on.
//   - A triangular : trmm() with A on the left side (CblasLeft), lower/upper by IsLower_v<MT4>
//   - B triangular : trmm() with B on the right side (CblasRight), lower/upper by IsLower_v<MT5>
//   - otherwise    : gemm( C, A, B, ET(scalar), ET(0) )
// NOTE(review): the statements that initialize C before each trmm() call are not
// visible in this excerpt - confirm against the full file.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6141 template<
typename MT3
6145 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6146 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6148 using ET = ElementType_t<MT3>;
6150 if( IsTriangular_v<MT4> ) {
6152 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6154 else if( IsTriangular_v<MT5> ) {
6156 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6159 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment of the scaled product via an intermediate temporary.
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
// The expression is first evaluated serially into a temporary whose type depends on
// the storage-order flag SO (ResultType if SO is true, OppositeType otherwise); the
// temporary is then passed through the ForwardFunctor and assigned to the target.
// NOTE(review): the function signature is not visible in this excerpt; presumably
// this is the overload for targets that cannot take the dense kernels directly
// (e.g. sparse matrices) - confirm against the full file.
6177 template<
typename MT
6180 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6184 using TmpType = If_t< SO, ResultType, OppositeType >;
6196 const ForwardFunctor fwd;
// serial() forces a non-parallel evaluation of the whole expression into tmp.
6198 const TmpType tmp(
serial( rhs ) );
6199 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment of the scaled product.
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> is true, i.e. (per the definition
// near the top of the file) the target is row-major and at least one operand is symmetric.
// Each symmetric operand is replaced by its transpose before the product is re-formed,
// wrapped by the ForwardFunctor, scaled by rhs.scalar_ and assigned:
//   - both operands symmetric : trans(left) * trans(right)
//   - only MT1 symmetric      : trans(left) * right
//   - remaining case          : left * trans(right)
// NOTE(review): the function signature and the 'else' preceding the last call are not
// visible in this excerpt.
6217 template<
typename MT >
6219 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6228 const ForwardFunctor fwd;
6230 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6231 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6233 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6234 assign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
6235 else if( IsSymmetric_v<MT1> )
6236 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
6238 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Addition assignment of the scaled product to a dense matrix: lhs += scalar * (left * right).
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
//   lhs - target dense matrix
//   rhs - scaled matrix product expression (operands via rhs.matrix_, factor via rhs.scalar_)
// The visible condition detects an empty problem (target has no rows or columns, or the
// inner dimension left.columns() is zero).
// NOTE(review): the body of that check and the definitions of A and B are not visible
// in this excerpt; presumably the check returns early and A/B are the evaluated
// operands - confirm against the full file.
6254 template<
typename MT
6256 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6257 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6264 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6265 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6267 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6281 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the kernel for the scaled addition assignment C += scalar * A * B.
// The small kernel is used when any of the following holds:
//   - the left operand type MT4 is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the target has fewer than TDMATTDMATMULT_THRESHOLD elements (rows * columns).
// The BLAS-stage kernel is called in the remaining case.
// NOTE(review): the 'else' between the two calls is not visible in this excerpt.
6296 template<
typename MT3
6300 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6302 if( ( IsDiagonal_v<MT4> ) ||
6303 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
6304 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6305 selectSmallAddAssignKernel( C, A, B, scalar );
6307 selectBlasAddAssignKernel( C, A, B, scalar );
// Default kernel for the scaled addition assignment, general case.
// Enabled only when neither MT4 nor MT5 is diagonal.
// The result is added to C through an intermediate object 'tmp'.
// NOTE(review): the definition of 'tmp' is not visible in this excerpt; presumably it
// holds the evaluated scaled product scalar * A * B - confirm against the full file.
6325 template<
typename MT3
6329 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6330 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6333 addAssign( C, tmp );
// Default kernel for the scaled addition assignment when only the right operand is diagonal.
// Enabled when MT4 is not diagonal and MT5 is diagonal.
// Because only B(j,j) is used from column j of B, each target element reduces to
//   C(i,j) += A(i,j) * B(j,j) * scalar
// For every column j the row range [ibegin,iend) is narrowed by the triangular
// structure of A: a lower A starts at j (j+1 if strictly lower), an upper A ends at
// j+1 (j if strictly upper).
// NOTE(review): the fallback values of ibegin/iend for non-triangular A and the guard
// around the final single-row update are not visible in this excerpt.
6351 template<
typename MT3
6355 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6356 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6360 const size_t M( A.rows() );
6361 const size_t N( B.columns() );
6363 for(
size_t j=0UL; j<N; ++j )
6365 const size_t ibegin( ( IsLower_v<MT4> )
6366 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
6368 const size_t iend( ( IsUpper_v<MT4> )
6369 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
6373 const size_t inum( iend - ibegin );
6374 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Rows are processed in pairs up to ipos (inum with its lowest bit cleared).
6376 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6377 C(i ,j) += A(i ,j) * B(j,j) * scalar;
6378 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Leftover row when the number of rows in the range is odd.
6381 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default kernel for the scaled addition assignment when only the left operand is diagonal.
// Enabled when MT4 is diagonal and MT5 is not diagonal.
// Because only A(i,i) is used from row i of A, each target element reduces to
//   C(i,j) += A(i,i) * B(i,j) * scalar
// For every column j the row range [ibegin,iend) is narrowed by the triangular
// structure of B: a lower B starts at j (j+1 if strictly lower), an upper B ends at
// j+1 (j if strictly upper).
// NOTE(review): the fallback values of ibegin/iend for non-triangular B and the guard
// around the final single-row update are not visible in this excerpt.
6401 template<
typename MT3
6405 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6406 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6410 const size_t M( A.rows() );
6411 const size_t N( B.columns() );
6413 for(
size_t j=0UL; j<N; ++j )
6415 const size_t ibegin( ( IsLower_v<MT5> )
6416 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6418 const size_t iend( ( IsUpper_v<MT5> )
6419 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
6423 const size_t inum( iend - ibegin );
6424 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Rows are processed in pairs up to ipos (inum with its lowest bit cleared).
6426 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6427 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6428 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover row when the number of rows in the range is odd.
6431 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default kernel for the scaled addition assignment when both operands are diagonal.
// Enabled when MT4 and MT5 are both diagonal.
// Only the diagonal of C is updated: C(i,i) += A(i,i) * B(i,i) * scalar
// for every i in [0, A.rows()).
6451 template<
typename MT3
6455 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6456 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6460 for(
size_t i=0UL; i<A.rows(); ++i ) {
6461 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix kernel for the scaled addition assignment (fallback overload).
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and forwards
// all four arguments to selectDefaultAddAssignKernel().
6480 template<
typename MT3
6484 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6485 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6487 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix kernel for the scaled addition assignment to a row-major target.
// Enabled when MT3 is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// One operand is copied (serially) into a temporary of its OppositeType, the product is
// re-formed with that temporary, wrapped by the ForwardFunctor, scaled and handed back
// to addAssign(). Which operand is copied:
//   - MT4 resizable, MT5 not : B
//   - MT4 not resizable, MT5 resizable : A
//   - otherwise the operand with fewer elements (B on a tie)
// NOTE(review): OppositeType_t presumably yields the same matrix with the other
// storage order - confirm. The final 'else' is not visible in this excerpt.
6506 template<
typename MT3
6510 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6511 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6518 const ForwardFunctor fwd;
6520 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6521 const OppositeType_t<MT5> tmp(
serial( B ) );
6522 addAssign( C, fwd( A * tmp ) * scalar );
6524 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6525 const OppositeType_t<MT4> tmp(
serial( A ) );
6526 addAssign( C, fwd( tmp * B ) * scalar );
// Neither or both resizable: copy the smaller operand.
6528 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
6529 const OppositeType_t<MT5> tmp(
serial( B ) );
6530 addAssign( C, fwd( A * tmp ) * scalar );
6533 const OppositeType_t<MT4> tmp(
serial( A ) );
6534 addAssign( C, fwd( tmp * B ) * scalar );
// Small-matrix, vectorized kernel for the scaled addition assignment to a column-major
// target: C += scalar * A * B.
// Enabled when MT3 is column-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//   C      - target matrix, updated via C.load()/C.store() (and element access in the tail)
//   A, B   - left and right operand; A is read with SIMD loads along its rows index,
//            B is read element-wise and expanded with set()
//   scalar - scaling factor, also kept as SIMD value 'factor'
// Structure visible in this excerpt: rows of C are processed in tiles of 8, 5, 4, 3, 2
// and 1 SIMD vectors, each tile combined with 1 to 4 columns of B at a time, followed by
// a scalar loop for the rows that do not fill a SIMD vector. For every tile, kbegin/kend
// restrict the inner index k according to the lower/upper (strictly) triangular traits
// of MT4 and MT5.
// NOTE(review): the loop headers over i, several accumulation statements and part of
// the store statements are not visible in this excerpt - confirm details against the
// full file.
6554 template<
typename MT3
6558 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6559 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar tail over the rows is needed unless both C and A are padded.
6561 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
6563 const size_t M( A.rows() );
6564 const size_t N( B.columns() );
6565 const size_t K( A.columns() );
// ipos: M & size_t(-SIMDSIZE) when a tail is needed (M rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two), otherwise M.
6569 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
6572 const SIMDType factor(
set( scalar ) );
// Tile of 8 SIMD vectors x 1 column; the visible guard restricts it to integral
// element types (the guarded loop header is not shown in this excerpt).
6576 if( IsIntegral_v<ElementType> )
6579 for(
size_t j=0UL; j<N; ++j )
6581 const size_t kbegin( ( IsLower_v<MT5> )
6582 ?( ( IsUpper_v<MT4> )
6583 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6584 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6585 :( IsUpper_v<MT4> ? i : 0UL ) );
6586 const size_t kend( ( IsUpper_v<MT5> )
6587 ?( ( IsLower_v<MT4> )
6588 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
6589 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
6590 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
6592 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6594 for(
size_t k=kbegin; k<kend; ++k ) {
6595 const SIMDType b1(
set( B(k,j) ) );
6596 xmm1 += A.load(i ,k) * b1;
6597 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6598 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6599 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
6600 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
6601 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
6602 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
6603 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
6606 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 5 SIMD vectors x 2 columns.
6622 for( ; (j+2UL) <= N; j+=2UL )
6624 const size_t kbegin( ( IsLower_v<MT5> )
6625 ?( ( IsUpper_v<MT4> )
6626 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6627 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6628 :( IsUpper_v<MT4> ? i : 0UL ) );
6629 const size_t kend( ( IsUpper_v<MT5> )
6630 ?( ( IsLower_v<MT4> )
6631 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6632 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6633 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
6635 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6637 for(
size_t k=kbegin; k<kend; ++k ) {
6638 const SIMDType a1( A.load(i ,k) );
6639 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
6640 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
6641 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
6642 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
6643 const SIMDType b1(
set( B(k,j ) ) );
6644 const SIMDType b2(
set( B(k,j+1UL) ) );
6657 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6662 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
6664 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
6665 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
6666 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// Tile of 5 SIMD vectors x 1 column (presumably the last column when N is odd).
6671 const size_t kbegin( ( IsLower_v<MT5> )
6672 ?( ( IsUpper_v<MT4> )
6673 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6674 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6675 :( IsUpper_v<MT4> ? i : 0UL ) );
6676 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
6678 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6680 for(
size_t k=kbegin; k<kend; ++k ) {
6681 const SIMDType b1(
set( B(k,j) ) );
6682 xmm1 += A.load(i ,k) * b1;
6683 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6684 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6685 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
6686 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
6689 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 4 SIMD vectors x 2 columns.
6701 for( ; (j+2UL) <= N; j+=2UL )
6703 const size_t kbegin( ( IsLower_v<MT5> )
6704 ?( ( IsUpper_v<MT4> )
6705 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6706 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6707 :( IsUpper_v<MT4> ? i : 0UL ) );
6708 const size_t kend( ( IsUpper_v<MT5> )
6709 ?( ( IsLower_v<MT4> )
6710 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6711 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6712 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
6714 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6716 for(
size_t k=kbegin; k<kend; ++k ) {
6717 const SIMDType a1( A.load(i ,k) );
6718 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
6719 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
6720 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
6721 const SIMDType b1(
set( B(k,j ) ) );
6722 const SIMDType b2(
set( B(k,j+1UL) ) );
6733 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6737 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
6739 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
6740 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// Tile of 4 SIMD vectors x 1 column.
6745 const size_t kbegin( ( IsLower_v<MT5> )
6746 ?( ( IsUpper_v<MT4> )
6747 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6748 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6749 :( IsUpper_v<MT4> ? i : 0UL ) );
6750 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
6752 SIMDType xmm1, xmm2, xmm3, xmm4;
6754 for(
size_t k=kbegin; k<kend; ++k ) {
6755 const SIMDType b1(
set( B(k,j) ) );
6756 xmm1 += A.load(i ,k) * b1;
6757 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6758 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6759 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
6762 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 3 SIMD vectors x 2 columns.
6773 for( ; (j+2UL) <= N; j+=2UL )
6775 const size_t kbegin( ( IsLower_v<MT5> )
6776 ?( ( IsUpper_v<MT4> )
6777 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6778 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6779 :( IsUpper_v<MT4> ? i : 0UL ) );
6780 const size_t kend( ( IsUpper_v<MT5> )
6781 ?( ( IsLower_v<MT4> )
6782 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6783 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6784 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
6786 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6788 for(
size_t k=kbegin; k<kend; ++k ) {
6789 const SIMDType a1( A.load(i ,k) );
6790 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
6791 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
6792 const SIMDType b1(
set( B(k,j ) ) );
6793 const SIMDType b2(
set( B(k,j+1UL) ) );
6802 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6805 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
6807 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// Tile of 3 SIMD vectors x 1 column.
6812 const size_t kbegin( ( IsLower_v<MT5> )
6813 ?( ( IsUpper_v<MT4> )
6814 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6815 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6816 :( IsUpper_v<MT4> ? i : 0UL ) );
6817 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
6819 SIMDType xmm1, xmm2, xmm3;
6821 for(
size_t k=kbegin; k<kend; ++k ) {
6822 const SIMDType b1(
set( B(k,j) ) );
6823 xmm1 += A.load(i ,k) * b1;
6824 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6825 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6828 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// Tile of 2 SIMD vectors, combined with 4, 3, 2 and finally 1 column(s).
// From here on the column range is narrowed by the LOW/UPP flags of the expression:
// j starts at i for UPP results and stops at min(i+SIMDSIZE*2,N) for LOW results.
6836 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
6837 size_t j( UPP ? i : 0UL );
6839 for( ; (j+4UL) <= jend; j+=4UL )
6841 const size_t kbegin( ( IsLower_v<MT5> )
6842 ?( ( IsUpper_v<MT4> )
6843 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6844 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6845 :( IsUpper_v<MT4> ? i : 0UL ) );
6846 const size_t kend( ( IsUpper_v<MT5> )
6847 ?( ( IsLower_v<MT4> )
6848 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6849 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6850 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6852 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6854 for(
size_t k=kbegin; k<kend; ++k ) {
6855 const SIMDType a1( A.load(i ,k) );
6856 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
6857 const SIMDType b1(
set( B(k,j ) ) );
6858 const SIMDType b2(
set( B(k,j+1UL) ) );
6859 const SIMDType b3(
set( B(k,j+2UL) ) );
6860 const SIMDType b4(
set( B(k,j+3UL) ) );
6871 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6873 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
6875 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
6877 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
6881 for( ; (j+3UL) <= jend; j+=3UL )
6883 const size_t kbegin( ( IsLower_v<MT5> )
6884 ?( ( IsUpper_v<MT4> )
6885 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6886 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6887 :( IsUpper_v<MT4> ? i : 0UL ) );
6888 const size_t kend( ( IsUpper_v<MT5> )
6889 ?( ( IsLower_v<MT4> )
6890 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6891 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6892 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6894 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6896 for(
size_t k=kbegin; k<kend; ++k ) {
6897 const SIMDType a1( A.load(i ,k) );
6898 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
6899 const SIMDType b1(
set( B(k,j ) ) );
6900 const SIMDType b2(
set( B(k,j+1UL) ) );
6901 const SIMDType b3(
set( B(k,j+2UL) ) );
6910 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6912 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
6914 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
6918 for( ; (j+2UL) <= jend; j+=2UL )
6920 const size_t kbegin( ( IsLower_v<MT5> )
6921 ?( ( IsUpper_v<MT4> )
6922 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6923 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6924 :( IsUpper_v<MT4> ? i : 0UL ) );
6925 const size_t kend( ( IsUpper_v<MT5> )
6926 ?( ( IsLower_v<MT4> )
6927 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6928 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6929 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6931 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6934 for( ; (k+2UL) <= kend; k+=2UL ) {
6935 const SIMDType a1( A.load(i ,k ) );
6936 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
6937 const SIMDType a3( A.load(i ,k+1UL) );
6938 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
6939 const SIMDType b1(
set( B(k ,j ) ) );
6940 const SIMDType b2(
set( B(k ,j+1UL) ) );
6941 const SIMDType b3(
set( B(k+1UL,j ) ) );
6942 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
6953 for( ; k<kend; ++k ) {
6954 const SIMDType a1( A.load(i ,k) );
6955 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
6956 const SIMDType b1(
set( B(k,j ) ) );
6957 const SIMDType b2(
set( B(k,j+1UL) ) );
6964 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
6966 C.store( i , j+1UL, C.load(i ,j+1UL) + (xmm3+xmm7) * factor );
6967 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) + (xmm4+xmm8) * factor );
6972 const size_t kbegin( ( IsLower_v<MT5> )
6973 ?( ( IsUpper_v<MT4> )
6974 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6975 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6976 :( IsUpper_v<MT4> ? i : 0UL ) );
6977 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
6979 SIMDType xmm1, xmm2, xmm3, xmm4;
6982 for( ; (k+2UL) <= kend; k+=2UL ) {
6983 const SIMDType b1(
set( B(k ,j) ) );
6984 const SIMDType b2(
set( B(k+1UL,j) ) );
6985 xmm1 += A.load(i ,k ) * b1;
6986 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
6987 xmm3 += A.load(i ,k+1UL) * b2;
6988 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
6991 for( ; k<kend; ++k ) {
6992 const SIMDType b1(
set( B(k,j) ) );
6993 xmm1 += A.load(i ,k) * b1;
6997 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
// Tile of 1 SIMD vector, combined with 4, 3, 2 and finally 1 column(s); the inner
// k loop is unrolled by two with a single-step tail, and the two partial sums are
// added before scaling.
7004 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
7005 size_t j( UPP ? i : 0UL );
7007 for( ; (j+4UL) <= jend; j+=4UL )
7009 const size_t kbegin( ( IsLower_v<MT5> )
7010 ?( ( IsUpper_v<MT4> )
7011 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7012 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7013 :( IsUpper_v<MT4> ? i : 0UL ) );
7014 const size_t kend( ( IsUpper_v<MT5> )
7015 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
7018 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7021 for( ; (k+2UL) <= kend; k+=2UL ) {
7022 const SIMDType a1( A.load(i,k ) );
7023 const SIMDType a2( A.load(i,k+1UL) );
7024 xmm1 += a1 *
set( B(k ,j ) );
7025 xmm2 += a1 *
set( B(k ,j+1UL) );
7026 xmm3 += a1 *
set( B(k ,j+2UL) );
7027 xmm4 += a1 *
set( B(k ,j+3UL) );
7028 xmm5 += a2 *
set( B(k+1UL,j ) );
7029 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
7030 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
7031 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
7034 for( ; k<kend; ++k ) {
7035 const SIMDType a1( A.load(i,k) );
7036 xmm1 += a1 *
set( B(k,j ) );
7037 xmm2 += a1 *
set( B(k,j+1UL) );
7038 xmm3 += a1 *
set( B(k,j+2UL) );
7039 xmm4 += a1 *
set( B(k,j+3UL) );
7042 C.store( i, j , C.load(i,j ) + (xmm1+xmm5) * factor );
7043 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm6) * factor );
7044 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm7) * factor );
7045 C.store( i, j+3UL, C.load(i,j+3UL) + (xmm4+xmm8) * factor );
7048 for( ; (j+3UL) <= jend; j+=3UL )
7050 const size_t kbegin( ( IsLower_v<MT5> )
7051 ?( ( IsUpper_v<MT4> )
7052 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7053 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7054 :( IsUpper_v<MT4> ? i : 0UL ) );
7055 const size_t kend( ( IsUpper_v<MT5> )
7056 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
7059 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7062 for( ; (k+2UL) <= kend; k+=2UL ) {
7063 const SIMDType a1( A.load(i,k ) );
7064 const SIMDType a2( A.load(i,k+1UL) );
7065 xmm1 += a1 *
set( B(k ,j ) );
7066 xmm2 += a1 *
set( B(k ,j+1UL) );
7067 xmm3 += a1 *
set( B(k ,j+2UL) );
7068 xmm4 += a2 *
set( B(k+1UL,j ) );
7069 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
7070 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
7073 for( ; k<kend; ++k ) {
7074 const SIMDType a1( A.load(i,k) );
7075 xmm1 += a1 *
set( B(k,j ) );
7076 xmm2 += a1 *
set( B(k,j+1UL) );
7077 xmm3 += a1 *
set( B(k,j+2UL) );
7080 C.store( i, j , C.load(i,j ) + (xmm1+xmm4) * factor );
7081 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm5) * factor );
7082 C.store( i, j+2UL, C.load(i,j+2UL) + (xmm3+xmm6) * factor );
7085 for( ; (j+2UL) <= jend; j+=2UL )
7087 const size_t kbegin( ( IsLower_v<MT5> )
7088 ?( ( IsUpper_v<MT4> )
7089 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7090 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7091 :( IsUpper_v<MT4> ? i : 0UL ) );
7092 const size_t kend( ( IsUpper_v<MT5> )
7093 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7096 SIMDType xmm1, xmm2, xmm3, xmm4;
7099 for( ; (k+2UL) <= kend; k+=2UL ) {
7100 const SIMDType a1( A.load(i,k ) );
7101 const SIMDType a2( A.load(i,k+1UL) );
7102 xmm1 += a1 *
set( B(k ,j ) );
7103 xmm2 += a1 *
set( B(k ,j+1UL) );
7104 xmm3 += a2 *
set( B(k+1UL,j ) );
7105 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
7108 for( ; k<kend; ++k ) {
7109 const SIMDType a1( A.load(i,k) );
7110 xmm1 += a1 *
set( B(k,j ) );
7111 xmm2 += a1 *
set( B(k,j+1UL) );
7114 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
7115 C.store( i, j+1UL, C.load(i,j+1UL) + (xmm2+xmm4) * factor );
7120 const size_t kbegin( ( IsLower_v<MT5> )
7121 ?( ( IsUpper_v<MT4> )
7122 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7123 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7124 :( IsUpper_v<MT4> ? i : 0UL ) );
7126 SIMDType xmm1, xmm2;
7129 for( ; (k+2UL) <= K; k+=2UL ) {
7130 xmm1 += A.load(i,k ) *
set( B(k ,j) );
7131 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
7135 xmm1 += A.load(i,k) *
set( B(k,j) );
7138 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// Scalar tail: remaining rows (only when 'remainder' is set) are handled element-wise,
// two columns at a time and then a single column, using the plain scalar factor.
7142 for( ; remainder && i<M; ++i )
7144 const size_t jend( LOW ? i+1UL : N );
7145 size_t j( UPP ? i : 0UL );
7147 for( ; (j+2UL) <= jend; j+=2UL )
7149 const size_t kbegin( ( IsLower_v<MT5> )
7150 ?( ( IsUpper_v<MT4> )
7151 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7152 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7153 :( IsUpper_v<MT4> ? i : 0UL ) );
7154 const size_t kend( ( IsUpper_v<MT5> )
7155 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7161 for(
size_t k=kbegin; k<kend; ++k ) {
7162 value1 += A(i,k) * B(k,j );
7163 value2 += A(i,k) * B(k,j+1UL);
7166 C(i,j ) += value1 * scalar;
7167 C(i,j+1UL) += value2 * scalar;
7172 const size_t kbegin( ( IsLower_v<MT5> )
7173 ?( ( IsUpper_v<MT4> )
7174 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7175 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7176 :( IsUpper_v<MT4> ? i : 0UL ) );
7180 for(
size_t k=kbegin; k<K; ++k ) {
7181 value += A(i,k) * B(k,j);
7184 C(i,j) += value * scalar;
// Large-matrix kernel for the scaled addition assignment (fallback overload).
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and forwards
// all four arguments to selectDefaultAddAssignKernel().
7204 template<
typename MT3
7208 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7209 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7211 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix kernel for the scaled addition assignment (vectorized overload).
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t).
// Dispatches to lmmm, ummm or mmm; each receives 'scalar' and ST2(1) as trailing
// arguments, whereas the plain-assignment counterpart passes ST2(0).
// NOTE(review): the conditions that choose between the three calls are not visible in
// this excerpt; presumably they test the LOW/UPP flags - confirm against the full file.
7230 template<
typename MT3
7234 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7235 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7238 lmmm( C, A, B, scalar, ST2(1) );
7240 ummm( C, A, B, scalar, ST2(1) );
7242 mmm( C, A, B, scalar, ST2(1) );
// BLAS-stage kernel for the scaled addition assignment (fallback overload).
// Participates in overload resolution only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is
// false (DisableIf_t); the work is handed to selectLargeAddAssignKernel().
7261 template<
typename MT3
7265 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7266 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7268 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based kernel for the scaled addition assignment C += scalar * A * B.
// Only compiled when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are
// both non-zero, and only enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// The scalar is converted to the element type of C (ET) before being passed on.
//   - A triangular : copy B into a temporary, trmm() it with A from the left, add to C
//   - B triangular : copy A into a temporary, trmm() it with B from the right, add to C
//   - otherwise    : gemm( C, A, B, ET(scalar), ET(1) )
// The triangular paths go through a temporary of ResultType_t<MT3> because trmm() is
// handed that temporary as its first argument and the result is added to C afterwards.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7287 template<
typename MT3
7291 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7292 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7294 using ET = ElementType_t<MT3>;
7296 if( IsTriangular_v<MT4> ) {
7297 ResultType_t<MT3> tmp(
serial( B ) );
7298 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7299 addAssign( C, tmp );
7301 else if( IsTriangular_v<MT5> ) {
7302 ResultType_t<MT3> tmp(
serial( A ) );
7303 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7304 addAssign( C, tmp );
7307 gemm( C, A, B,
ET(scalar),
ET(1) );
// Restructuring addition assignment of the scaled product.
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> is true, i.e. (per the definition
// near the top of the file) the target is row-major and at least one operand is symmetric.
// Each symmetric operand is replaced by its transpose before the product is re-formed,
// wrapped by the ForwardFunctor, scaled by rhs.scalar_ and added to the target:
//   - both operands symmetric : trans(left) * trans(right)
//   - only MT1 symmetric      : trans(left) * right
//   - remaining case          : left * trans(right)
// NOTE(review): the function signature and the 'else' preceding the last call are not
// visible in this excerpt.
7328 template<
typename MT >
7330 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7339 const ForwardFunctor fwd;
7341 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7342 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7344 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7345 addAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
7346 else if( IsSymmetric_v<MT1> )
7347 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
7349 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Subtraction assignment of the scaled product to a dense matrix: lhs -= scalar * (left * right).
// Enabled only when CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
//   lhs - target dense matrix
//   rhs - scaled matrix product expression (operands via rhs.matrix_, factor via rhs.scalar_)
// The visible condition detects an empty problem (target has no rows or columns, or the
// inner dimension left.columns() is zero).
// NOTE(review): the body of that check and the definitions of A and B are not visible
// in this excerpt; presumably the check returns early and A/B are the evaluated
// operands - confirm against the full file.
7369 template<
typename MT
7371 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7372 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7379 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7380 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
7382 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7396 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the kernel for the scaled subtraction assignment C -= scalar * A * B.
// The small kernel is used when any of the following holds:
//   - the left operand type MT4 is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the target has fewer than TDMATTDMATMULT_THRESHOLD elements (rows * columns).
// The BLAS-stage kernel is called in the remaining case.
// NOTE(review): the 'else' between the two calls is not visible in this excerpt.
7411 template<
typename MT3
7415 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7417 if( ( IsDiagonal_v<MT4> ) ||
7418 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
7419 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
7420 selectSmallSubAssignKernel( C, A, B, scalar );
7422 selectBlasSubAssignKernel( C, A, B, scalar );
// Default kernel for the scaled subtraction assignment, general case.
// Enabled only when neither MT4 nor MT5 is diagonal.
// The result is subtracted from C through an intermediate object 'tmp'.
// NOTE(review): the definition of 'tmp' is not visible in this excerpt; presumably it
// holds the evaluated scaled product scalar * A * B - confirm against the full file.
7440 template<
typename MT3
7444 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7445 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7448 subAssign( C, tmp );
// Default kernel for the scaled subtraction assignment when only the right operand is diagonal.
// Enabled when MT4 is not diagonal and MT5 is diagonal.
// Because only B(j,j) is used from column j of B, each target element reduces to
//   C(i,j) -= A(i,j) * B(j,j) * scalar
// For every column j the row range [ibegin,iend) is narrowed by the triangular
// structure of A: a lower A starts at j (j+1 if strictly lower), an upper A ends at
// j+1 (j if strictly upper).
// NOTE(review): the fallback values of ibegin/iend for non-triangular A and the guard
// around the final single-row update are not visible in this excerpt.
7466 template<
typename MT3
7470 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7471 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7475 const size_t M( A.rows() );
7476 const size_t N( B.columns() );
7478 for(
size_t j=0UL; j<N; ++j )
7480 const size_t ibegin( ( IsLower_v<MT4> )
7481 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
7483 const size_t iend( ( IsUpper_v<MT4> )
7484 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
7488 const size_t inum( iend - ibegin );
7489 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Rows are processed in pairs up to ipos (inum with its lowest bit cleared).
7491 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7492 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
7493 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Leftover row when the number of rows in the range is odd.
7496 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default kernel for the scaled subtraction assignment when only the left operand is diagonal.
// Enabled when MT4 is diagonal and MT5 is not diagonal.
// Because only A(i,i) is used from row i of A, each target element reduces to
//   C(i,j) -= A(i,i) * B(i,j) * scalar
// For every column j the row range [ibegin,iend) is narrowed by the triangular
// structure of B: a lower B starts at j (j+1 if strictly lower), an upper B ends at
// j+1 (j if strictly upper).
// NOTE(review): the fallback values of ibegin/iend for non-triangular B and the guard
// around the final single-row update are not visible in this excerpt.
7516 template<
typename MT3
7520 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7521 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7525 const size_t M( A.rows() );
7526 const size_t N( B.columns() );
7528 for(
size_t j=0UL; j<N; ++j )
7530 const size_t ibegin( ( IsLower_v<MT5> )
7531 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7533 const size_t iend( ( IsUpper_v<MT5> )
7534 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
7538 const size_t inum( iend - ibegin );
7539 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Rows are processed in pairs up to ipos (inum with its lowest bit cleared).
7541 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7542 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7543 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover row when the number of rows in the range is odd.
7546 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default kernel for the scaled subtraction assignment when both operands are diagonal.
// Enabled when MT4 and MT5 are both diagonal.
// Only the diagonal of C is updated: C(i,i) -= A(i,i) * B(i,i) * scalar
// for every i in [0, A.rows()).
7566 template<
typename MT3
7570 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7571 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7575 for(
size_t i=0UL; i<A.rows(); ++i ) {
7576 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix kernel for the scaled subtraction assignment (fallback overload).
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and forwards
// all four arguments to selectDefaultSubAssignKernel().
7595 template<
typename MT3
7599 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7600 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7602 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small-matrix kernel for the scaled subtraction assignment to a row-major target.
// Enabled when MT3 is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// One operand is copied (serially) into a temporary of its OppositeType, the product is
// re-formed with that temporary, wrapped by the ForwardFunctor, scaled and handed back
// to subAssign(). Which operand is copied:
//   - MT4 resizable, MT5 not : B
//   - MT4 not resizable, MT5 resizable : A
//   - otherwise the operand with fewer elements (B on a tie)
// NOTE(review): OppositeType_t presumably yields the same matrix with the other
// storage order - confirm. The final 'else' is not visible in this excerpt.
7621 template<
typename MT3
7625 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7626 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7633 const ForwardFunctor fwd;
7635 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7636 const OppositeType_t<MT5> tmp(
serial( B ) );
7637 subAssign( C, fwd( A * tmp ) * scalar );
7639 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7640 const OppositeType_t<MT4> tmp(
serial( A ) );
7641 subAssign( C, fwd( tmp * B ) * scalar );
// Neither or both resizable: copy the smaller operand.
7643 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7644 const OppositeType_t<MT5> tmp(
serial( B ) );
7645 subAssign( C, fwd( A * tmp ) * scalar );
7648 const OppositeType_t<MT4> tmp(
serial( A ) );
7649 subAssign( C, fwd( tmp * B ) * scalar );
7669 template<
typename MT3
7673 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7674 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7676 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
7678 const size_t M( A.rows() );
7679 const size_t N( B.columns() );
7680 const size_t K( A.columns() );
7684 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
7687 const SIMDType factor(
set( scalar ) );
7691 if( IsIntegral_v<ElementType> )
7694 for(
size_t j=0UL; j<N; ++j )
7696 const size_t kbegin( ( IsLower_v<MT5> )
7697 ?( ( IsUpper_v<MT4> )
7698 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7699 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7700 :( IsUpper_v<MT4> ? i : 0UL ) );
7701 const size_t kend( ( IsUpper_v<MT5> )
7702 ?( ( IsLower_v<MT4> )
7703 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7704 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
7705 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
7707 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7709 for(
size_t k=kbegin; k<kend; ++k ) {
7710 const SIMDType b1(
set( B(k,j) ) );
7711 xmm1 += A.load(i ,k) * b1;
7712 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7713 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7714 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7715 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
7716 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
7717 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
7718 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
7721 C.store( i , j, C.load(i ,j) - xmm1 * factor );
7737 for( ; (j+2UL) <= N; j+=2UL )
7739 const size_t kbegin( ( IsLower_v<MT5> )
7740 ?( ( IsUpper_v<MT4> )
7741 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7742 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7743 :( IsUpper_v<MT4> ? i : 0UL ) );
7744 const size_t kend( ( IsUpper_v<MT5> )
7745 ?( ( IsLower_v<MT4> )
7746 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7747 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7748 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
7750 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7752 for(
size_t k=kbegin; k<kend; ++k ) {
7753 const SIMDType a1( A.load(i ,k) );
7754 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7755 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7756 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
7757 const SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
7758 const SIMDType b1(
set( B(k,j ) ) );
7759 const SIMDType b2(
set( B(k,j+1UL) ) );
7772 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7777 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
7779 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
7780 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
7781 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
7786 const size_t kbegin( ( IsLower_v<MT5> )
7787 ?( ( IsUpper_v<MT4> )
7788 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7789 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7790 :( IsUpper_v<MT4> ? i : 0UL ) );
7791 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
7793 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7795 for(
size_t k=kbegin; k<kend; ++k ) {
7796 const SIMDType b1(
set( B(k,j) ) );
7797 xmm1 += A.load(i ,k) * b1;
7798 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7799 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7800 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7801 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
7804 C.store( i , j, C.load(i ,j) - xmm1 * factor );
7816 for( ; (j+2UL) <= N; j+=2UL )
7818 const size_t kbegin( ( IsLower_v<MT5> )
7819 ?( ( IsUpper_v<MT4> )
7820 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7821 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7822 :( IsUpper_v<MT4> ? i : 0UL ) );
7823 const size_t kend( ( IsUpper_v<MT5> )
7824 ?( ( IsLower_v<MT4> )
7825 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7826 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7827 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
7829 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7831 for(
size_t k=kbegin; k<kend; ++k ) {
7832 const SIMDType a1( A.load(i ,k) );
7833 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7834 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7835 const SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
7836 const SIMDType b1(
set( B(k,j ) ) );
7837 const SIMDType b2(
set( B(k,j+1UL) ) );
7848 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7852 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
7854 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
7855 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
7860 const size_t kbegin( ( IsLower_v<MT5> )
7861 ?( ( IsUpper_v<MT4> )
7862 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7863 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7864 :( IsUpper_v<MT4> ? i : 0UL ) );
7865 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
7867 SIMDType xmm1, xmm2, xmm3, xmm4;
7869 for(
size_t k=kbegin; k<kend; ++k ) {
7870 const SIMDType b1(
set( B(k,j) ) );
7871 xmm1 += A.load(i ,k) * b1;
7872 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7873 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7874 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7877 C.store( i , j, C.load(i ,j) - xmm1 * factor );
7888 for( ; (j+2UL) <= N; j+=2UL )
7890 const size_t kbegin( ( IsLower_v<MT5> )
7891 ?( ( IsUpper_v<MT4> )
7892 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7893 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7894 :( IsUpper_v<MT4> ? i : 0UL ) );
7895 const size_t kend( ( IsUpper_v<MT5> )
7896 ?( ( IsLower_v<MT4> )
7897 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7898 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7899 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
7901 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7903 for(
size_t k=kbegin; k<kend; ++k ) {
7904 const SIMDType a1( A.load(i ,k) );
7905 const SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7906 const SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7907 const SIMDType b1(
set( B(k,j ) ) );
7908 const SIMDType b2(
set( B(k,j+1UL) ) );
7917 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7920 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
7922 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
7927 const size_t kbegin( ( IsLower_v<MT5> )
7928 ?( ( IsUpper_v<MT4> )
7929 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7930 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7931 :( IsUpper_v<MT4> ? i : 0UL ) );
7932 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
7934 SIMDType xmm1, xmm2, xmm3;
7936 for(
size_t k=kbegin; k<kend; ++k ) {
7937 const SIMDType b1(
set( B(k,j) ) );
7938 xmm1 += A.load(i ,k) * b1;
7939 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7940 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7943 C.store( i , j, C.load(i ,j) - xmm1 * factor );
7951 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
7952 size_t j( UPP ? i : 0UL );
7954 for( ; (j+4UL) <= jend; j+=4UL )
7956 const size_t kbegin( ( IsLower_v<MT5> )
7957 ?( ( IsUpper_v<MT4> )
7958 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7959 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7960 :( IsUpper_v<MT4> ? i : 0UL ) );
7961 const size_t kend( ( IsUpper_v<MT5> )
7962 ?( ( IsLower_v<MT4> )
7963 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
7964 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
7965 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
7967 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7969 for(
size_t k=kbegin; k<kend; ++k ) {
7970 const SIMDType a1( A.load(i ,k) );
7971 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
7972 const SIMDType b1(
set( B(k,j ) ) );
7973 const SIMDType b2(
set( B(k,j+1UL) ) );
7974 const SIMDType b3(
set( B(k,j+2UL) ) );
7975 const SIMDType b4(
set( B(k,j+3UL) ) );
7986 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7988 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
7990 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
7992 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
7996 for( ; (j+3UL) <= jend; j+=3UL )
7998 const size_t kbegin( ( IsLower_v<MT5> )
7999 ?( ( IsUpper_v<MT4> )
8000 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8001 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8002 :( IsUpper_v<MT4> ? i : 0UL ) );
8003 const size_t kend( ( IsUpper_v<MT5> )
8004 ?( ( IsLower_v<MT4> )
8005 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8006 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8007 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8009 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8011 for(
size_t k=kbegin; k<kend; ++k ) {
8012 const SIMDType a1( A.load(i ,k) );
8013 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8014 const SIMDType b1(
set( B(k,j ) ) );
8015 const SIMDType b2(
set( B(k,j+1UL) ) );
8016 const SIMDType b3(
set( B(k,j+2UL) ) );
8025 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8027 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8029 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8033 for( ; (j+2UL) <= jend; j+=2UL )
8035 const size_t kbegin( ( IsLower_v<MT5> )
8036 ?( ( IsUpper_v<MT4> )
8037 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8038 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8039 :( IsUpper_v<MT4> ? i : 0UL ) );
8040 const size_t kend( ( IsUpper_v<MT5> )
8041 ?( ( IsLower_v<MT4> )
8042 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8043 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8044 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8046 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8049 for( ; (k+2UL) <= kend; k+=2UL ) {
8050 const SIMDType a1( A.load(i ,k ) );
8051 const SIMDType a2( A.load(i+
SIMDSIZE,k ) );
8052 const SIMDType a3( A.load(i ,k+1UL) );
8053 const SIMDType a4( A.load(i+
SIMDSIZE,k+1UL) );
8054 const SIMDType b1(
set( B(k ,j ) ) );
8055 const SIMDType b2(
set( B(k ,j+1UL) ) );
8056 const SIMDType b3(
set( B(k+1UL,j ) ) );
8057 const SIMDType b4(
set( B(k+1UL,j+1UL) ) );
8068 for( ; k<kend; ++k ) {
8069 const SIMDType a1( A.load(i ,k) );
8070 const SIMDType a2( A.load(i+
SIMDSIZE,k) );
8071 const SIMDType b1(
set( B(k,j ) ) );
8072 const SIMDType b2(
set( B(k,j+1UL) ) );
8079 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8081 C.store( i , j+1UL, C.load(i ,j+1UL) - (xmm3+xmm7) * factor );
8082 C.store( i+
SIMDSIZE, j+1UL, C.load(i+
SIMDSIZE,j+1UL) - (xmm4+xmm8) * factor );
8087 const size_t kbegin( ( IsLower_v<MT5> )
8088 ?( ( IsUpper_v<MT4> )
8089 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8090 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8091 :( IsUpper_v<MT4> ? i : 0UL ) );
8092 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
8094 SIMDType xmm1, xmm2, xmm3, xmm4;
8097 for( ; (k+2UL) <= kend; k+=2UL ) {
8098 const SIMDType b1(
set( B(k ,j) ) );
8099 const SIMDType b2(
set( B(k+1UL,j) ) );
8100 xmm1 += A.load(i ,k ) * b1;
8101 xmm2 += A.load(i+
SIMDSIZE,k ) * b1;
8102 xmm3 += A.load(i ,k+1UL) * b2;
8103 xmm4 += A.load(i+
SIMDSIZE,k+1UL) * b2;
8106 for( ; k<kend; ++k ) {
8107 const SIMDType b1(
set( B(k,j) ) );
8108 xmm1 += A.load(i ,k) * b1;
8112 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8119 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
8120 size_t j( UPP ? i : 0UL );
8122 for( ; (j+4UL) <= jend; j+=4UL )
8124 const size_t kbegin( ( IsLower_v<MT5> )
8125 ?( ( IsUpper_v<MT4> )
8126 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8127 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8128 :( IsUpper_v<MT4> ? i : 0UL ) );
8129 const size_t kend( ( IsUpper_v<MT5> )
8130 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8133 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8136 for( ; (k+2UL) <= kend; k+=2UL ) {
8137 const SIMDType a1( A.load(i,k ) );
8138 const SIMDType a2( A.load(i,k+1UL) );
8139 xmm1 += a1 *
set( B(k ,j ) );
8140 xmm2 += a1 *
set( B(k ,j+1UL) );
8141 xmm3 += a1 *
set( B(k ,j+2UL) );
8142 xmm4 += a1 *
set( B(k ,j+3UL) );
8143 xmm5 += a2 *
set( B(k+1UL,j ) );
8144 xmm6 += a2 *
set( B(k+1UL,j+1UL) );
8145 xmm7 += a2 *
set( B(k+1UL,j+2UL) );
8146 xmm8 += a2 *
set( B(k+1UL,j+3UL) );
8149 for( ; k<kend; ++k ) {
8150 const SIMDType a1( A.load(i,k) );
8151 xmm1 += a1 *
set( B(k,j ) );
8152 xmm2 += a1 *
set( B(k,j+1UL) );
8153 xmm3 += a1 *
set( B(k,j+2UL) );
8154 xmm4 += a1 *
set( B(k,j+3UL) );
8157 C.store( i, j , C.load(i,j ) - (xmm1+xmm5) * factor );
8158 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm6) * factor );
8159 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm7) * factor );
8160 C.store( i, j+3UL, C.load(i,j+3UL) - (xmm4+xmm8) * factor );
8163 for( ; (j+3UL) <= jend; j+=3UL )
8165 const size_t kbegin( ( IsLower_v<MT5> )
8166 ?( ( IsUpper_v<MT4> )
8167 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8168 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8169 :( IsUpper_v<MT4> ? i : 0UL ) );
8170 const size_t kend( ( IsUpper_v<MT5> )
8171 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
8174 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8177 for( ; (k+2UL) <= kend; k+=2UL ) {
8178 const SIMDType a1( A.load(i,k ) );
8179 const SIMDType a2( A.load(i,k+1UL) );
8180 xmm1 += a1 *
set( B(k ,j ) );
8181 xmm2 += a1 *
set( B(k ,j+1UL) );
8182 xmm3 += a1 *
set( B(k ,j+2UL) );
8183 xmm4 += a2 *
set( B(k+1UL,j ) );
8184 xmm5 += a2 *
set( B(k+1UL,j+1UL) );
8185 xmm6 += a2 *
set( B(k+1UL,j+2UL) );
8188 for( ; k<kend; ++k ) {
8189 const SIMDType a1( A.load(i,k) );
8190 xmm1 += a1 *
set( B(k,j ) );
8191 xmm2 += a1 *
set( B(k,j+1UL) );
8192 xmm3 += a1 *
set( B(k,j+2UL) );
8195 C.store( i, j , C.load(i,j ) - (xmm1+xmm4) * factor );
8196 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm5) * factor );
8197 C.store( i, j+2UL, C.load(i,j+2UL) - (xmm3+xmm6) * factor );
8200 for( ; (j+2UL) <= jend; j+=2UL )
8202 const size_t kbegin( ( IsLower_v<MT5> )
8203 ?( ( IsUpper_v<MT4> )
8204 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8205 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8206 :( IsUpper_v<MT4> ? i : 0UL ) );
8207 const size_t kend( ( IsUpper_v<MT5> )
8208 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8211 SIMDType xmm1, xmm2, xmm3, xmm4;
8214 for( ; (k+2UL) <= kend; k+=2UL ) {
8215 const SIMDType a1( A.load(i,k ) );
8216 const SIMDType a2( A.load(i,k+1UL) );
8217 xmm1 += a1 *
set( B(k ,j ) );
8218 xmm2 += a1 *
set( B(k ,j+1UL) );
8219 xmm3 += a2 *
set( B(k+1UL,j ) );
8220 xmm4 += a2 *
set( B(k+1UL,j+1UL) );
8223 for( ; k<kend; ++k ) {
8224 const SIMDType a1( A.load(i,k) );
8225 xmm1 += a1 *
set( B(k,j ) );
8226 xmm2 += a1 *
set( B(k,j+1UL) );
8229 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
8230 C.store( i, j+1UL, C.load(i,j+1UL) - (xmm2+xmm4) * factor );
8235 const size_t kbegin( ( IsLower_v<MT5> )
8236 ?( ( IsUpper_v<MT4> )
8237 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8238 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8239 :( IsUpper_v<MT4> ? i : 0UL ) );
8241 SIMDType xmm1, xmm2;
8244 for( ; (k+2UL) <= K; k+=2UL ) {
8245 xmm1 += A.load(i,k ) *
set( B(k ,j) );
8246 xmm2 += A.load(i,k+1UL) *
set( B(k+1UL,j) );
8250 xmm1 += A.load(i,k) *
set( B(k,j) );
8253 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
8257 for( ; remainder && i<M; ++i )
8259 const size_t jend( LOW ? i+1UL : N );
8260 size_t j( UPP ? i : 0UL );
8262 for( ; (j+2UL) <= jend; j+=2UL )
8264 const size_t kbegin( ( IsLower_v<MT5> )
8265 ?( ( IsUpper_v<MT4> )
8266 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8267 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8268 :( IsUpper_v<MT4> ? i : 0UL ) );
8269 const size_t kend( ( IsUpper_v<MT5> )
8270 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8276 for(
size_t k=kbegin; k<kend; ++k ) {
8277 value1 += A(i,k) * B(k,j );
8278 value2 += A(i,k) * B(k,j+1UL);
8281 C(i,j ) -= value1 * scalar;
8282 C(i,j+1UL) -= value2 * scalar;
8287 const size_t kbegin( ( IsLower_v<MT5> )
8288 ?( ( IsUpper_v<MT4> )
8289 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8290 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8291 :( IsUpper_v<MT4> ? i : 0UL ) );
8295 for(
size_t k=kbegin; k<K; ++k ) {
8296 value += A(i,k) * B(k,j);
8299 C(i,j) -= value * scalar;
8319 template<
typename MT3
8323 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8324 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8326 selectDefaultSubAssignKernel( C, A, B, scalar );
8345 template<
typename MT3
8349 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8350 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8353 lmmm( C, A, B, -scalar, ST2(1) );
8355 ummm( C, A, B, -scalar, ST2(1) );
8357 mmm( C, A, B, -scalar, ST2(1) );
8376 template<
typename MT3
8380 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8381 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8383 selectLargeSubAssignKernel( C, A, B, scalar );
8388 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8402 template<
typename MT3
8406 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8407 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8409 using ET = ElementType_t<MT3>;
8411 if( IsTriangular_v<MT4> ) {
8412 ResultType_t<MT3> tmp(
serial( B ) );
8413 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8414 subAssign( C, tmp );
8416 else if( IsTriangular_v<MT5> ) {
8417 ResultType_t<MT3> tmp(
serial( A ) );
8418 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8419 subAssign( C, tmp );
8422 gemm( C, A, B,
ET(-scalar),
ET(1) );
8442 template<
typename MT >
8444 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8453 const ForwardFunctor fwd;
8455 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8456 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8458 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8459 subAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
8460 else if( IsSymmetric_v<MT1> )
8461 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
8463 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
8483 template<
typename MT
8485 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8497 schurAssign( ~lhs, tmp );
8528 template<
typename MT
8531 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8538 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8539 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8541 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8544 else if( left.columns() == 0UL ) {
8578 template<
typename MT
8581 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8585 using TmpType = If_t< SO, ResultType, OppositeType >;
8597 const ForwardFunctor fwd;
8599 const TmpType tmp( rhs );
8618 template<
typename MT >
8620 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8629 const ForwardFunctor fwd;
8631 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8632 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8634 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8636 else if( IsSymmetric_v<MT1> )
8658 template<
typename MT
8661 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8668 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8669 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8671 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8704 template<
typename MT >
8706 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8715 const ForwardFunctor fwd;
8717 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8718 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8720 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8722 else if( IsSymmetric_v<MT1> )
8748 template<
typename MT
8751 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8758 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8759 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8761 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8794 template<
typename MT >
8796 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8805 const ForwardFunctor fwd;
8807 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8808 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8810 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8812 else if( IsSymmetric_v<MT1> )
8835 template<
typename MT
8915 template<
typename MT1
8917 inline decltype(
auto)
8963 template<
typename MT1
8969 inline decltype(
auto)
declsym( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8977 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8978 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9007 template<
typename MT1
9013 inline decltype(
auto)
declherm( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9021 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9022 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9051 template<
typename MT1
9057 inline decltype(
auto)
decllow( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9065 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9066 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9095 template<
typename MT1
9101 inline decltype(
auto)
declupp( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9109 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9110 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9139 template<
typename MT1
9145 inline decltype(
auto)
decldiag( const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9153 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9154 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9170 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9171 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9172 :
public Size<MT1,0UL>
9175 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9176 struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9177 :
public Size<MT2,1UL>
9193 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9194 struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9195 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Headerfile for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
Header file for basic type definitions.
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:496
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template.The If_t alias declaration provides a convenien...
Definition: If.h:109
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:484
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatTDMatMultExpr.h:308
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:300
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:288
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
Header file for the IsIntegral type trait.
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:330
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Expression object for transpose dense matrix-transpose dense matrix multiplications.The TDMatTDMatMultExpr class represents the compile time expression for multiplications between two column-major dense matrices.
Definition: Forward.h:163
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:430
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.The ElementType_t alias declaration provide...
Definition: Aliases.h:170
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:286
Header file for the IsComplexDouble type trait.
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:178
Constraint on the data type.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatTDMatMultExpr.h:321
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:171
Headerfile for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:440
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:464
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:497
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:294
Header file for the IsSIMDCombinable type trait.
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:420
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:303
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:284
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:177
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatTDMatMultExpr.h:315
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:160
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:175
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:166
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:394
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:159
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:287
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:176
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:765
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:410
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:452
Header file for BLAS general matrix/matrix multiplication functions (gemm)
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:289
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:345
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
typename DisableIf< Condition, T >::Type DisableIf_t
Auxiliary type for the DisableIf class template. The DisableIf_t alias declaration provides a convenie...
Definition: DisableIf.h:138
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:474
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:297
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
Header file for the IsResizable type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:291
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_INTERNAL_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:161
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:290
Header file for the IsExpression type trait class.
Header file for the function trace functionality.