35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 146 template<
typename MT1
153 :
public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Compile-time configuration flags of the multiplication expression.
// evaluateLeft / evaluateRight: set when the respective operand type (MT1 / MT2)
// is a computation or reports that it requires evaluation.
168 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
173 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Result-structure flags derived from the SF/HF/LF/UF template arguments
// (NOTE(review): presumably symmetric/Hermitian/lower/upper flags - confirm).
// SYM holds only if SF is set and none of HF, LF, UF is set.
177 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM holds only if HF is set and neither LF nor UF is set.
178 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW holds if LF is set, or if UF is combined with SF or HF.
179 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP holds if UF is set, or if LF is combined with SF or HF.
180 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Helper constant: true when T1 is a column-major matrix type and at least one
// of T2 and T3 is a symmetric matrix type. Used further down as the SFINAE
// condition that selects between the regular and the restructuring
// assignment overloads.
190 template<
typename T1,
typename T2,
typename T3 >
191 static constexpr
bool CanExploitSymmetry_v =
192 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
// Helper constant parameterised on three types.
// NOTE(review): the initialiser expression is not visible in this excerpt, so
// the exact condition cannot be documented here - confirm against the full file.
202 template<
typename T1,
typename T2,
typename T3 >
203 static constexpr
bool IsEvaluationRequired_v =
// Helper constant deciding whether a BLAS kernel may be used for the type
// triple (T1 = target, T2/T3 = operands, judging by the data-access checks).
// Visible requirements: all three types are contiguous; T1 offers mutable and
// T2/T3 const data access; neither operand is diagonal; all three types are
// SIMD-enabled; all three element types are BLAS compatible.
// NOTE(review): the first and the last terms of the conjunction are not
// visible in this excerpt.
213 template<
typename T1,
typename T2,
typename T3 >
214 static constexpr
bool UseBlasKernel_v =
217 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
218 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
219 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
220 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
221 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
222 IsBLASCompatible_v< ElementType_t<T1> > &&
223 IsBLASCompatible_v< ElementType_t<T2> > &&
224 IsBLASCompatible_v< ElementType_t<T3> > &&
// Helper constant deciding whether the vectorized default kernels may be used.
// Visible requirements: useOptimizedKernels is set, neither operand (T2, T3)
// is diagonal, all three types are SIMD-enabled, plus an IsSIMDCombinable_v
// check on the element types.
// NOTE(review): the remainder of the expression is cut off in this excerpt.
235 template<
typename T1,
typename T2,
typename T3 >
236 static constexpr
bool UseVectorizedDefaultKernel_v =
237 ( useOptimizedKernels &&
238 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
239 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
240 IsSIMDCombinable_v< ElementType_t<T1>
// Tail of a boolean constant expression whose declaration line is not visible
// in this excerpt (NOTE(review): presumably the expression's own simdEnabled
// flag - confirm). It requires a non-diagonal right operand, SIMD-enabled
// operand types, and SIMD addition and multiplication support for ET1/ET2.
311 ( !IsDiagonal_v<MT2> &&
312 MT1::simdEnabled && MT2::simdEnabled &&
313 HasSIMDAdd_v<ET1,ET2> &&
314 HasSIMDMult_v<ET1,ET2> );
// Fragment of an element computation for the indices (i,j); the function
// header and the branch bodies are not visible in this excerpt
// (NOTE(review): presumably the 2D access operator - confirm).
// Dispatch order: diagonal left operand, diagonal right operand, then any
// triangular operand.
351 if( IsDiagonal_v<MT1> ) {
354 else if( IsDiagonal_v<MT2> ) {
357 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
// begin: first inner index, derived from i for an upper left operand and from
// j for a lower right operand (the larger of both when both apply); the
// strict variants start one position later.
// NOTE(review): the final fallback alternative is not visible here.
358 const size_t begin( ( IsUpper_v<MT1> )
359 ?( ( IsLower_v<MT2> )
360 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
361 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
362 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
363 :( ( IsLower_v<MT2> )
364 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
// end: one-past-last inner index, derived from i for a lower left operand and
// from j for an upper right operand (the smaller of both when both apply);
// without either restriction it is lhs_.columns().
366 const size_t end( ( IsLower_v<MT1> )
367 ?( ( IsUpper_v<MT2> )
368 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
369 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
370 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
371 :( ( IsUpper_v<MT2> )
372 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
373 :(
lhs_.columns() ) ) );
// Index range checks: i is compared against the row count of the left operand
// and j against the column count of the right operand.
// NOTE(review): the enclosing function header and the bodies of both branches
// are not visible in this excerpt (presumably a checked element access that
// reports invalid indices - confirm).
397 if( i >=
lhs_.rows() ) {
400 if( j >=
rhs_.columns() ) {
// Size accessors of the expression.
// NOTE(review): the body of rows() and the header of the function that owns
// the return statement below are not visible in this excerpt; the visible
// return yields the column count of the right-hand side operand.
412 inline size_t rows() const noexcept {
423 return rhs_.columns();
// Returns whether the expression can alias with the given address.
// alias: address to be checked.
// Result: true if either the left or the right operand reports that it can
// alias with the given address.
453 template<
typename T >
454 inline bool canAlias(
const T* alias )
const noexcept {
455 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns whether the expression is aliased with the given address.
// alias: address to be checked.
// Result: true if either the left or the right operand reports being aliased
// with the given address.
465 template<
typename T >
466 inline bool isAliased(
const T* alias )
const noexcept {
467 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Alignment query (function header not visible in this excerpt): the result
// is true only when both operands report being aligned.
477 return lhs_.isAligned() &&
rhs_.isAligned();
// Fragment of a boolean return expression; the function header and parts of
// the first disjunction are not visible in this excerpt (NOTE(review):
// presumably the SMP-assignment predicate - confirm).
// Visible terms: the BLAS matrix/matrix multiplication switch being off or the
// element count rows()*columns() lying below DMATDMATMULT_THRESHOLD, combined
// with the element count reaching SMP_DMATDMATMULT_THRESHOLD and neither
// operand being diagonal.
488 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
490 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
491 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
492 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
// Fragment of the assignment of the multiplication expression to a target
// matrix (the function signature, the branch bodies and the creation of the
// operands A and B are not visible in this excerpt).
515 template<
typename MT
// Special case 1: the target has no rows or no columns.
525 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case 2: the inner dimension (columns of the left operand) is zero.
528 else if( rhs.lhs_.columns() == 0UL ) {
// Regular case: hand the target and both operands to the kernel selection.
543 DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel selection for the assignment C = A * B.
// C: target matrix, A: left operand, B: right operand.
// The small kernel is chosen when B is diagonal, when (outside debug mode)
// B has at most SIMDSIZE*10 columns, or when the number of elements of C lies
// below DMATDMATMULT_THRESHOLD. The BLAS kernel selection is the alternative
// (NOTE(review): the line between the two calls, presumably `else`, is not
// visible in this excerpt).
559 template<
typename MT3
562 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
564 if( ( IsDiagonal_v<MT5> ) ||
565 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
566 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
567 selectSmallAssignKernel( C, A, B );
569 selectBlasAssignKernel( C, A, B );
// Default (non-vectorized) assignment kernel C = A * B for the case that
// neither operand is diagonal.
// C: target matrix, A: left operand, B: right operand.
// NOTE(review): several lines (closing braces, fallback alternatives of the
// conditional expressions, some loop bodies) are not visible in this excerpt.
588 template<
typename MT3
591 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
592 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x N is the size of the product, K the inner dimension.
594 const size_t M( A.rows() );
595 const size_t N( B.columns() );
596 const size_t K( A.columns() );
600 for(
size_t i=0UL; i<M; ++i )
// Inner index range for row i, narrowed by an upper/lower structure of A.
602 const size_t kbegin( ( IsUpper_v<MT4> )
603 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
605 const size_t kend( ( IsLower_v<MT4> )
606 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty inner range: the whole row is visited
// (loop body not visible here).
610 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
611 for(
size_t j=0UL; j<N; ++j ) {
// Column range for the first inner index (kbegin), narrowed by the structure
// of B and, when UPP/LOW is set, by the row index i.
618 const size_t jbegin( ( IsUpper_v<MT5> )
619 ?( ( IsStrictlyUpper_v<MT5> )
620 ?(
UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
621 :(
UPP ?
max(i,kbegin) : kbegin ) )
622 :(
UPP ? i : 0UL ) );
623 const size_t jend( ( IsLower_v<MT5> )
624 ?( ( IsStrictlyLower_v<MT5> )
625 ?(
LOW ?
min(i+1UL,kbegin) : kbegin )
626 :(
LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
627 :(
LOW ? i+1UL : N ) );
// Columns in front of jbegin are visited separately (bodies not visible).
629 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
630 for(
size_t j=0UL; j<jbegin; ++j ) {
634 else if( IsStrictlyUpper_v<MT5> ) {
// The first product term is assigned rather than accumulated.
637 for(
size_t j=jbegin; j<jend; ++j ) {
638 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards are visited separately (bodies not visible).
640 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
641 for(
size_t j=jend; j<N; ++j ) {
645 else if( IsStrictlyLower_v<MT5> ) {
// Remaining inner indices: accumulate into the row.
650 for(
size_t k=kbegin+1UL; k<kend; ++k )
652 const size_t jbegin( ( IsUpper_v<MT5> )
653 ?( ( IsStrictlyUpper_v<MT5> )
657 const size_t jend( ( IsLower_v<MT5> )
658 ?( ( IsStrictlyLower_v<MT5> )
659 ?(
LOW ?
min(i+1UL,k-1UL) : k-1UL )
660 :(
LOW ?
min(i+1UL,k) : k ) )
661 :(
LOW ? i+1UL : N ) );
// With any structure flag set, an inverted column range is skipped.
663 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
666 for(
size_t j=jbegin; j<jend; ++j ) {
667 C(i,j) += A(i,k) * B(k,j);
// For a lower B the element in column jend is assigned, not accumulated.
669 if( IsLower_v<MT5> ) {
670 C(i,jend) = A(i,k) * B(k,jend);
// Mirror step: every element below the diagonal is copied from its transposed
// position, conjugated when HERM is set (guarding condition not visible).
676 for(
size_t i=1UL; i<M; ++i ) {
677 for(
size_t j=0UL; j<i; ++j ) {
678 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel C = A * B for a non-diagonal left operand and a
// diagonal right operand.
// C: target matrix, A: left operand, B: diagonal right operand.
// NOTE(review): the fallback alternatives of jbegin/jend and the bodies of
// the two outer-range loops are not visible in this excerpt.
700 template<
typename MT3
703 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
704 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
708 const size_t M( A.rows() );
709 const size_t N( B.columns() );
711 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by an upper/lower structure of A.
713 const size_t jbegin( ( IsUpper_v<MT4> )
714 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
716 const size_t jend( ( IsLower_v<MT4> )
717 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Upper A: the columns in front of jbegin are handled separately.
721 if( IsUpper_v<MT4> ) {
722 for(
size_t j=0UL; j<jbegin; ++j ) {
// Each element is scaled by the matching diagonal element of B.
726 for(
size_t j=jbegin; j<jend; ++j ) {
727 C(i,j) = A(i,j) * B(j,j);
// Lower A: the columns from jend onwards are handled separately.
729 if( IsLower_v<MT4> ) {
730 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel C = A * B for a diagonal left operand and a
// non-diagonal right operand.
// C: target matrix, A: diagonal left operand, B: right operand.
// NOTE(review): the fallback alternatives of jbegin/jend and the bodies of
// the two outer-range loops are not visible in this excerpt.
753 template<
typename MT3
756 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
757 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
761 const size_t M( A.rows() );
762 const size_t N( B.columns() );
764 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by an upper/lower structure of B.
766 const size_t jbegin( ( IsUpper_v<MT5> )
767 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
769 const size_t jend( ( IsLower_v<MT5> )
770 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Upper B: the columns in front of jbegin are handled separately.
774 if( IsUpper_v<MT5> ) {
775 for(
size_t j=0UL; j<jbegin; ++j ) {
// Each element of row i of B is scaled by the i-th diagonal element of A.
779 for(
size_t j=jbegin; j<jend; ++j ) {
780 C(i,j) = A(i,i) * B(i,j);
// Lower B: the columns from jend onwards are handled separately.
782 if( IsLower_v<MT5> ) {
783 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel C = A * B for two diagonal operands.
// C: target matrix, A/B: diagonal operands.
// Only the diagonal elements of C are written in the visible loop
// (NOTE(review): any preceding preparation of C is not visible here).
806 template<
typename MT3
809 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
810 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
816 for(
size_t i=0UL; i<A.rows(); ++i ) {
817 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel, non-vectorized fallback.
// Enabled when UseVectorizedDefaultKernel_v does not hold for the three types;
// it simply forwards to the default assignment kernel.
// C: target matrix, A: left operand, B: right operand.
836 template<
typename MT3
839 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
840 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
842 selectDefaultAssignKernel( C, A, B );
// Vectorized small-matrix assignment kernel C = A * B for row-major targets.
// Enabled when the target type is row-major and UseVectorizedDefaultKernel_v
// holds for the three types.
// C: target matrix, A: left operand, B: right operand.
// NOTE(review): the enclosing loops over the column index j, the iend
// definitions, several accumulator updates/stores and all closing braces are
// not visible in this excerpt. The visible row loops work on progressively
// narrower groups of SIMD-width column chunks; in every stage kbegin/kend
// narrow the inner index range according to the upper/lower structure of A
// and B.
862 template<
typename MT3
865 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
866 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// remainder: set unless both the target type and the type of B are padded.
868 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
870 const size_t M( A.rows() );
871 const size_t N( B.columns() );
872 const size_t K( A.columns() );
// jpos: N masked with -SIMDSIZE when a remainder is possible, otherwise N.
876 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Stage with eight SIMD chunks per row (column offsets j .. j+SIMDSIZE*7),
// guarded by the integral element type check.
886 if( IsIntegral_v<ElementType> )
889 for(
size_t i=0UL; i<M; ++i )
891 const size_t kbegin( ( IsUpper_v<MT4> )
892 ?( ( IsLower_v<MT5> )
893 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
894 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
895 :( IsLower_v<MT5> ? j : 0UL ) );
896 const size_t kend( ( IsLower_v<MT4> )
897 ?( ( IsUpper_v<MT5> )
898 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
899 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
900 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
902 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
904 for(
size_t k=kbegin; k<kend; ++k ) {
906 xmm1 += a1 * B.load(k,j );
908 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
909 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
910 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
911 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
912 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
913 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
916 C.store( i, j , xmm1 );
// Stage with five SIMD chunks: two rows at a time.
932 for( ; (i+2UL) <= M; i+=2UL )
934 const size_t kbegin( ( IsUpper_v<MT4> )
935 ?( ( IsLower_v<MT5> )
936 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
937 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
938 :( IsLower_v<MT5> ? j : 0UL ) );
939 const size_t kend( ( IsLower_v<MT4> )
940 ?( ( IsUpper_v<MT5> )
941 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
942 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
943 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
945 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
947 for(
size_t k=kbegin; k<kend; ++k ) {
948 const SIMDType a1(
set( A(i ,k) ) );
949 const SIMDType a2(
set( A(i+1UL,k) ) );
967 C.store( i , j , xmm1 );
969 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
970 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
971 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
972 C.store( i+1UL, j , xmm6 );
973 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
974 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
975 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
976 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Five SIMD chunks, single row (NOTE(review): presumably the row left over by
// the two-row loop - its guard is not visible).
981 const size_t kbegin( ( IsUpper_v<MT4> )
982 ?( ( IsLower_v<MT5> )
983 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
984 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
985 :( IsLower_v<MT5> ? j : 0UL ) );
986 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
988 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
990 for(
size_t k=kbegin; k<kend; ++k ) {
992 xmm1 += a1 * B.load(k,j );
994 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
995 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
996 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
999 C.store( i, j , xmm1 );
1001 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1002 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1003 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Stage with four SIMD chunks: the row index starts at j when LOW is set,
// otherwise at 0; two rows at a time.
1010 size_t i(
LOW ? j : 0UL );
1012 for( ; (i+2UL) <= iend; i+=2UL )
1014 const size_t kbegin( ( IsUpper_v<MT4> )
1015 ?( ( IsLower_v<MT5> )
1016 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1017 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1018 :( IsLower_v<MT5> ? j : 0UL ) );
1019 const size_t kend( ( IsLower_v<MT4> )
1020 ?( ( IsUpper_v<MT5> )
1021 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
1022 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1023 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
1025 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1027 for(
size_t k=kbegin; k<kend; ++k ) {
1028 const SIMDType a1(
set( A(i ,k) ) );
1029 const SIMDType a2(
set( A(i+1UL,k) ) );
1044 C.store( i , j , xmm1 );
1046 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1047 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1048 C.store( i+1UL, j , xmm5 );
1049 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
1050 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
1051 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Four SIMD chunks, single row.
1056 const size_t kbegin( ( IsUpper_v<MT4> )
1057 ?( ( IsLower_v<MT5> )
1058 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1059 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1060 :( IsLower_v<MT5> ? j : 0UL ) );
1061 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
1065 for(
size_t k=kbegin; k<kend; ++k ) {
1066 const SIMDType a1(
set( A(i,k) ) );
1067 xmm1 += a1 * B.load(k,j );
1068 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1069 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1070 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1073 C.store( i, j , xmm1 );
1075 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1076 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Stage with three SIMD chunks: two rows at a time.
1083 size_t i(
LOW ? j : 0UL );
1085 for( ; (i+2UL) <= iend; i+=2UL )
1087 const size_t kbegin( ( IsUpper_v<MT4> )
1088 ?( ( IsLower_v<MT5> )
1089 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1090 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1091 :( IsLower_v<MT5> ? j : 0UL ) );
1092 const size_t kend( ( IsLower_v<MT4> )
1093 ?( ( IsUpper_v<MT5> )
1094 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
1095 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1096 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
1098 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1100 for(
size_t k=kbegin; k<kend; ++k ) {
1101 const SIMDType a1(
set( A(i ,k) ) );
1102 const SIMDType a2(
set( A(i+1UL,k) ) );
1114 C.store( i , j , xmm1 );
1116 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1117 C.store( i+1UL, j , xmm4 );
1118 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
1119 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Three SIMD chunks, single row.
1124 const size_t kbegin( ( IsUpper_v<MT4> )
1125 ?( ( IsLower_v<MT5> )
1126 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1127 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1128 :( IsLower_v<MT5> ? j : 0UL ) );
1129 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
1133 for(
size_t k=kbegin; k<kend; ++k ) {
1134 const SIMDType a1(
set( A(i,k) ) );
1135 xmm1 += a1 * B.load(k,j );
1136 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1137 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1140 C.store( i, j , xmm1 );
1142 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Stage with two SIMD chunks: four rows at a time first.
1149 size_t i(
LOW ? j : 0UL );
1151 for( ; (i+4UL) <= iend; i+=4UL )
1153 const size_t kbegin( ( IsUpper_v<MT4> )
1154 ?( ( IsLower_v<MT5> )
1155 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1156 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1157 :( IsLower_v<MT5> ? j : 0UL ) );
1158 const size_t kend( ( IsLower_v<MT4> )
1159 ?( ( IsUpper_v<MT5> )
1160 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
1161 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1162 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1164 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1166 for(
size_t k=kbegin; k<kend; ++k ) {
1167 const SIMDType a1(
set( A(i ,k) ) );
1168 const SIMDType a2(
set( A(i+1UL,k) ) );
1169 const SIMDType a3(
set( A(i+2UL,k) ) );
1170 const SIMDType a4(
set( A(i+3UL,k) ) );
1183 C.store( i , j , xmm1 );
1185 C.store( i+1UL, j , xmm3 );
1186 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1187 C.store( i+2UL, j , xmm5 );
1188 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1189 C.store( i+3UL, j , xmm7 );
1190 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Two SIMD chunks, three rows at a time.
1193 for( ; (i+3UL) <= iend; i+=3UL )
1195 const size_t kbegin( ( IsUpper_v<MT4> )
1196 ?( ( IsLower_v<MT5> )
1197 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1198 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1199 :( IsLower_v<MT5> ? j : 0UL ) );
1200 const size_t kend( ( IsLower_v<MT4> )
1201 ?( ( IsUpper_v<MT5> )
1202 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
1203 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1204 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1206 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1208 for(
size_t k=kbegin; k<kend; ++k ) {
1209 const SIMDType a1(
set( A(i ,k) ) );
1210 const SIMDType a2(
set( A(i+1UL,k) ) );
1211 const SIMDType a3(
set( A(i+2UL,k) ) );
1222 C.store( i , j , xmm1 );
1224 C.store( i+1UL, j , xmm3 );
1225 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1226 C.store( i+2UL, j , xmm5 );
1227 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two SIMD chunks, two rows at a time; the inner loop is unrolled by two and
// the two sets of partial sums are added at the store.
1230 for( ; (i+2UL) <= iend; i+=2UL )
1232 const size_t kbegin( ( IsUpper_v<MT4> )
1233 ?( ( IsLower_v<MT5> )
1234 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1235 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1236 :( IsLower_v<MT5> ? j : 0UL ) );
1237 const size_t kend( ( IsLower_v<MT4> )
1238 ?( ( IsUpper_v<MT5> )
1239 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
1240 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1241 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1243 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1246 for( ; (k+2UL) <= kend; k+=2UL ) {
1247 const SIMDType a1(
set( A(i ,k ) ) );
1248 const SIMDType a2(
set( A(i+1UL,k ) ) );
1249 const SIMDType a3(
set( A(i ,k+1UL) ) );
1250 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
1251 const SIMDType b1( B.load(k ,j ) );
1253 const SIMDType b3( B.load(k+1UL,j ) );
// Leftover inner index after the unrolled loop.
1265 for( ; k<kend; ++k ) {
1266 const SIMDType a1(
set( A(i ,k) ) );
1267 const SIMDType a2(
set( A(i+1UL,k) ) );
1276 C.store( i , j , xmm1+xmm5 );
1277 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
1278 C.store( i+1UL, j , xmm3+xmm7 );
1279 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Two SIMD chunks, single row; inner loop unrolled by two.
1284 const size_t kbegin( ( IsUpper_v<MT4> )
1285 ?( ( IsLower_v<MT5> )
1286 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1287 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1288 :( IsLower_v<MT5> ? j : 0UL ) );
1289 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
1294 for( ; (k+2UL) <= kend; k+=2UL ) {
1295 const SIMDType a1(
set( A(i,k ) ) );
1296 const SIMDType a2(
set( A(i,k+1UL) ) );
1297 xmm1 += a1 * B.load(k ,j );
1298 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
1299 xmm3 += a2 * B.load(k+1UL,j );
1300 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
1303 for( ; k<kend; ++k ) {
1304 const SIMDType a1(
set( A(i,k) ) );
1305 xmm1 += a1 * B.load(k,j );
1309 C.store( i, j , xmm1+xmm3 );
1310 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Stage with a single SIMD chunk: four rows at a time first; the inner loop
// is unrolled by two and partial sums are combined at the store.
1317 size_t i(
LOW ? j : 0UL );
1319 for( ; (i+4UL) <= iend; i+=4UL )
1321 const size_t kbegin( ( IsUpper_v<MT4> )
1322 ?( ( IsLower_v<MT5> )
1323 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1324 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1325 :( IsLower_v<MT5> ? j : 0UL ) );
1326 const size_t kend( ( IsLower_v<MT4> )
1327 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1330 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1333 for( ; (k+2UL) <= kend; k+=2UL ) {
1335 const SIMDType b2( B.load(k+1UL,j) );
1336 xmm1 +=
set( A(i ,k ) ) * b1;
1337 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1338 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1339 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1340 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1341 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1342 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1343 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1346 for( ; k<kend; ++k ) {
1348 xmm1 +=
set( A(i ,k) ) * b1;
1349 xmm2 +=
set( A(i+1UL,k) ) * b1;
1350 xmm3 +=
set( A(i+2UL,k) ) * b1;
1351 xmm4 +=
set( A(i+3UL,k) ) * b1;
1354 C.store( i , j, xmm1+xmm5 );
1355 C.store( i+1UL, j, xmm2+xmm6 );
1356 C.store( i+2UL, j, xmm3+xmm7 );
1357 C.store( i+3UL, j, xmm4+xmm8 );
// Single SIMD chunk, three rows at a time.
1360 for( ; (i+3UL) <= iend; i+=3UL )
1362 const size_t kbegin( ( IsUpper_v<MT4> )
1363 ?( ( IsLower_v<MT5> )
1364 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1365 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1366 :( IsLower_v<MT5> ? j : 0UL ) );
1367 const size_t kend( ( IsLower_v<MT4> )
1368 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1371 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1374 for( ; (k+2UL) <= kend; k+=2UL ) {
1376 const SIMDType b2( B.load(k+1UL,j) );
1377 xmm1 +=
set( A(i ,k ) ) * b1;
1378 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1379 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1380 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1381 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1382 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1385 for( ; k<kend; ++k ) {
1387 xmm1 +=
set( A(i ,k) ) * b1;
1388 xmm2 +=
set( A(i+1UL,k) ) * b1;
1389 xmm3 +=
set( A(i+2UL,k) ) * b1;
1392 C.store( i , j, xmm1+xmm4 );
1393 C.store( i+1UL, j, xmm2+xmm5 );
1394 C.store( i+2UL, j, xmm3+xmm6 );
// Single SIMD chunk, two rows at a time.
1397 for( ; (i+2UL) <= iend; i+=2UL )
1399 const size_t kbegin( ( IsUpper_v<MT4> )
1400 ?( ( IsLower_v<MT5> )
1401 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1402 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1403 :( IsLower_v<MT5> ? j : 0UL ) );
1404 const size_t kend( ( IsLower_v<MT4> )
1405 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1411 for( ; (k+2UL) <= kend; k+=2UL ) {
1413 const SIMDType b2( B.load(k+1UL,j) );
1414 xmm1 +=
set( A(i ,k ) ) * b1;
1415 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1416 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1417 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1420 for( ; k<kend; ++k ) {
1422 xmm1 +=
set( A(i ,k) ) * b1;
1423 xmm2 +=
set( A(i+1UL,k) ) * b1;
1426 C.store( i , j, xmm1+xmm3 );
1427 C.store( i+1UL, j, xmm2+xmm4 );
// Single SIMD chunk, single row; here the inner loop runs up to K.
1432 const size_t kbegin( ( IsUpper_v<MT4> )
1433 ?( ( IsLower_v<MT5> )
1434 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1435 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1436 :( IsLower_v<MT5> ? j : 0UL ) );
1441 for( ; (k+2UL) <= K; k+=2UL ) {
1442 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1443 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1447 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1450 C.store( i, j, xmm1+xmm2 );
// Scalar clean-up of the remaining columns; runs only when remainder is set.
1454 for( ; remainder && j<N; ++j )
// The row index starts at j only when both LOW and UPP are set.
1456 size_t i(
LOW &&
UPP ? j : 0UL );
1458 for( ; (i+2UL) <= M; i+=2UL )
1460 const size_t kbegin( ( IsUpper_v<MT4> )
1461 ?( ( IsLower_v<MT5> )
1462 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1463 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1464 :( IsLower_v<MT5> ? j : 0UL ) );
1465 const size_t kend( ( IsLower_v<MT4> )
1466 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1472 for(
size_t k=kbegin; k<kend; ++k ) {
1473 value1 += A(i ,k) * B(k,j);
1474 value2 += A(i+1UL,k) * B(k,j);
1478 C(i+1UL,j) = value2;
1483 const size_t kbegin( ( IsUpper_v<MT4> )
1484 ?( ( IsLower_v<MT5> )
1485 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1486 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1487 :( IsLower_v<MT5> ? j : 0UL ) );
1491 for(
size_t k=kbegin; k<K; ++k ) {
1492 value += A(i,k) * B(k,j);
// Post-processing loops starting at index SIMDSIZE*4 (their guarding
// conditions, the jend/iend definitions and most bodies are not visible in
// this excerpt). The visible assignment copies C(j,i), conjugated when HERM
// is set, into C(i,j).
1501 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
1503 for(
size_t j=0UL; j<jend; ++j ) {
1504 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1509 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
1511 for(
size_t i=0UL; i<iend; ++i ) {
1517 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
1519 for(
size_t j=0UL; j<jend; ++j ) {
// Vectorized small-matrix assignment kernel for column-major targets.
// Instead of multiplying directly, one operand is first copied into a
// temporary of its opposite type (OppositeType_t) and the product with that
// temporary is assigned through the forward functor.
// Choice of the operand to convert:
//  - A, if A's type is not resizable but B's is;
//  - B, if A's type is resizable but B's is not;
//  - otherwise A when it has at most as many elements as B, else B
//    (NOTE(review): the line introducing the final branch is not visible).
// C: target matrix, A: left operand, B: right operand.
1543 template<
typename MT3
1546 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1547 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1554 const ForwardFunctor fwd;
1556 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
1557 const OppositeType_t<MT4> tmp(
serial( A ) );
1558 assign( C, fwd( tmp * B ) );
1560 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
1561 const OppositeType_t<MT5> tmp(
serial( B ) );
1562 assign( C, fwd( A * tmp ) );
1564 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1565 const OppositeType_t<MT4> tmp(
serial( A ) );
1566 assign( C, fwd( tmp * B ) );
1569 const OppositeType_t<MT5> tmp(
serial( B ) );
1570 assign( C, fwd( A * tmp ) );
// Large-matrix assignment kernel, non-vectorized fallback.
// Enabled when UseVectorizedDefaultKernel_v does not hold for the three types;
// it simply forwards to the default assignment kernel.
// C: target matrix, A: left operand, B: right operand.
1589 template<
typename MT3
1592 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1593 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1595 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel, vectorized variant.
// Enabled when UseVectorizedDefaultKernel_v holds for the three types.
// NOTE(review): the function body is not visible in this excerpt.
// C: target matrix, A: left operand, B: right operand.
1614 template<
typename MT3
1617 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1618 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback variant.
// Enabled when UseBlasKernel_v does not hold for the three types; it forwards
// to the large-matrix kernel selection.
// C: target matrix, A: left operand, B: right operand.
1647 template<
typename MT3
1650 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1651 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1653 selectLargeAssignKernel( C, A, B );
// BLAS assignment kernel, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and selected when
// UseBlasKernel_v holds for the three types.
// Dispatch: a triangular A uses trmm from the left, a triangular B uses trmm
// from the right (lower/upper chosen from the operand type), anything else
// uses gemm with the scalar factors ET(1) and ET(0).
// NOTE(review): the statements preceding each trmm call (presumably the
// initialisation of C with the other operand) are not visible in this excerpt.
// C: target matrix, A: left operand, B: right operand.
1659 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1672 template<
typename MT3
1675 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1676 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1678 using ET = ElementType_t<MT3>;
1680 if( IsTriangular_v<MT4> ) {
1682 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1684 else if( IsTriangular_v<MT5> ) {
1686 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1689 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse target matrix.
// Disabled when CanExploitSymmetry_v holds for the target and operand types.
// The expression is first evaluated serially into a temporary (OppositeType
// when SO is set, ResultType otherwise), which is then assigned to the target
// through the forward functor.
// lhs: sparse target matrix, rhs: multiplication expression to assign.
1709 template<
typename MT
1711 friend inline auto assign( SparseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
1712 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1716 using TmpType = If_t< SO, OppositeType, ResultType >;
1728 const ForwardFunctor fwd;
1730 const TmpType tmp(
serial( rhs ) );
1731 assign( ~lhs, fwd( tmp ) );
// Restructuring assignment to a column-major target, enabled when
// CanExploitSymmetry_v holds for the target and operand types.
// The symmetric operand(s) are replaced by their transpose before the product
// is assigned through the forward functor:
//  - both operands symmetric: both are transposed;
//  - only the left operand symmetric: the left operand is transposed;
//  - remaining case: the right operand is transposed
//    (NOTE(review): the `else` line of the last branch is not visible).
// lhs: target matrix, rhs: multiplication expression to assign.
1751 template<
typename MT >
1752 friend inline auto assign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
1753 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1762 const ForwardFunctor fwd;
1764 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1765 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
1766 else if( IsSymmetric_v<MT1> )
1767 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
1769 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Addition assignment of the multiplication expression to a dense target.
// Disabled when CanExploitSymmetry_v holds for the target and operand types.
// lhs: dense target matrix, rhs: multiplication expression to add.
// NOTE(review): the body of the early-exit branch and the creation of the
// operands A and B are not visible in this excerpt.
1787 template<
typename MT
1789 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
1790 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Special case: the target has no rows or no columns, or the inner dimension
// is zero.
1797 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Regular case: hand the target and both operands to the kernel selection.
1811 DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel selection for the addition assignment C += A * B.
// C: target matrix, A: left operand, B: right operand.
// The small kernel is chosen when B is diagonal, when (outside debug mode)
// B has at most SIMDSIZE*10 columns, or when the number of elements of C lies
// below DMATDMATMULT_THRESHOLD. The BLAS kernel selection is the alternative
// (NOTE(review): the line between the two calls, presumably `else`, is not
// visible in this excerpt).
1827 template<
typename MT3
1830 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1832 if( ( IsDiagonal_v<MT5> ) ||
1833 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
1834 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1835 selectSmallAddAssignKernel( C, A, B );
1837 selectBlasAddAssignKernel( C, A, B );
// Default (non-vectorized) addition assignment kernel C += A * B for the case
// that neither operand is diagonal.
// C: target matrix, A: left operand, B: right operand.
// NOTE(review): the fallback alternatives of kbegin/kend, the closing braces
// and the guard of the trailing single-column update are not visible in this
// excerpt.
1856 template<
typename MT3
1859 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1860 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x N is the size of the product, K the inner dimension.
1862 const size_t M( A.rows() );
1863 const size_t N( B.columns() );
1864 const size_t K( A.columns() );
1868 for(
size_t i=0UL; i<M; ++i )
// Inner index range for row i, narrowed by an upper/lower structure of A.
1870 const size_t kbegin( ( IsUpper_v<MT4> )
1871 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1873 const size_t kend( ( IsLower_v<MT4> )
1874 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
1878 for(
size_t k=kbegin; k<kend; ++k )
// Column range for inner index k, narrowed by the structure of B and, when
// UPP/LOW is set, by the row index i.
1880 const size_t jbegin( ( IsUpper_v<MT5> )
1881 ?( ( IsStrictlyUpper_v<MT5> )
1882 ?(
UPP ?
max(i,k+1UL) : k+1UL )
1883 :(
UPP ?
max(i,k) : k ) )
1884 :(
UPP ? i : 0UL ) );
1885 const size_t jend( ( IsLower_v<MT5> )
1886 ?( ( IsStrictlyLower_v<MT5> )
1887 ?(
LOW ?
min(i+1UL,k) : k )
1888 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
1889 :(
LOW ? i+1UL : N ) );
// With LOW or UPP set, an empty column range is skipped.
1891 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos marks the end of the part of the range that is processed two columns
// at a time (the column count with its lowest bit cleared).
1894 const size_t jnum( jend - jbegin );
1895 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1897 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1898 C(i,j ) += A(i,k) * B(k,j );
1899 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Update of the column at jpos (NOTE(review): presumably guarded so that it
// only runs for an odd column count - the guard is not visible).
1902 C(i,jpos) += A(i,k) * B(k,jpos);
// Default addition assignment kernel C += A * B for a non-diagonal left
// operand and a diagonal right operand.
// C: target matrix, A: left operand, B: diagonal right operand.
// NOTE(review): the fallback alternatives of jbegin/jend and the guard of the
// trailing single-column update are not visible in this excerpt.
1924 template<
typename MT3
1927 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1928 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1932 const size_t M( A.rows() );
1933 const size_t N( B.columns() );
1935 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by an upper/lower structure of A.
1937 const size_t jbegin( ( IsUpper_v<MT4> )
1938 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1940 const size_t jend( ( IsLower_v<MT4> )
1941 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part processed two columns at a time.
1945 const size_t jnum( jend - jbegin );
1946 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Each element of A is scaled by the matching diagonal element of B.
1948 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1949 C(i,j ) += A(i,j ) * B(j ,j );
1950 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
1953 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default addition assignment kernel C += A * B for a diagonal left operand
// and a non-diagonal right operand.
// C: target matrix, A: diagonal left operand, B: right operand.
// NOTE(review): the fallback alternatives of jbegin/jend and the guard of the
// trailing single-column update are not visible in this excerpt.
1974 template<
typename MT3
1977 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1978 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1982 const size_t M( A.rows() );
1983 const size_t N( B.columns() );
1985 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by an upper/lower structure of B.
1987 const size_t jbegin( ( IsUpper_v<MT5> )
1988 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
1990 const size_t jend( ( IsLower_v<MT5> )
1991 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part processed two columns at a time.
1995 const size_t jnum( jend - jbegin );
1996 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Each element of row i of B is scaled by the i-th diagonal element of A.
1998 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1999 C(i,j ) += A(i,i) * B(i,j );
2000 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
2003 C(i,jpos) += A(i,i) * B(i,jpos);
// Default addition assignment kernel C += A * B for two diagonal operands:
// only the diagonal elements of C are updated, each with the product of the
// corresponding diagonal elements of A and B.
// C: target matrix, A/B: diagonal operands.
2024 template<
typename MT3
2027 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2028 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2032 for(
size_t i=0UL; i<A.rows(); ++i ) {
2033 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel, non-vectorized fallback.
// Enabled when UseVectorizedDefaultKernel_v does not hold for the three types;
// it simply forwards to the default addition assignment kernel.
// C: target matrix, A: left operand, B: right operand.
2053 template<
typename MT3
2056 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2057 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2059 selectDefaultAddAssignKernel( C, A, B );
// Vectorized selectSmallAddAssignKernel overload for a row-major target MT3,
// enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// The columns of C are handled in tiles whose width shrinks from
// SIMDSIZE*8 (integral element types only) over SIMDSIZE*5, *4, *3 and *2 down
// to a single SIMD vector, as can be read off the kend bounds and the store
// offsets below; a scalar loop takes the columns from jpos up to N.
// Within a tile the accumulators are updated with set( A(i,k) ) * B.load(k,j+...)
// and written back with C.store.
2079 template<
typename MT3
2082 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2083 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed as soon as C or B is not padded.
2085 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
2087 const size_t M( A.rows() );
2088 const size_t N( B.columns() );
2089 const size_t K( A.columns() );
// With a remainder, jpos is N with the low bits masked off by size_t(-SIMDSIZE)
// (a multiple of SIMDSIZE for power-of-two SIMDSIZE); otherwise jpos equals N.
2093 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Tile of SIMDSIZE*8 columns, only taken for integral element types.
2098 if( IsIntegral_v<ElementType> )
2101 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend narrow the k range of the inner product using the triangular
// structure of A (relative to row i) and of B (relative to column j); the
// same selection pattern recurs in every tile below.
2103 const size_t kbegin( ( IsUpper_v<MT4> )
2104 ?( ( IsLower_v<MT5> )
2105 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2106 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2107 :( IsLower_v<MT5> ? j : 0UL ) );
2108 const size_t kend( ( IsLower_v<MT4> )
2109 ?( ( IsUpper_v<MT5> )
2110 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
2111 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2112 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
2123 for(
size_t k=kbegin; k<kend; ++k ) {
2124 const SIMDType a1(
set( A(i,k) ) );
2125 xmm1 += a1 * B.load(k,j );
2126 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2127 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2128 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2129 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
2130 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
2131 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
2132 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
2135 C.store( i, j , xmm1 );
2137 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2138 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
2139 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
2140 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
2141 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
2142 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Tile of SIMDSIZE*5 columns, two rows (i and i+1) per iteration.
2151 for( ; (i+2UL) <= M; i+=2UL )
2153 const size_t kbegin( ( IsUpper_v<MT4> )
2154 ?( ( IsLower_v<MT5> )
2155 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2156 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2157 :( IsLower_v<MT5> ? j : 0UL ) );
2158 const size_t kend( ( IsLower_v<MT4> )
2159 ?( ( IsUpper_v<MT5> )
2160 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
2161 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2162 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// The accumulator starts from the current content of C (addition assignment).
2169 SIMDType xmm6 ( C.load(i+1UL,j ) );
2175 for(
size_t k=kbegin; k<kend; ++k ) {
2176 const SIMDType a1(
set( A(i ,k) ) );
2177 const SIMDType a2(
set( A(i+1UL,k) ) );
2195 C.store( i , j , xmm1 );
2197 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2198 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
2199 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
2200 C.store( i+1UL, j , xmm6 );
2201 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
2202 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
2203 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
2204 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Single-row variant of the SIMDSIZE*5 tile (reads and writes row i only).
2209 const size_t kbegin( ( IsUpper_v<MT4> )
2210 ?( ( IsLower_v<MT5> )
2211 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2212 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2213 :( IsLower_v<MT5> ? j : 0UL ) );
2214 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
2222 for(
size_t k=kbegin; k<kend; ++k ) {
2223 const SIMDType a1(
set( A(i,k) ) );
2224 xmm1 += a1 * B.load(k,j );
2225 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2226 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2227 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2228 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
2231 C.store( i, j , xmm1 );
2233 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2234 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
2235 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Tile of SIMDSIZE*4 columns, two rows per iteration.
2243 for( ; (i+2UL) <= M; i+=2UL )
2245 const size_t kbegin( ( IsUpper_v<MT4> )
2246 ?( ( IsLower_v<MT5> )
2247 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2248 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2249 :( IsLower_v<MT5> ? j : 0UL ) );
2250 const size_t kend( ( IsLower_v<MT4> )
2251 ?( ( IsUpper_v<MT5> )
2252 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
2253 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2254 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
2265 for(
size_t k=kbegin; k<kend; ++k ) {
2266 const SIMDType a1(
set( A(i ,k) ) );
2267 const SIMDType a2(
set( A(i+1UL,k) ) );
2282 C.store( i , j , xmm1 );
2284 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2285 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
2286 C.store( i+1UL, j , xmm5 );
2287 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
2288 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
2289 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single-row variant of the SIMDSIZE*4 tile.
2294 const size_t kbegin( ( IsUpper_v<MT4> )
2295 ?( ( IsLower_v<MT5> )
2296 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2297 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2298 :( IsLower_v<MT5> ? j : 0UL ) );
2299 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
2306 for(
size_t k=kbegin; k<kend; ++k ) {
2307 const SIMDType a1(
set( A(i,k) ) );
2308 xmm1 += a1 * B.load(k,j );
2309 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2310 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2311 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2314 C.store( i, j , xmm1 );
2316 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2317 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Tile of SIMDSIZE*3 columns, two rows per iteration.
2325 for( ; (i+2UL) <= M; i+=2UL )
2327 const size_t kbegin( ( IsUpper_v<MT4> )
2328 ?( ( IsLower_v<MT5> )
2329 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2330 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2331 :( IsLower_v<MT5> ? j : 0UL ) );
2332 const size_t kend( ( IsLower_v<MT4> )
2333 ?( ( IsUpper_v<MT5> )
2334 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
2335 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2336 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
2345 for(
size_t k=kbegin; k<kend; ++k ) {
2346 const SIMDType a1(
set( A(i ,k) ) );
2347 const SIMDType a2(
set( A(i+1UL,k) ) );
2359 C.store( i , j , xmm1 );
2361 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2362 C.store( i+1UL, j , xmm4 );
2363 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
2364 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single-row variant of the SIMDSIZE*3 tile.
2369 const size_t kbegin( ( IsUpper_v<MT4> )
2370 ?( ( IsLower_v<MT5> )
2371 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2372 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2373 :( IsLower_v<MT5> ? j : 0UL ) );
2374 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
2380 for(
size_t k=kbegin; k<kend; ++k ) {
2381 const SIMDType a1(
set( A(i,k) ) );
2382 xmm1 += a1 * B.load(k,j );
2383 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2384 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2387 C.store( i, j , xmm1 );
2389 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Tile of SIMDSIZE*2 columns. From here on the row range honours the
// class-level LOW flag: the first row is j when LOW is set, 0 otherwise.
2396 size_t i(
LOW ? j : 0UL );
// Four rows per iteration.
2398 for( ; (i+4UL) <= iend; i+=4UL )
2400 const size_t kbegin( ( IsUpper_v<MT4> )
2401 ?( ( IsLower_v<MT5> )
2402 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2403 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2404 :( IsLower_v<MT5> ? j : 0UL ) );
2405 const size_t kend( ( IsLower_v<MT4> )
2406 ?( ( IsUpper_v<MT5> )
2407 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
2408 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
2409 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2420 for(
size_t k=kbegin; k<kend; ++k ) {
2421 const SIMDType a1(
set( A(i ,k) ) );
2422 const SIMDType a2(
set( A(i+1UL,k) ) );
2423 const SIMDType a3(
set( A(i+2UL,k) ) );
2424 const SIMDType a4(
set( A(i+3UL,k) ) );
2437 C.store( i , j , xmm1 );
2439 C.store( i+1UL, j , xmm3 );
2440 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
2441 C.store( i+2UL, j , xmm5 );
2442 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
2443 C.store( i+3UL, j , xmm7 );
2444 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Three rows per iteration.
2447 for( ; (i+3UL) <= iend; i+=3UL )
2449 const size_t kbegin( ( IsUpper_v<MT4> )
2450 ?( ( IsLower_v<MT5> )
2451 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2452 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2453 :( IsLower_v<MT5> ? j : 0UL ) );
2454 const size_t kend( ( IsLower_v<MT4> )
2455 ?( ( IsUpper_v<MT5> )
2456 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
2457 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
2458 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2467 for(
size_t k=kbegin; k<kend; ++k ) {
2468 const SIMDType a1(
set( A(i ,k) ) );
2469 const SIMDType a2(
set( A(i+1UL,k) ) );
2470 const SIMDType a3(
set( A(i+2UL,k) ) );
2481 C.store( i , j , xmm1 );
2483 C.store( i+1UL, j , xmm3 );
2484 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
2485 C.store( i+2UL, j , xmm5 );
2486 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two rows per iteration; the k loop is additionally unrolled by two, with a
// second accumulator set (xmm5..xmm8) that is summed in at store time.
2489 for( ; (i+2UL) <= iend; i+=2UL )
2491 const size_t kbegin( ( IsUpper_v<MT4> )
2492 ?( ( IsLower_v<MT5> )
2493 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2494 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2495 :( IsLower_v<MT5> ? j : 0UL ) );
2496 const size_t kend( ( IsLower_v<MT4> )
2497 ?( ( IsUpper_v<MT5> )
2498 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
2499 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2500 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2509 for( ; (k+2UL) <= kend; k+=2UL ) {
2510 const SIMDType a1(
set( A(i ,k ) ) );
2511 const SIMDType a2(
set( A(i+1UL,k ) ) );
2512 const SIMDType a3(
set( A(i ,k+1UL) ) );
2513 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
2514 const SIMDType b1( B.load(k ,j ) );
2516 const SIMDType b3( B.load(k+1UL,j ) );
// Leftover k iteration(s) after the 2x-unrolled k loop.
2528 for( ; k<kend; ++k ) {
2529 const SIMDType a1(
set( A(i ,k) ) );
2530 const SIMDType a2(
set( A(i+1UL,k) ) );
2539 C.store( i , j , xmm1+xmm5 );
2540 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
2541 C.store( i+1UL, j , xmm3+xmm7 );
2542 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Single-row variant of the SIMDSIZE*2 tile, k unrolled by two.
2547 const size_t kbegin( ( IsUpper_v<MT4> )
2548 ?( ( IsLower_v<MT5> )
2549 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2550 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2551 :( IsLower_v<MT5> ? j : 0UL ) );
2552 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
2559 for( ; (k+2UL) <= kend; k+=2UL ) {
2560 const SIMDType a1(
set( A(i,k ) ) );
2561 const SIMDType a2(
set( A(i,k+1UL) ) );
2562 xmm1 += a1 * B.load(k ,j );
2563 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
2564 xmm3 += a2 * B.load(k+1UL,j );
2565 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
2568 for( ; k<kend; ++k ) {
2569 const SIMDType a1(
set( A(i,k) ) );
2570 xmm1 += a1 * B.load(k,j );
2574 C.store( i, j , xmm1+xmm3 );
2575 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Tile of one SIMD vector of columns; rows again start at j when LOW is set.
2582 size_t i(
LOW ? j : 0UL );
// Four rows per iteration, k unrolled by two.
2584 for( ; (i+4UL) <= iend; i+=4UL )
2586 const size_t kbegin( ( IsUpper_v<MT4> )
2587 ?( ( IsLower_v<MT5> )
2588 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2589 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2590 :( IsLower_v<MT5> ? j : 0UL ) );
2591 const size_t kend( ( IsLower_v<MT4> )
2592 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
2602 for( ; (k+2UL) <= kend; k+=2UL ) {
2604 const SIMDType b2( B.load(k+1UL,j) );
2605 xmm1 +=
set( A(i ,k ) ) * b1;
2606 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2607 xmm3 +=
set( A(i+2UL,k ) ) * b1;
2608 xmm4 +=
set( A(i+3UL,k ) ) * b1;
2609 xmm5 +=
set( A(i ,k+1UL) ) * b2;
2610 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
2611 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
2612 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
2615 for( ; k<kend; ++k ) {
2617 xmm1 +=
set( A(i ,k) ) * b1;
2618 xmm2 +=
set( A(i+1UL,k) ) * b1;
2619 xmm3 +=
set( A(i+2UL,k) ) * b1;
2620 xmm4 +=
set( A(i+3UL,k) ) * b1;
2623 C.store( i , j, xmm1+xmm5 );
2624 C.store( i+1UL, j, xmm2+xmm6 );
2625 C.store( i+2UL, j, xmm3+xmm7 );
2626 C.store( i+3UL, j, xmm4+xmm8 );
// Three rows per iteration, k unrolled by two.
2629 for( ; (i+3UL) <= iend; i+=3UL )
2631 const size_t kbegin( ( IsUpper_v<MT4> )
2632 ?( ( IsLower_v<MT5> )
2633 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2634 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2635 :( IsLower_v<MT5> ? j : 0UL ) );
2636 const size_t kend( ( IsLower_v<MT4> )
2637 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
2646 for( ; (k+2UL) <= kend; k+=2UL ) {
2648 const SIMDType b2( B.load(k+1UL,j) );
2649 xmm1 +=
set( A(i ,k ) ) * b1;
2650 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2651 xmm3 +=
set( A(i+2UL,k ) ) * b1;
2652 xmm4 +=
set( A(i ,k+1UL) ) * b2;
2653 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
2654 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
2657 for( ; k<kend; ++k ) {
2659 xmm1 +=
set( A(i ,k) ) * b1;
2660 xmm2 +=
set( A(i+1UL,k) ) * b1;
2661 xmm3 +=
set( A(i+2UL,k) ) * b1;
2664 C.store( i , j, xmm1+xmm4 );
2665 C.store( i+1UL, j, xmm2+xmm5 );
2666 C.store( i+2UL, j, xmm3+xmm6 );
// Two rows per iteration, k unrolled by two.
2669 for( ; (i+2UL) <= iend; i+=2UL )
2671 const size_t kbegin( ( IsUpper_v<MT4> )
2672 ?( ( IsLower_v<MT5> )
2673 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2674 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2675 :( IsLower_v<MT5> ? j : 0UL ) );
2676 const size_t kend( ( IsLower_v<MT4> )
2677 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2685 for( ; (k+2UL) <= kend; k+=2UL ) {
2687 const SIMDType b2( B.load(k+1UL,j) );
2688 xmm1 +=
set( A(i ,k ) ) * b1;
2689 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2690 xmm3 +=
set( A(i ,k+1UL) ) * b2;
2691 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
2694 for( ; k<kend; ++k ) {
2696 xmm1 +=
set( A(i ,k) ) * b1;
2697 xmm2 +=
set( A(i+1UL,k) ) * b1;
2700 C.store( i , j, xmm1+xmm3 );
2701 C.store( i+1UL, j, xmm2+xmm4 );
// Single-row variant of the one-vector tile; here the k loops run up to K.
2706 const size_t kbegin( ( IsUpper_v<MT4> )
2707 ?( ( IsLower_v<MT5> )
2708 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2709 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2710 :( IsLower_v<MT5> ? j : 0UL ) );
2716 for( ; (k+2UL) <= K; k+=2UL ) {
2717 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
2718 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
2722 xmm1 +=
set( A(i,k) ) * B.load(k,j);
2725 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: columns not covered by the SIMD tiles are processed one at
// a time with plain element access. The row range is [LOW ? j : 0, UPP ? j+1 : M).
2729 for( ; remainder && j<N; ++j )
2731 const size_t iend(
UPP ? j+1UL : M );
2732 size_t i(
LOW ? j : 0UL );
// Two rows per iteration.
2734 for( ; (i+2UL) <= iend; i+=2UL )
2736 const size_t kbegin( ( IsUpper_v<MT4> )
2737 ?( ( IsLower_v<MT5> )
2738 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2739 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2740 :( IsLower_v<MT5> ? j : 0UL ) );
2741 const size_t kend( ( IsLower_v<MT4> )
2742 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2748 for(
size_t k=kbegin; k<kend; ++k ) {
2749 value1 += A(i ,k) * B(k,j);
2750 value2 += A(i+1UL,k) * B(k,j);
2754 C(i+1UL,j) = value2;
// Single-row variant of the scalar remainder; the k loop runs up to K.
2759 const size_t kbegin( ( IsUpper_v<MT4> )
2760 ?( ( IsLower_v<MT5> )
2761 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2762 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2763 :( IsLower_v<MT5> ? j : 0UL ) );
2767 for(
size_t k=kbegin; k<K; ++k ) {
2768 value += A(i,k) * B(k,j);
// Vectorized selectSmallAddAssignKernel overload for a column-major target MT3.
// It does not compute the product directly: one operand is copied into a
// temporary of its opposite storage order (OppositeType_t) and the resulting
// product expression, passed through fwd, is handed back to addAssign.
2793 template<
typename MT3
2796 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2797 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2804 const ForwardFunctor fwd;
// Only B's type is resizable: convert A.
2806 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2807 const OppositeType_t<MT4> tmp(
serial( A ) );
2808 addAssign( C, fwd( tmp * B ) );
// Only A's type is resizable: convert B.
2810 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2811 const OppositeType_t<MT5> tmp(
serial( B ) );
2812 addAssign( C, fwd( A * tmp ) );
// Otherwise decide at run time by element count: convert A if it has no more
// elements than B ...
2814 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2815 const OppositeType_t<MT4> tmp(
serial( A ) );
2816 addAssign( C, fwd( tmp * B ) );
// ... and convert B in the remaining case.
2819 const OppositeType_t<MT5> tmp(
serial( B ) );
2820 addAssign( C, fwd( A * tmp ) );
// selectLargeAddAssignKernel overload that is active (DisableIf_t) whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false; it simply forwards
// C, A and B to selectDefaultAddAssignKernel.
2840 template<
typename MT3
2843 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2844 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2846 selectDefaultAddAssignKernel( C, A, B );
// selectLargeAddAssignKernel overload that is enabled (EnableIf_t) when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true, i.e. the counterpart of
// the DisableIf_t overload of the same name.
2866 template<
typename MT3
2869 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2870 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// selectBlasAddAssignKernel overload that is active (DisableIf_t) whenever
// UseBlasKernel_v<MT3,MT4,MT5> is false; it falls back to
// selectLargeAddAssignKernel with the same arguments.
2896 template<
typename MT3
2899 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2900 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2902 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel, compiled only when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled and selected
// (EnableIf_t) when UseBlasKernel_v<MT3,MT4,MT5> is true.
// A triangular operand is routed through trmm on a temporary copy of the other
// operand, whose result is then added to C; otherwise gemm is called on C
// directly.
2908 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2922 template<
typename MT3
2925 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2926 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2928 using ET = ElementType_t<MT3>;
// A is triangular: tmp starts as a copy of B and is combined with A from the
// left (CblasLeft), using the lower or upper part according to IsLower_v<MT4>.
// NOTE(review): trmm presumably overwrites tmp with the product - confirm.
2930 if( IsTriangular_v<MT4> ) {
2931 ResultType_t<MT3> tmp(
serial( B ) );
2932 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2933 addAssign( C, tmp );
// B is triangular: tmp starts as a copy of A and is combined with B from the
// right (CblasRight).
2935 else if( IsTriangular_v<MT5> ) {
2936 ResultType_t<MT3> tmp(
serial( A ) );
2937 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2938 addAssign( C, tmp );
// General case. NOTE(review): the two ET(1) arguments are presumably the
// scaling factors for A*B and for the existing C (C += A*B) - confirm against
// the gemm wrapper.
2941 gemm( C, A, B, ET(1), ET(1) );
// Addition assignment of the multiplication expression to a column-major
// matrix (Matrix<MT,true>), enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
// The product is re-expressed with trans() applied to the symmetric operand(s)
// and the restructured expression, passed through fwd, is forwarded to
// addAssign again.
2963 template<
typename MT >
2964 friend inline auto addAssign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
2965 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2974 const ForwardFunctor fwd;
// Both operands symmetric: transpose both.
2976 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
2977 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand symmetric: transpose the left operand.
2978 else if( IsSymmetric_v<MT1> )
2979 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: transpose the right operand.
2981 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Subtraction assignment of the multiplication expression to a dense matrix,
// active (DisableIf_t) when CanExploitSymmetry_v<MT,MT1,MT2> is false.
// The actual work is delegated to selectSubAssignKernel.
3003 template<
typename MT
3005 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
3006 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Degenerate sizes: the target has no rows or no columns, or the inner
// dimension (columns of the left operand) is zero.
// NOTE(review): this branch presumably returns early - confirm.
3013 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
3027 DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Dispatches the subtraction assignment C -= A*B to one of two kernel
// families. selectSmallSubAssignKernel is used when MT5 is diagonal, when
// (outside of BLAZE_DEBUG_MODE) B has at most SIMDSIZE*10 columns, or when C
// has fewer than DMATDMATMULT_THRESHOLD elements; the other path calls
// selectBlasSubAssignKernel.
3043 template<
typename MT3
3046 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3048 if( ( IsDiagonal_v<MT5> ) ||
3049 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
3050 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3051 selectSmallSubAssignKernel( C, A, B );
3053 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel; this overload is selected through
// EnableIf_t when neither operand type is diagonal. For every row i and every
// k in the admissible range it applies C(i,j) -= A(i,k) * B(k,j) over a column
// range that is restricted by the triangular structure of MT5 and by the
// class-level LOW/UPP flags.
3072 template<
typename MT3
3075 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3076 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3078 const size_t M( A.rows() );
3079 const size_t N( B.columns() );
3080 const size_t K( A.columns() );
3084 for(
size_t i=0UL; i<M; ++i )
// The k range of row i follows the triangular structure of MT4.
3086 const size_t kbegin( ( IsUpper_v<MT4> )
3087 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3089 const size_t kend( ( IsLower_v<MT4> )
3090 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3094 for(
size_t k=kbegin; k<kend; ++k )
// First column: bounded below by k (k+1 for strictly upper MT5) when MT5 is
// upper and additionally by i when UPP is set.
3096 const size_t jbegin( ( IsUpper_v<MT5> )
3097 ?( ( IsStrictlyUpper_v<MT5> )
3098 ?(
UPP ?
max(i,k+1UL) : k+1UL )
3099 :(
UPP ?
max(i,k) : k ) )
3100 :(
UPP ? i : 0UL ) );
// Exclusive last column: bounded above via k when MT5 is lower and
// additionally via i when LOW is set; N when neither applies.
3101 const size_t jend( ( IsLower_v<MT5> )
3102 ?( ( IsStrictlyLower_v<MT5> )
3103 ?(
LOW ?
min(i+1UL,k) : k )
3104 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
3105 :(
LOW ? i+1UL : N ) );
// With LOW or UPP the combined bounds can describe an empty range: skip it.
3107 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jnum & size_t(-2) rounds the column count down to an even number, so jpos
// marks the end of the 2x-unrolled loop.
3110 const size_t jnum( jend - jbegin );
3111 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3113 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3114 C(i,j ) -= A(i,k) * B(k,j );
3115 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Trailing update for column jpos, which the 2x-unrolled loop does not reach.
3118 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction-assignment kernel; this overload is selected through
// EnableIf_t when the right operand type MT5 is diagonal and the left operand
// type MT4 is not. Only the diagonal element B(j,j) is read from B, so every
// visited element is updated as C(i,j) -= A(i,j) * B(j,j).
3140 template<
typename MT3
3143 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3144 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3148 const size_t M( A.rows() );
3149 const size_t N( B.columns() );
3151 for(
size_t i=0UL; i<M; ++i )
// For an upper MT4 the column range of row i starts at the diagonal
// (at i+1 when MT4 is strictly upper).
3153 const size_t jbegin( ( IsUpper_v<MT4> )
3154 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
// For a lower MT4 the exclusive end of the column range is i+1
// (i when MT4 is strictly lower).
3156 const size_t jend( ( IsLower_v<MT4> )
3157 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos: end of the 2x-unrolled part (column count rounded down to even).
3161 const size_t jnum( jend - jbegin );
3162 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3164 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3165 C(i,j ) -= A(i,j ) * B(j ,j );
3166 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Trailing update for column jpos, which the 2x-unrolled loop does not reach.
3169 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction-assignment kernel; this overload is selected through
// EnableIf_t when the left operand type MT4 is diagonal and the right operand
// type MT5 is not. Only the diagonal element A(i,i) is read from A, so every
// visited element is updated as C(i,j) -= A(i,i) * B(i,j).
3190 template<
typename MT3
3193 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3194 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3198 const size_t M( A.rows() );
3199 const size_t N( B.columns() );
3201 for(
size_t i=0UL; i<M; ++i )
// For an upper MT5 the column range of row i starts at the diagonal
// (at i+1 when MT5 is strictly upper).
3203 const size_t jbegin( ( IsUpper_v<MT5> )
3204 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
// For a lower MT5 the exclusive end of the column range is i+1
// (i when MT5 is strictly lower).
3206 const size_t jend( ( IsLower_v<MT5> )
3207 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos: end of the 2x-unrolled part (column count rounded down to even).
3211 const size_t jnum( jend - jbegin );
3212 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3214 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3215 C(i,j ) -= A(i,i) * B(i,j );
3216 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Trailing update for column jpos, which the 2x-unrolled loop does not reach.
3219 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel; this overload is selected through
// EnableIf_t when both operand types MT4 and MT5 are diagonal. Only diagonal
// elements are touched: C(i,i) -= A(i,i) * B(i,i) for i in [0, A.rows()).
3240 template<
typename MT3
3243 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3244 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3248 for(
size_t i=0UL; i<A.rows(); ++i ) {
3249 C(i,i) -= A(i,i) * B(i,i);
// selectSmallSubAssignKernel overload that is active (DisableIf_t) whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It performs no work of
// its own and forwards C, A and B to selectDefaultSubAssignKernel.
3269 template<
typename MT3
3272 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3273 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3275 selectDefaultSubAssignKernel( C, A, B );
// Vectorized selectSmallSubAssignKernel overload for a row-major target MT3,
// enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// The columns of C are handled in tiles whose width shrinks from
// SIMDSIZE*8 (integral element types only) over SIMDSIZE*5, *4, *3 and *2 down
// to a single SIMD vector, as can be read off the kend bounds and the store
// offsets below; a scalar loop takes the columns from jpos up to N.
// Within a tile the accumulators are decremented by
// set( A(i,k) ) * B.load(k,j+...) and written back with C.store.
3295 template<
typename MT3
3298 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3299 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed as soon as C or B is not padded.
3301 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3303 const size_t M( A.rows() );
3304 const size_t N( B.columns() );
3305 const size_t K( A.columns() );
// With a remainder, jpos is N with the low bits masked off by size_t(-SIMDSIZE)
// (a multiple of SIMDSIZE for power-of-two SIMDSIZE); otherwise jpos equals N.
3309 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Tile of SIMDSIZE*8 columns, only taken for integral element types.
3314 if( IsIntegral_v<ElementType> )
3317 for(
size_t i=0UL; i<M; ++i )
// kbegin/kend narrow the k range of the inner product using the triangular
// structure of A (relative to row i) and of B (relative to column j); the
// same selection pattern recurs in every tile below.
3319 const size_t kbegin( ( IsUpper_v<MT4> )
3320 ?( ( IsLower_v<MT5> )
3321 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3322 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3323 :( IsLower_v<MT5> ? j : 0UL ) );
3324 const size_t kend( ( IsLower_v<MT4> )
3325 ?( ( IsUpper_v<MT5> )
3326 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
3327 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3328 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
3339 for(
size_t k=kbegin; k<kend; ++k ) {
3340 const SIMDType a1(
set( A(i,k) ) );
3341 xmm1 -= a1 * B.load(k,j );
3342 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3343 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3344 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3345 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
3346 xmm6 -= a1 * B.load(k,j+
SIMDSIZE*5UL);
3347 xmm7 -= a1 * B.load(k,j+
SIMDSIZE*6UL);
3348 xmm8 -= a1 * B.load(k,j+
SIMDSIZE*7UL);
3351 C.store( i, j , xmm1 );
3353 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3354 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3355 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3356 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
3357 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
3358 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Tile of SIMDSIZE*5 columns, two rows (i and i+1) per iteration.
3367 for( ; (i+2UL) <= M; i+=2UL )
3369 const size_t kbegin( ( IsUpper_v<MT4> )
3370 ?( ( IsLower_v<MT5> )
3371 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3372 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3373 :( IsLower_v<MT5> ? j : 0UL ) );
3374 const size_t kend( ( IsLower_v<MT4> )
3375 ?( ( IsUpper_v<MT5> )
3376 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
3377 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3378 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// The accumulator starts from the current content of C (subtraction assignment).
3385 SIMDType xmm6 ( C.load(i+1UL,j ) );
3391 for(
size_t k=kbegin; k<kend; ++k ) {
3392 const SIMDType a1(
set( A(i ,k) ) );
3393 const SIMDType a2(
set( A(i+1UL,k) ) );
3411 C.store( i , j , xmm1 );
3413 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3414 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3415 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
3416 C.store( i+1UL, j , xmm6 );
3417 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
3418 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
3419 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
3420 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Single-row variant of the SIMDSIZE*5 tile (reads and writes row i only).
3425 const size_t kbegin( ( IsUpper_v<MT4> )
3426 ?( ( IsLower_v<MT5> )
3427 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3428 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3429 :( IsLower_v<MT5> ? j : 0UL ) );
3430 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
3438 for(
size_t k=kbegin; k<kend; ++k ) {
3439 const SIMDType a1(
set( A(i,k) ) );
3440 xmm1 -= a1 * B.load(k,j );
3441 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3442 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3443 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3444 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
3447 C.store( i, j , xmm1 );
3449 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3450 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3451 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Tile of SIMDSIZE*4 columns, two rows per iteration.
3459 for( ; (i+2UL) <= M; i+=2UL )
3461 const size_t kbegin( ( IsUpper_v<MT4> )
3462 ?( ( IsLower_v<MT5> )
3463 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3464 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3465 :( IsLower_v<MT5> ? j : 0UL ) );
3466 const size_t kend( ( IsLower_v<MT4> )
3467 ?( ( IsUpper_v<MT5> )
3468 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
3469 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3470 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
3481 for(
size_t k=kbegin; k<kend; ++k ) {
3482 const SIMDType a1(
set( A(i ,k) ) );
3483 const SIMDType a2(
set( A(i+1UL,k) ) );
3498 C.store( i , j , xmm1 );
3500 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3501 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3502 C.store( i+1UL, j , xmm5 );
3503 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
3504 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
3505 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single-row variant of the SIMDSIZE*4 tile.
3510 const size_t kbegin( ( IsUpper_v<MT4> )
3511 ?( ( IsLower_v<MT5> )
3512 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3513 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3514 :( IsLower_v<MT5> ? j : 0UL ) );
3515 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
3522 for(
size_t k=kbegin; k<kend; ++k ) {
3523 const SIMDType a1(
set( A(i,k) ) );
3524 xmm1 -= a1 * B.load(k,j );
3525 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3526 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3527 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3530 C.store( i, j , xmm1 );
3532 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3533 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Tile of SIMDSIZE*3 columns, two rows per iteration.
3541 for( ; (i+2UL) <= M; i+=2UL )
3543 const size_t kbegin( ( IsUpper_v<MT4> )
3544 ?( ( IsLower_v<MT5> )
3545 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3546 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3547 :( IsLower_v<MT5> ? j : 0UL ) );
3548 const size_t kend( ( IsLower_v<MT4> )
3549 ?( ( IsUpper_v<MT5> )
3550 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
3551 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3552 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
3561 for(
size_t k=kbegin; k<kend; ++k ) {
3562 const SIMDType a1(
set( A(i ,k) ) );
3563 const SIMDType a2(
set( A(i+1UL,k) ) );
3575 C.store( i , j , xmm1 );
3577 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3578 C.store( i+1UL, j , xmm4 );
3579 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
3580 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single-row variant of the SIMDSIZE*3 tile.
3585 const size_t kbegin( ( IsUpper_v<MT4> )
3586 ?( ( IsLower_v<MT5> )
3587 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3588 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3589 :( IsLower_v<MT5> ? j : 0UL ) );
3590 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
3596 for(
size_t k=kbegin; k<kend; ++k ) {
3597 const SIMDType a1(
set( A(i,k) ) );
3598 xmm1 -= a1 * B.load(k,j );
3599 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3600 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3603 C.store( i, j , xmm1 );
3605 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Tile of SIMDSIZE*2 columns. From here on the row range honours the
// class-level LOW flag: the first row is j when LOW is set, 0 otherwise.
3612 size_t i(
LOW ? j : 0UL );
// Four rows per iteration.
3614 for( ; (i+4UL) <= iend; i+=4UL )
3616 const size_t kbegin( ( IsUpper_v<MT4> )
3617 ?( ( IsLower_v<MT5> )
3618 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3619 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3620 :( IsLower_v<MT5> ? j : 0UL ) );
3621 const size_t kend( ( IsLower_v<MT4> )
3622 ?( ( IsUpper_v<MT5> )
3623 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
3624 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3625 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3636 for(
size_t k=kbegin; k<kend; ++k ) {
3637 const SIMDType a1(
set( A(i ,k) ) );
3638 const SIMDType a2(
set( A(i+1UL,k) ) );
3639 const SIMDType a3(
set( A(i+2UL,k) ) );
3640 const SIMDType a4(
set( A(i+3UL,k) ) );
3653 C.store( i , j , xmm1 );
3655 C.store( i+1UL, j , xmm3 );
3656 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3657 C.store( i+2UL, j , xmm5 );
3658 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
3659 C.store( i+3UL, j , xmm7 );
3660 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Three rows per iteration.
3663 for( ; (i+3UL) <= iend; i+=3UL )
3665 const size_t kbegin( ( IsUpper_v<MT4> )
3666 ?( ( IsLower_v<MT5> )
3667 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3668 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3669 :( IsLower_v<MT5> ? j : 0UL ) );
3670 const size_t kend( ( IsLower_v<MT4> )
3671 ?( ( IsUpper_v<MT5> )
3672 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
3673 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3674 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3683 for(
size_t k=kbegin; k<kend; ++k ) {
3684 const SIMDType a1(
set( A(i ,k) ) );
3685 const SIMDType a2(
set( A(i+1UL,k) ) );
3686 const SIMDType a3(
set( A(i+2UL,k) ) );
3697 C.store( i , j , xmm1 );
3699 C.store( i+1UL, j , xmm3 );
3700 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3701 C.store( i+2UL, j , xmm5 );
3702 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two rows per iteration; the k loop is additionally unrolled by two, with a
// second accumulator set (xmm5..xmm8) that is summed in at store time.
3705 for( ; (i+2UL) <= iend; i+=2UL )
3707 const size_t kbegin( ( IsUpper_v<MT4> )
3708 ?( ( IsLower_v<MT5> )
3709 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3710 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3711 :( IsLower_v<MT5> ? j : 0UL ) );
3712 const size_t kend( ( IsLower_v<MT4> )
3713 ?( ( IsUpper_v<MT5> )
3714 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
3715 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3716 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3725 for( ; (k+2UL) <= kend; k+=2UL ) {
3726 const SIMDType a1(
set( A(i ,k ) ) );
3727 const SIMDType a2(
set( A(i+1UL,k ) ) );
3728 const SIMDType a3(
set( A(i ,k+1UL) ) );
3729 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
3730 const SIMDType b1( B.load(k ,j ) );
3732 const SIMDType b3( B.load(k+1UL,j ) );
// Leftover k iteration(s) after the 2x-unrolled k loop.
3744 for( ; k<kend; ++k ) {
3745 const SIMDType a1(
set( A(i ,k) ) );
3746 const SIMDType a2(
set( A(i+1UL,k) ) );
3755 C.store( i , j , xmm1+xmm5 );
3756 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
3757 C.store( i+1UL, j , xmm3+xmm7 );
3758 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Single-row variant of the SIMDSIZE*2 tile, k unrolled by two.
3763 const size_t kbegin( ( IsUpper_v<MT4> )
3764 ?( ( IsLower_v<MT5> )
3765 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3766 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3767 :( IsLower_v<MT5> ? j : 0UL ) );
3768 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
3775 for( ; (k+2UL) <= kend; k+=2UL ) {
3776 const SIMDType a1(
set( A(i,k ) ) );
3777 const SIMDType a2(
set( A(i,k+1UL) ) );
3778 xmm1 -= a1 * B.load(k ,j );
3779 xmm2 -= a1 * B.load(k ,j+
SIMDSIZE);
3780 xmm3 -= a2 * B.load(k+1UL,j );
3781 xmm4 -= a2 * B.load(k+1UL,j+
SIMDSIZE);
3784 for( ; k<kend; ++k ) {
3785 const SIMDType a1(
set( A(i,k) ) );
3786 xmm1 -= a1 * B.load(k,j );
3790 C.store( i, j , xmm1+xmm3 );
3791 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Tile of one SIMD vector of columns; rows again start at j when LOW is set.
3798 size_t i(
LOW ? j : 0UL );
// Four rows per iteration, k unrolled by two.
3800 for( ; (i+4UL) <= iend; i+=4UL )
3802 const size_t kbegin( ( IsUpper_v<MT4> )
3803 ?( ( IsLower_v<MT5> )
3804 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3805 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3806 :( IsLower_v<MT5> ? j : 0UL ) );
3807 const size_t kend( ( IsLower_v<MT4> )
3808 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3818 for( ; (k+2UL) <= kend; k+=2UL ) {
3820 const SIMDType b2( B.load(k+1UL,j) );
3821 xmm1 -=
set( A(i ,k ) ) * b1;
3822 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3823 xmm3 -=
set( A(i+2UL,k ) ) * b1;
3824 xmm4 -=
set( A(i+3UL,k ) ) * b1;
3825 xmm5 -=
set( A(i ,k+1UL) ) * b2;
3826 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
3827 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
3828 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
3831 for( ; k<kend; ++k ) {
3833 xmm1 -=
set( A(i ,k) ) * b1;
3834 xmm2 -=
set( A(i+1UL,k) ) * b1;
3835 xmm3 -=
set( A(i+2UL,k) ) * b1;
3836 xmm4 -=
set( A(i+3UL,k) ) * b1;
3839 C.store( i , j, xmm1+xmm5 );
3840 C.store( i+1UL, j, xmm2+xmm6 );
3841 C.store( i+2UL, j, xmm3+xmm7 );
3842 C.store( i+3UL, j, xmm4+xmm8 );
// Three rows per iteration, k unrolled by two.
3845 for( ; (i+3UL) <= iend; i+=3UL )
3847 const size_t kbegin( ( IsUpper_v<MT4> )
3848 ?( ( IsLower_v<MT5> )
3849 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3850 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3851 :( IsLower_v<MT5> ? j : 0UL ) );
3852 const size_t kend( ( IsLower_v<MT4> )
3853 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3862 for( ; (k+2UL) <= kend; k+=2UL ) {
3864 const SIMDType b2( B.load(k+1UL,j) );
3865 xmm1 -=
set( A(i ,k ) ) * b1;
3866 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3867 xmm3 -=
set( A(i+2UL,k ) ) * b1;
3868 xmm4 -=
set( A(i ,k+1UL) ) * b2;
3869 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
3870 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
3873 for( ; k<kend; ++k ) {
3875 xmm1 -=
set( A(i ,k) ) * b1;
3876 xmm2 -=
set( A(i+1UL,k) ) * b1;
3877 xmm3 -=
set( A(i+2UL,k) ) * b1;
3880 C.store( i , j, xmm1+xmm4 );
3881 C.store( i+1UL, j, xmm2+xmm5 );
3882 C.store( i+2UL, j, xmm3+xmm6 );
// Two rows per iteration, k unrolled by two.
3885 for( ; (i+2UL) <= iend; i+=2UL )
3887 const size_t kbegin( ( IsUpper_v<MT4> )
3888 ?( ( IsLower_v<MT5> )
3889 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3890 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3891 :( IsLower_v<MT5> ? j : 0UL ) );
3892 const size_t kend( ( IsLower_v<MT4> )
3893 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3901 for( ; (k+2UL) <= kend; k+=2UL ) {
3903 const SIMDType b2( B.load(k+1UL,j) );
3904 xmm1 -=
set( A(i ,k ) ) * b1;
3905 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3906 xmm3 -=
set( A(i ,k+1UL) ) * b2;
3907 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
3910 for( ; k<kend; ++k ) {
3912 xmm1 -=
set( A(i ,k) ) * b1;
3913 xmm2 -=
set( A(i+1UL,k) ) * b1;
3916 C.store( i , j, xmm1+xmm3 );
3917 C.store( i+1UL, j, xmm2+xmm4 );
// Single-row variant of the one-vector tile; here the k loops run up to K.
3922 const size_t kbegin( ( IsUpper_v<MT4> )
3923 ?( ( IsLower_v<MT5> )
3924 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3925 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3926 :( IsLower_v<MT5> ? j : 0UL ) );
3932 for( ; (k+2UL) <= K; k+=2UL ) {
3933 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
3934 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
3938 xmm1 -=
set( A(i,k) ) * B.load(k,j);
3941 C.store( i, j, xmm1+xmm2 );
// Scalar remainder: columns not covered by the SIMD tiles are processed one at
// a time with plain element access. The row range is [LOW ? j : 0, UPP ? j+1 : M).
3945 for( ; remainder && j<N; ++j )
3947 const size_t iend(
UPP ? j+1UL : M );
3948 size_t i(
LOW ? j : 0UL );
// Two rows per iteration.
3950 for( ; (i+2UL) <= iend; i+=2UL )
3952 const size_t kbegin( ( IsUpper_v<MT4> )
3953 ?( ( IsLower_v<MT5> )
3954 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3955 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3956 :( IsLower_v<MT5> ? j : 0UL ) );
3957 const size_t kend( ( IsLower_v<MT4> )
3958 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3964 for(
size_t k=kbegin; k<kend; ++k ) {
3965 value1 -= A(i ,k) * B(k,j);
3966 value2 -= A(i+1UL,k) * B(k,j);
3970 C(i+1UL,j) = value2;
// Single-row variant of the scalar remainder; the k loop runs up to K.
3975 const size_t kbegin( ( IsUpper_v<MT4> )
3976 ?( ( IsLower_v<MT5> )
3977 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3978 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3979 :( IsLower_v<MT5> ? j : 0UL ) );
3983 for(
size_t k=kbegin; k<K; ++k ) {
3984 value -= A(i,k) * B(k,j);
// Vectorized selectSmallSubAssignKernel overload for a column-major target MT3.
// It does not compute the product directly: one operand is copied into a
// temporary of its opposite storage order (OppositeType_t) and the resulting
// product expression, passed through fwd, is handed back to subAssign.
4009 template<
typename MT3
4012 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4013 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4020 const ForwardFunctor fwd;
// Only B's type is resizable: convert A.
4022 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
4023 const OppositeType_t<MT4> tmp(
serial( A ) );
4024 subAssign( C, fwd( tmp * B ) );
// Only A's type is resizable: convert B.
4026 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
4027 const OppositeType_t<MT5> tmp(
serial( B ) );
4028 subAssign( C, fwd( A * tmp ) );
// Otherwise decide at run time by element count: convert A if it has no more
// elements than B ...
4030 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4031 const OppositeType_t<MT4> tmp(
serial( A ) );
4032 subAssign( C, fwd( tmp * B ) );
// ... and convert B in the remaining case.
4035 const OppositeType_t<MT5> tmp(
serial( B ) );
4036 subAssign( C, fwd( A * tmp ) );
// Large-kernel subtraction assignment, fallback overload: participates only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false and simply defers to
// selectDefaultSubAssignKernel() with the same arguments.
4056 template<
typename MT3
4059 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4060 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4062 selectDefaultSubAssignKernel( C, A, B );
// Large-kernel subtraction assignment, vectorized overload: participates only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not visible in this excerpt.
4082 template<
typename MT3
4085 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4086 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction assignment, fallback overload: participates only when
// UseBlasKernel_v<MT3,MT4,MT5> is false and defers to selectLargeSubAssignKernel().
4112 template<
typename MT3
4115 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4116 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4118 selectLargeSubAssignKernel( C, A, B );
// BLAS subtraction assignment, BLAS-backed overload: compiled only when
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION is set, and participates
// only when UseBlasKernel_v<MT3,MT4,MT5> is true.
4124 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4138 template<
typename MT3
4141 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4142 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4144 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, run trmm on it with A from the left, then
// subtract the temporary from C.
// NOTE(review): presumably trmm overwrites tmp with the product - confirm against the wrapper.
4146 if( IsTriangular_v<MT4> ) {
4147 ResultType_t<MT3> tmp(
serial( B ) );
4148 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4149 subAssign( C, tmp );
// Triangular B: same scheme with a copy of A and B applied from the right.
4151 else if( IsTriangular_v<MT5> ) {
4152 ResultType_t<MT3> tmp(
serial( A ) );
4153 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4154 subAssign( C, tmp );
// General case (the branch line itself is not visible here): a single gemm call.
// NOTE(review): presumably the factors mean alpha = -1 and beta = 1, i.e. C = C - A*B - confirm.
4157 gemm( C, A, B, ET(-1), ET(1) );
// Subtraction assignment to a Matrix<MT,true> target, enabled only when
// CanExploitSymmetry_v<MT,MT1,MT2> holds. The product is rebuilt with transposed
// symmetric operand(s), wrapped by the ForwardFunctor, and handed to subAssign() again.
4179 template<
typename MT >
4180 friend inline auto subAssign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
4181 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4190 const ForwardFunctor fwd;
// Both operands symmetric: transpose both.
4192 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4193 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand symmetric: transpose the left operand.
4194 else if( IsSymmetric_v<MT1> )
4195 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case (the branch line itself is not visible here): transpose the right operand.
4197 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Schur product assignment of the multiplication expression to a dense matrix.
// The visible statement forwards a temporary named tmp to schurAssign() on the target.
// NOTE(review): the construction of tmp is not visible in this excerpt.
4219 template<
typename MT
4221 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
4233 schurAssign( ~lhs, tmp );
// NOTE(review): the signature line of this function is missing from the excerpt; only the
// template header, the IsEvaluationRequired_v<MT,MT1,MT2> constraint and two checks are visible.
4265 template<
typename MT
4268 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// First check: the target has zero rows or zero columns.
4275 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Second check: the left operand has zero columns (empty inner dimension).
4278 else if( rhs.lhs_.columns() == 0UL ) {
// NOTE(review): the signature line of this function is missing from the excerpt.
// Visible parts: the IsEvaluationRequired_v<MT,MT1,MT2> constraint, a temporary type chosen
// by SO (OppositeType when SO is true, ResultType otherwise), and a temporary built from rhs.
4313 template<
typename MT
4316 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4320 using TmpType = If_t< SO, OppositeType, ResultType >;
4332 const ForwardFunctor fwd;
4334 const TmpType tmp( rhs );
// NOTE(review): the signature line and the branch bodies are missing from the excerpt.
// Visible parts: the CanExploitSymmetry_v<MT,MT1,MT2> constraint and a two-step branch on
// whether both operands, or only MT1, are symmetric.
4355 template<
typename MT >
4357 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4366 const ForwardFunctor fwd;
4368 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4370 else if( IsSymmetric_v<MT1> )
// NOTE(review): the signature line of this function is missing from the excerpt.
// Visible parts: the IsEvaluationRequired_v<MT,MT1,MT2> constraint and a single check that
// is true when the target has zero rows or columns, or the left operand has zero columns.
4394 template<
typename MT
4397 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4404 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// NOTE(review): the signature line and the branch bodies are missing from the excerpt.
// Visible parts: the CanExploitSymmetry_v<MT,MT1,MT2> constraint and a two-step branch on
// whether both operands, or only MT1, are symmetric.
4438 template<
typename MT >
4440 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4449 const ForwardFunctor fwd;
4451 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4453 else if( IsSymmetric_v<MT1> )
// NOTE(review): the signature line of this function is missing from the excerpt.
// Visible parts: the IsEvaluationRequired_v<MT,MT1,MT2> constraint and a single check that
// is true when the target has zero rows or columns, or the left operand has zero columns.
4481 template<
typename MT
4484 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4491 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// NOTE(review): the signature line and the branch bodies are missing from the excerpt.
// Visible parts: the CanExploitSymmetry_v<MT,MT1,MT2> constraint and a two-step branch on
// whether both operands, or only MT1, are symmetric.
4525 template<
typename MT >
4527 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4536 const ForwardFunctor fwd;
4538 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4540 else if( IsSymmetric_v<MT1> )
4565 template<
typename MT
4625 template<
typename MT1
4632 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4633 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4634 ,
private Computation
// Shorthand aliases: MMM is the wrapped matrix/matrix multiplication expression, RES its
// result type; RT*/ET*/CT* are the result, element and composite types of the two operands.
4639 using MMM = DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4641 using RES = ResultType_t<MMM>;
4642 using RT1 = ResultType_t<MT1>;
4643 using RT2 = ResultType_t<MT2>;
4644 using ET1 = ElementType_t<RT1>;
4645 using ET2 = ElementType_t<RT2>;
4646 using CT1 = CompositeType_t<MT1>;
4647 using CT2 = CompositeType_t<MT2>;
// True when the left operand is a computation or requires evaluation.
4652 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// True when the right operand is a computation or requires evaluation.
4657 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure flags derived from the SF/HF/LF/UF template flags:
//  SYM  - SF set and none of HF, LF, UF set
//  HERM - HF set and neither LF nor UF set
//  LOW  - LF set, or UF set together with SF or HF
//  UPP  - UF set, or LF set together with SF or HF
4661 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
4662 static constexpr
bool HERM = ( HF && !( LF || UF ) );
4663 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4664 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// True when T1 is column-major and at least one of T2, T3 is symmetric.
4673 template<
typename T1,
typename T2,
typename T3 >
4674 static constexpr
bool CanExploitSymmetry_v =
4675 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
// True when at least one operand must be evaluated and symmetry cannot be exploited.
4683 template<
typename T1,
typename T2,
typename T3 >
4684 static constexpr
bool IsEvaluationRequired_v =
4685 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
// Compile-time switch for the BLAS kernel. The visible conditions require: no SYM/HERM/LOW/UPP
// structure, contiguous operands with data access, non-diagonal T2 and T3, SIMD-enabled types,
// BLAS-compatible and identical element types, and not the combination of a builtin element
// type with a complex T4.
// NOTE(review): the first line(s) of the condition are not visible in this excerpt.
4692 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4693 static constexpr
bool UseBlasKernel_v =
4695 !SYM && !HERM && !LOW && !UPP &&
4696 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4697 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4698 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4699 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4700 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4701 IsBLASCompatible_v< ElementType_t<T1> > &&
4702 IsBLASCompatible_v< ElementType_t<T2> > &&
4703 IsBLASCompatible_v< ElementType_t<T3> > &&
4704 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4705 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4706 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch for the vectorized default kernel. The visible conditions require:
// useOptimizedKernels, a non-diagonal T3, SIMD-enabled types, and SIMD add/multiply support
// for the element types of T2 and T3.
// NOTE(review): the IsSIMDCombinable_v condition is only partially visible in this excerpt.
4713 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4714 static constexpr
bool UseVectorizedDefaultKernel_v =
4715 ( useOptimizedKernels &&
4716 !IsDiagonal_v<T3> &&
4717 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4718 IsSIMDCombinable_v< ElementType_t<T1>
4722 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4723 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Functor type used to wrap forwarded product expressions; selected via If_t starting with HERM.
// NOTE(review): the remaining alternatives of the selection are not visible in this excerpt.
4730 using ForwardFunctor =
If_t< HERM
// Type of this expression instance and its dense-matrix base type.
4746 using This = DMatScalarMultExpr<MMM,ST,false>;
4749 using BaseType = DenseMatrix<This,false>;
// Fragments of a nested type selection over the Decl*Trait / MultTrait of RES and ST.
// NOTE(review): the enclosing alias and the selecting conditions are not visible in this excerpt.
4753 , DeclHermTrait< MultTrait_t<RES,ST> >
4755 , DeclSymTrait< MultTrait_t<RES,ST> >
4758 , DeclDiagTrait< MultTrait_t<RES,ST> >
4759 , DeclLowTrait< MultTrait_t<RES,ST> > >
4761 , DeclUppTrait< MultTrait_t<RES,ST> >
4762 , MultTrait<RES,ST> > > > >::Type;
// SIMD type associated with ElementType.
4767 using SIMDType = SIMDTrait_t<ElementType>;
// Type of the left-hand side operand: the (const) wrapped multiplication expression.
4772 using LeftOperand =
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Operand types used inside the kernels: the const result type when the respective operand
// must be evaluated, otherwise its composite type.
4778 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4781 using RT = If_t< evaluateRight, const RT2, CT2 >;
// Tail of a compile-time flag: requires a non-diagonal MT2, SIMD-enabled operands,
// SIMD-combinable ET1/ET2/ST and SIMD add/multiply support for ET1 and ET2.
// NOTE(review): the declaration line of this flag is not visible in this excerpt.
4787 ( !IsDiagonal_v<MT2> &&
4788 MT1::simdEnabled && MT2::simdEnabled &&
4789 IsSIMDCombinable_v<ET1,ET2,ST> &&
4790 HasSIMDAdd_v<ET1,ET2> &&
4791 HasSIMDMult_v<ET1,ET2> );
// Fragment of an element accessor: a check of the column index j against matrix_.columns(),
// followed by a return of (*this)(i,j).
// NOTE(review): the signature and the body of the check are not visible in this excerpt.
4841 if( j >=
matrix_.columns() ) {
4844 return (*
this)(i,j);
// Const accessor returning a size_t.
// NOTE(review): the body is not visible in this excerpt; presumably the row count - confirm.
4853 inline size_t rows()
const {
// Const accessor returning a size_t.
// NOTE(review): the body is not visible in this excerpt; presumably the column count - confirm.
4863 inline size_t columns()
const {
// Reports whether the expression can alias with the given address by delegating the query
// to the wrapped expression matrix_.
4894 template<
typename T >
4895 inline bool canAlias(
const T* alias )
const {
4896 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the given address by delegating the query
// to the wrapped expression matrix_.
4906 template<
typename T >
4907 inline bool isAliased(
const T* alias )
const {
4908 return matrix_.isAliased( alias );
// Fragment of a boolean expression that compares rows()*columns() against
// DMATDMATMULT_THRESHOLD (upper bound, bypassed when BLAS matrix/matrix multiplication is
// disabled) and SMP_DMATDMATMULT_THRESHOLD (lower bound).
// NOTE(review): the enclosing function and the start of the expression are not visible here.
4929 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4931 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
4932 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
// Assignment entry point that ends in a call to selectAssignKernel(); it participates only
// when CanExploitSymmetry_v<MT,MT1,MT2> is false.
// NOTE(review): the signature line, the branch bodies and the definitions of A and B are not
// visible in this excerpt.
4954 template<
typename MT
4957 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped multiplication expression.
4964 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4965 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special cases: empty target, then empty inner dimension.
4967 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4970 else if( left.columns() == 0UL ) {
// Dispatch to the kernel selection, passing the scalar factor of the expression.
4985 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses between the small kernel and the BLAS kernel for the scaled assignment.
// The small kernel is taken when B is diagonal, when (outside debug mode) B has at most
// SIMDSIZE*10 columns, or when C has fewer than DMATDMATMULT_THRESHOLD elements.
// NOTE(review): the line separating the two calls (presumably an else) is not visible here.
5000 template<
typename MT3
5004 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5006 if( ( IsDiagonal_v<MT5> ) ||
5007 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
5008 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5009 selectSmallAssignKernel( C, A, B, scalar );
5011 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for the scaled assignment when neither operand is diagonal.
// Row by row, the first product term initializes the row of C, the remaining terms are
// accumulated, and index ranges are narrowed by the triangular structure of A and B and by
// the SYM/HERM/LOW/UPP flags.
// NOTE(review): several loop bodies (including the one where `scalar` would be applied)
// are not visible in this excerpt.
5029 template<
typename MT3
5033 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5034 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N.
5036 const size_t M( A.rows() );
5037 const size_t N( B.columns() );
5038 const size_t K( A.columns() );
5042 for(
size_t i=0UL; i<M; ++i )
// Range of k for row i, narrowed by A's (strictly) upper/lower structure.
5044 const size_t kbegin( ( IsUpper_v<MT4> )
5045 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5047 const size_t kend( ( IsLower_v<MT4> )
5048 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row is handled in one loop
// over all columns (loop body not visible here).
5052 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
5053 for(
size_t j=0UL; j<N; ++j ) {
// First term (k == kbegin): column range limited by B's structure and the UPP/LOW flags.
5060 const size_t jbegin( ( IsUpper_v<MT5> )
5061 ?( ( IsStrictlyUpper_v<MT5> )
5062 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
5063 :( UPP ?
max(i,kbegin) : kbegin ) )
5064 :( UPP ? i : 0UL ) );
5065 const size_t jend( ( IsLower_v<MT5> )
5066 ?( ( IsStrictlyLower_v<MT5> )
5067 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
5068 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
5069 :( LOW ? i+1UL : N ) );
// Columns in front of jbegin (loop body not visible here).
5071 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5072 for(
size_t j=0UL; j<jbegin; ++j ) {
5076 else if( IsStrictlyUpper_v<MT5> ) {
// Initialize the active column range with the first product term.
5079 for(
size_t j=jbegin; j<jend; ++j ) {
5080 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend on (loop body not visible here); for strictly lower B only the
// last column is reset.
5082 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5083 for(
size_t j=jend; j<N; ++j ) {
5087 else if( IsStrictlyLower_v<MT5> ) {
5088 reset( C(i,N-1UL) );
// Remaining terms k = kbegin+1 .. kend-1 are accumulated into the row.
5092 for(
size_t k=kbegin+1UL; k<kend; ++k )
5094 const size_t jbegin( ( IsUpper_v<MT5> )
5095 ?( ( IsStrictlyUpper_v<MT5> )
5096 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
5097 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
5098 :( SYM || HERM || UPP ? i : 0UL ) );
5099 const size_t jend( ( IsLower_v<MT5> )
5100 ?( ( IsStrictlyLower_v<MT5> )
5101 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
5102 :( LOW ?
min(i+1UL,k) : k ) )
5103 :( LOW ? i+1UL : N ) );
// With structure flags set the column range may be empty; skip such terms.
5105 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5108 for(
size_t j=jbegin; j<jend; ++j ) {
5109 C(i,j) += A(i,k) * B(k,j);
// For lower B the element at column jend is assigned rather than accumulated.
5111 if( IsLower_v<MT5> ) {
5112 C(i,jend) = A(i,k) * B(k,jend);
// Final pass over the populated column range of row i.
// NOTE(review): the loop body is not visible; presumably it applies `scalar` - confirm.
5117 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5118 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
5119 :( SYM || HERM || UPP ? i : 0UL ) );
5120 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
5121 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
5122 :( LOW ? i+1UL : N ) );
5124 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5127 for(
size_t j=jbegin; j<jend; ++j ) {
// Copy the elements above the diagonal to their mirrored positions below it
// (conjugated when HERM). The guarding condition is not visible in this excerpt.
5134 for(
size_t i=1UL; i<M; ++i ) {
5135 for(
size_t j=0UL; j<i; ++j ) {
5136 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for the scaled assignment when only the right operand B is diagonal:
// each computed element is C(i,j) = A(i,j) * B(j,j) * scalar.
// NOTE(review): the bodies of the loops outside [jbegin,jend) are not visible in this excerpt.
5157 template<
typename MT3
5161 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5162 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5166 const size_t M( A.rows() );
5167 const size_t N( B.columns() );
5169 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by A's (strictly) upper/lower structure.
5171 const size_t jbegin( ( IsUpper_v<MT4> )
5172 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5174 const size_t jend( ( IsLower_v<MT4> )
5175 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Upper A: columns in front of jbegin are handled separately.
5179 if( IsUpper_v<MT4> ) {
5180 for(
size_t j=0UL; j<jbegin; ++j ) {
5184 for(
size_t j=jbegin; j<jend; ++j ) {
5185 C(i,j) = A(i,j) * B(j,j) * scalar;
// Lower A: columns from jend on are handled separately.
5187 if( IsLower_v<MT4> ) {
5188 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for the scaled assignment when only the left operand A is diagonal:
// each computed element is C(i,j) = A(i,i) * B(i,j) * scalar.
// NOTE(review): the bodies of the loops outside [jbegin,jend) are not visible in this excerpt.
5210 template<
typename MT3
5214 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5215 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5219 const size_t M( A.rows() );
5220 const size_t N( B.columns() );
5222 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by B's (strictly) upper/lower structure.
5224 const size_t jbegin( ( IsUpper_v<MT5> )
5225 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5227 const size_t jend( ( IsLower_v<MT5> )
5228 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Upper B: columns in front of jbegin are handled separately.
5232 if( IsUpper_v<MT5> ) {
5233 for(
size_t j=0UL; j<jbegin; ++j ) {
5237 for(
size_t j=jbegin; j<jend; ++j ) {
5238 C(i,j) = A(i,i) * B(i,j) * scalar;
// Lower B: columns from jend on are handled separately.
5240 if( IsLower_v<MT5> ) {
5241 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for the scaled assignment when both operands are diagonal: only the
// diagonal is computed, as C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): any statement preceding the loop is not visible in this excerpt.
5263 template<
typename MT3
5267 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5268 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5274 for(
size_t i=0UL; i<A.rows(); ++i ) {
5275 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-kernel scaled assignment, fallback overload: participates only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false and defers to
// selectDefaultAssignKernel() with the same arguments.
5294 template<
typename MT3
5298 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5299 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5301 selectDefaultAssignKernel( C, A, B, scalar );
// Small-kernel scaled assignment, vectorized overload for row-major targets (enabled only when
// MT3 is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds).
// The columns of C are processed in tiles of 8, 5, 4, 3, 2 and 1 SIMD vectors; within a tile
// several rows are processed together. Each accumulator sums set(A(i,k)) * B.load(k,j...) over
// k and is multiplied by `factor` when stored into C. Columns past jpos are handled element-wise.
// The k-ranges are narrowed by the triangular structure of A and B, the row ranges by the
// SYM/HERM/LOW/UPP flags.
// NOTE(review): many lines (loop headers, accumulation statements, braces) are not visible in
// this excerpt; the section comments below describe only what the visible lines show.
5320 template<
typename MT3
5324 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5325 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Element-wise remainder handling is needed unless both C and B are padded.
5327 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5329 const size_t M( A.rows() );
5330 const size_t N( B.columns() );
5331 const size_t K( A.columns() );
// With remainder handling, jpos is N masked with -SIMDSIZE (N rounded down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two); otherwise jpos is N.
5335 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// SIMD vector built from the scalar factor.
5338 const SIMDType factor(
set( scalar ) );
// Special handling when both LOW and UPP are set and N exceeds 3*SIMDSIZE (body not visible).
5340 if( LOW && UPP && N >
SIMDSIZE*3UL ) {
// Branch on integral element types (body not visible).
5347 if( IsIntegral_v<ElementType> )
// --- Tiles of 8 SIMD vectors, one row at a time; only without any structure flag ---
5349 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*7UL) < jpos; j+=
SIMDSIZE*8UL ) {
5350 for(
size_t i=0UL; i<M; ++i )
5352 const size_t kbegin( ( IsUpper_v<MT4> )
5353 ?( ( IsLower_v<MT5> )
5354 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5355 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5356 :( IsLower_v<MT5> ? j : 0UL ) );
5357 const size_t kend( ( IsLower_v<MT4> )
5358 ?( ( IsUpper_v<MT5> )
5359 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
5360 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5361 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
5363 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5365 for(
size_t k=kbegin; k<kend; ++k ) {
5366 const SIMDType a1(
set( A(i,k) ) );
5367 xmm1 += a1 * B.load(k,j );
5368 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5369 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5370 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5371 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
5372 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
5373 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
5374 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
5377 C.store( i, j , xmm1 * factor );
5378 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5379 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5380 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5381 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
5382 C.store( i, j+
SIMDSIZE*5UL, xmm6 * factor );
5383 C.store( i, j+
SIMDSIZE*6UL, xmm7 * factor );
5384 C.store( i, j+
SIMDSIZE*7UL, xmm8 * factor );
// --- Tiles of 5 SIMD vectors; only without any structure flag ---
5389 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*4UL) < jpos; j+=
SIMDSIZE*5UL )
// Two rows per iteration (the accumulation statements are not visible here).
5393 for( ; (i+2UL) <= M; i+=2UL )
5395 const size_t kbegin( ( IsUpper_v<MT4> )
5396 ?( ( IsLower_v<MT5> )
5397 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5398 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5399 :( IsLower_v<MT5> ? j : 0UL ) );
5400 const size_t kend( ( IsLower_v<MT4> )
5401 ?( ( IsUpper_v<MT5> )
5402 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
5403 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5404 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
5406 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5408 for(
size_t k=kbegin; k<kend; ++k ) {
5409 const SIMDType a1(
set( A(i ,k) ) );
5410 const SIMDType a2(
set( A(i+1UL,k) ) );
5411 const SIMDType b1( B.load(k,j ) );
5412 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5413 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5414 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
5415 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
5428 C.store( i , j , xmm1 * factor );
5429 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5430 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5431 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
5432 C.store( i , j+
SIMDSIZE*4UL, xmm5 * factor );
5433 C.store( i+1UL, j , xmm6 * factor );
5434 C.store( i+1UL, j+
SIMDSIZE , xmm7 * factor );
5435 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 * factor );
5436 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 * factor );
5437 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 * factor );
// Single row of the 5-vector tile (the guarding condition is not visible here).
5442 const size_t kbegin( ( IsUpper_v<MT4> )
5443 ?( ( IsLower_v<MT5> )
5444 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5445 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5446 :( IsLower_v<MT5> ? j : 0UL ) );
5447 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
5449 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5451 for(
size_t k=kbegin; k<kend; ++k ) {
5452 const SIMDType a1(
set( A(i,k) ) );
5453 xmm1 += a1 * B.load(k,j );
5454 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5455 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5456 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5457 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
5460 C.store( i, j , xmm1 * factor );
5461 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5462 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5463 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5464 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
// --- Tiles of 4 SIMD vectors (the column loop header is not visible here) ---
// Row range: ends at min(j+4*SIMDSIZE, M) for SYM/HERM/UPP, starts at j for LOW.
5470 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE*4UL,M) : M );
5471 size_t i( LOW ? j : 0UL );
// Two rows per iteration (the accumulation statements are not visible here).
5473 for( ; (i+2UL) <= iend; i+=2UL )
5475 const size_t kbegin( ( IsUpper_v<MT4> )
5476 ?( ( IsLower_v<MT5> )
5477 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5478 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5479 :( IsLower_v<MT5> ? j : 0UL ) );
5480 const size_t kend( ( IsLower_v<MT4> )
5481 ?( ( IsUpper_v<MT5> )
5482 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
5483 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5484 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
5486 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5488 for(
size_t k=kbegin; k<kend; ++k ) {
5489 const SIMDType a1(
set( A(i ,k) ) );
5490 const SIMDType a2(
set( A(i+1UL,k) ) );
5491 const SIMDType b1( B.load(k,j ) );
5492 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5493 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5494 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
5505 C.store( i , j , xmm1 * factor );
5506 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5507 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5508 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
5509 C.store( i+1UL, j , xmm5 * factor );
5510 C.store( i+1UL, j+
SIMDSIZE , xmm6 * factor );
5511 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 * factor );
5512 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 * factor );
// Single row of the 4-vector tile (the guarding condition is not visible here).
5517 const size_t kbegin( ( IsUpper_v<MT4> )
5518 ?( ( IsLower_v<MT5> )
5519 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5520 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5521 :( IsLower_v<MT5> ? j : 0UL ) );
5522 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
5524 SIMDType xmm1, xmm2, xmm3, xmm4;
5526 for(
size_t k=kbegin; k<kend; ++k ) {
5527 const SIMDType a1(
set( A(i,k) ) );
5528 xmm1 += a1 * B.load(k,j );
5529 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5530 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5531 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5534 C.store( i, j , xmm1 * factor );
5535 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5536 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5537 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
// --- Tiles of 3 SIMD vectors (the column loop header is not visible here) ---
5543 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE*3UL,M) : M );
5544 size_t i( LOW ? j : 0UL );
// Two rows per iteration (the accumulation statements are not visible here).
5546 for( ; (i+2UL) <= iend; i+=2UL )
5548 const size_t kbegin( ( IsUpper_v<MT4> )
5549 ?( ( IsLower_v<MT5> )
5550 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5551 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5552 :( IsLower_v<MT5> ? j : 0UL ) );
5553 const size_t kend( ( IsLower_v<MT4> )
5554 ?( ( IsUpper_v<MT5> )
5555 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
5556 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5557 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
5559 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5561 for(
size_t k=kbegin; k<kend; ++k ) {
5562 const SIMDType a1(
set( A(i ,k) ) );
5563 const SIMDType a2(
set( A(i+1UL,k) ) );
5564 const SIMDType b1( B.load(k,j ) );
5565 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5566 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5575 C.store( i , j , xmm1 * factor );
5576 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5577 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5578 C.store( i+1UL, j , xmm4 * factor );
5579 C.store( i+1UL, j+
SIMDSIZE , xmm5 * factor );
5580 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 * factor );
// Single row of the 3-vector tile (the guarding condition is not visible here).
5585 const size_t kbegin( ( IsUpper_v<MT4> )
5586 ?( ( IsLower_v<MT5> )
5587 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5588 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5589 :( IsLower_v<MT5> ? j : 0UL ) );
5590 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
5592 SIMDType xmm1, xmm2, xmm3;
5594 for(
size_t k=kbegin; k<kend; ++k ) {
5595 const SIMDType a1(
set( A(i,k) ) );
5596 xmm1 += a1 * B.load(k,j );
5597 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5598 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5601 C.store( i, j , xmm1 * factor );
5602 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5603 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
// --- Tiles of 2 SIMD vectors (the column loop header is not visible here) ---
5609 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
5610 size_t i( LOW ? j : 0UL );
// Four rows per iteration (the accumulation statements are not visible here).
5612 for( ; (i+4UL) <= iend; i+=4UL )
5614 const size_t kbegin( ( IsUpper_v<MT4> )
5615 ?( ( IsLower_v<MT5> )
5616 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5617 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5618 :( IsLower_v<MT5> ? j : 0UL ) );
5619 const size_t kend( ( IsLower_v<MT4> )
5620 ?( ( IsUpper_v<MT5> )
5621 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
5622 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5623 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5625 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5627 for(
size_t k=kbegin; k<kend; ++k ) {
5628 const SIMDType a1(
set( A(i ,k) ) );
5629 const SIMDType a2(
set( A(i+1UL,k) ) );
5630 const SIMDType a3(
set( A(i+2UL,k) ) );
5631 const SIMDType a4(
set( A(i+3UL,k) ) );
5632 const SIMDType b1( B.load(k,j ) );
5633 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
5644 C.store( i , j , xmm1 * factor );
5645 C.store( i , j+
SIMDSIZE, xmm2 * factor );
5646 C.store( i+1UL, j , xmm3 * factor );
5647 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
5648 C.store( i+2UL, j , xmm5 * factor );
5649 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
5650 C.store( i+3UL, j , xmm7 * factor );
5651 C.store( i+3UL, j+
SIMDSIZE, xmm8 * factor );
// Three rows per iteration (the accumulation statements are not visible here).
5654 for( ; (i+3UL) <= iend; i+=3UL )
5656 const size_t kbegin( ( IsUpper_v<MT4> )
5657 ?( ( IsLower_v<MT5> )
5658 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5659 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5660 :( IsLower_v<MT5> ? j : 0UL ) );
5661 const size_t kend( ( IsLower_v<MT4> )
5662 ?( ( IsUpper_v<MT5> )
5663 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
5664 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5665 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5667 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5669 for(
size_t k=kbegin; k<kend; ++k ) {
5670 const SIMDType a1(
set( A(i ,k) ) );
5671 const SIMDType a2(
set( A(i+1UL,k) ) );
5672 const SIMDType a3(
set( A(i+2UL,k) ) );
5673 const SIMDType b1( B.load(k,j ) );
5674 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
5683 C.store( i , j , xmm1 * factor );
5684 C.store( i , j+
SIMDSIZE, xmm2 * factor );
5685 C.store( i+1UL, j , xmm3 * factor );
5686 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
5687 C.store( i+2UL, j , xmm5 * factor );
5688 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
// Two rows per iteration with the k-loop unrolled by two; the two accumulator sets are
// summed at the store (the accumulation statements are not visible here).
5691 for( ; (i+2UL) <= iend; i+=2UL )
5693 const size_t kbegin( ( IsUpper_v<MT4> )
5694 ?( ( IsLower_v<MT5> )
5695 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5696 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5697 :( IsLower_v<MT5> ? j : 0UL ) );
5698 const size_t kend( ( IsLower_v<MT4> )
5699 ?( ( IsUpper_v<MT5> )
5700 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
5701 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5702 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5704 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5707 for( ; (k+2UL) <= kend; k+=2UL ) {
5708 const SIMDType a1(
set( A(i ,k ) ) );
5709 const SIMDType a2(
set( A(i+1UL,k ) ) );
5710 const SIMDType a3(
set( A(i ,k+1UL) ) );
5711 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
5712 const SIMDType b1( B.load(k ,j ) );
5713 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
5714 const SIMDType b3( B.load(k+1UL,j ) );
5715 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k iteration(s).
5726 for( ; k<kend; ++k ) {
5727 const SIMDType a1(
set( A(i ,k) ) );
5728 const SIMDType a2(
set( A(i+1UL,k) ) );
5729 const SIMDType b1( B.load(k,j ) );
5730 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
5737 C.store( i , j , (xmm1+xmm5) * factor );
5738 C.store( i , j+
SIMDSIZE, (xmm2+xmm6) * factor );
5739 C.store( i+1UL, j , (xmm3+xmm7) * factor );
5740 C.store( i+1UL, j+
SIMDSIZE, (xmm4+xmm8) * factor );
// Single row of the 2-vector tile with the k-loop unrolled by two
// (the guarding condition is not visible here).
5745 const size_t kbegin( ( IsUpper_v<MT4> )
5746 ?( ( IsLower_v<MT5> )
5747 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5748 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5749 :( IsLower_v<MT5> ? j : 0UL ) );
5750 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
5752 SIMDType xmm1, xmm2, xmm3, xmm4;
5755 for( ; (k+2UL) <= kend; k+=2UL ) {
5756 const SIMDType a1(
set( A(i,k ) ) );
5757 const SIMDType a2(
set( A(i,k+1UL) ) );
5758 xmm1 += a1 * B.load(k ,j );
5759 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
5760 xmm3 += a2 * B.load(k+1UL,j );
5761 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
5764 for( ; k<kend; ++k ) {
5765 const SIMDType a1(
set( A(i,k) ) );
5766 xmm1 += a1 * B.load(k,j );
5770 C.store( i, j , (xmm1+xmm3) * factor );
5771 C.store( i, j+
SIMDSIZE, (xmm2+xmm4) * factor );
// --- Tiles of a single SIMD vector (the column loop header is not visible here) ---
5777 const size_t iend( SYM || HERM || UPP ?
min(j+
SIMDSIZE,M) : M );
5778 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k-loop unrolled by two.
5780 for( ; (i+4UL) <= iend; i+=4UL )
5782 const size_t kbegin( ( IsUpper_v<MT4> )
5783 ?( ( IsLower_v<MT5> )
5784 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5785 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5786 :( IsLower_v<MT5> ? j : 0UL ) );
5787 const size_t kend( ( IsLower_v<MT4> )
5788 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5791 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5794 for( ; (k+2UL) <= kend; k+=2UL ) {
5795 const SIMDType b1( B.load(k ,j) );
5796 const SIMDType b2( B.load(k+1UL,j) );
5797 xmm1 +=
set( A(i ,k ) ) * b1;
5798 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5799 xmm3 +=
set( A(i+2UL,k ) ) * b1;
5800 xmm4 +=
set( A(i+3UL,k ) ) * b1;
5801 xmm5 +=
set( A(i ,k+1UL) ) * b2;
5802 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
5803 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
5804 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
5807 for( ; k<kend; ++k ) {
5808 const SIMDType b1( B.load(k,j) );
5809 xmm1 +=
set( A(i ,k) ) * b1;
5810 xmm2 +=
set( A(i+1UL,k) ) * b1;
5811 xmm3 +=
set( A(i+2UL,k) ) * b1;
5812 xmm4 +=
set( A(i+3UL,k) ) * b1;
5815 C.store( i , j, (xmm1+xmm5) * factor );
5816 C.store( i+1UL, j, (xmm2+xmm6) * factor );
5817 C.store( i+2UL, j, (xmm3+xmm7) * factor );
5818 C.store( i+3UL, j, (xmm4+xmm8) * factor );
// Three rows per iteration, k-loop unrolled by two.
5821 for( ; (i+3UL) <= iend; i+=3UL )
5823 const size_t kbegin( ( IsUpper_v<MT4> )
5824 ?( ( IsLower_v<MT5> )
5825 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5826 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5827 :( IsLower_v<MT5> ? j : 0UL ) );
5828 const size_t kend( ( IsLower_v<MT4> )
5829 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
5832 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5835 for( ; (k+2UL) <= kend; k+=2UL ) {
5836 const SIMDType b1( B.load(k ,j) );
5837 const SIMDType b2( B.load(k+1UL,j) );
5838 xmm1 +=
set( A(i ,k ) ) * b1;
5839 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5840 xmm3 +=
set( A(i+2UL,k ) ) * b1;
5841 xmm4 +=
set( A(i ,k+1UL) ) * b2;
5842 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
5843 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
5846 for( ; k<kend; ++k ) {
5847 const SIMDType b1( B.load(k,j) );
5848 xmm1 +=
set( A(i ,k) ) * b1;
5849 xmm2 +=
set( A(i+1UL,k) ) * b1;
5850 xmm3 +=
set( A(i+2UL,k) ) * b1;
5853 C.store( i , j, (xmm1+xmm4) * factor );
5854 C.store( i+1UL, j, (xmm2+xmm5) * factor );
5855 C.store( i+2UL, j, (xmm3+xmm6) * factor );
// Two rows per iteration, k-loop unrolled by two.
5858 for( ; (i+2UL) <= iend; i+=2UL )
5860 const size_t kbegin( ( IsUpper_v<MT4> )
5861 ?( ( IsLower_v<MT5> )
5862 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5863 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5864 :( IsLower_v<MT5> ? j : 0UL ) );
5865 const size_t kend( ( IsLower_v<MT4> )
5866 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5869 SIMDType xmm1, xmm2, xmm3, xmm4;
5872 for( ; (k+2UL) <= kend; k+=2UL ) {
5873 const SIMDType b1( B.load(k ,j) );
5874 const SIMDType b2( B.load(k+1UL,j) );
5875 xmm1 +=
set( A(i ,k ) ) * b1;
5876 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5877 xmm3 +=
set( A(i ,k+1UL) ) * b2;
5878 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
5881 for( ; k<kend; ++k ) {
5882 const SIMDType b1( B.load(k,j) );
5883 xmm1 +=
set( A(i ,k) ) * b1;
5884 xmm2 +=
set( A(i+1UL,k) ) * b1;
5887 C.store( i , j, (xmm1+xmm3) * factor );
5888 C.store( i+1UL, j, (xmm2+xmm4) * factor );
// Single row of the 1-vector tile; here k runs up to K
// (the guarding condition and the leftover-k loop header are not visible here).
5893 const size_t kbegin( ( IsUpper_v<MT4> )
5894 ?( ( IsLower_v<MT5> )
5895 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5896 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5897 :( IsLower_v<MT5> ? j : 0UL ) );
5899 SIMDType xmm1, xmm2;
5902 for( ; (k+2UL) <= K; k+=2UL ) {
5903 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
5904 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
5908 xmm1 +=
set( A(i,k) ) * B.load(k,j);
5911 C.store( i, j, (xmm1+xmm2) * factor );
// --- Element-wise remainder columns (only when remainder handling is required) ---
5915 for( ; remainder && j<N; ++j )
5917 size_t i( LOW && UPP ? j : 0UL );
// Two rows per iteration (the declarations of value1/value2 are not visible here).
5919 for( ; (i+2UL) <= M; i+=2UL )
5921 const size_t kbegin( ( IsUpper_v<MT4> )
5922 ?( ( IsLower_v<MT5> )
5923 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5924 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5925 :( IsLower_v<MT5> ? j : 0UL ) );
5926 const size_t kend( ( IsLower_v<MT4> )
5927 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
5933 for(
size_t k=kbegin; k<kend; ++k ) {
5934 value1 += A(i ,k) * B(k,j);
5935 value2 += A(i+1UL,k) * B(k,j);
5938 C(i ,j) = value1 * scalar;
5939 C(i+1UL,j) = value2 * scalar;
// Single row of a remainder column (the guarding condition is not visible here).
5944 const size_t kbegin( ( IsUpper_v<MT4> )
5945 ?( ( IsLower_v<MT5> )
5946 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5947 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5948 :( IsLower_v<MT5> ? j : 0UL ) );
5952 for(
size_t k=kbegin; k<K; ++k ) {
5953 value += A(i,k) * B(k,j);
5956 C(i,j) = value * scalar;
// --- Post-processing for structured results wider than 4*SIMDSIZE ---
// SYM/HERM: copy C(j,i) into C(i,j), conjugated when HERM (definition of jend not visible).
5961 if( ( SYM || HERM ) && ( N >
SIMDSIZE*4UL ) ) {
5962 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
5964 for(
size_t j=0UL; j<jend; ++j ) {
5965 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// LOW only: loop over columns from 4*SIMDSIZE on (loop body not visible here).
5969 else if( LOW && !UPP && N >
SIMDSIZE*4UL ) {
5970 for(
size_t j=
SIMDSIZE*4UL; j<N; ++j ) {
5972 for(
size_t i=0UL; i<iend; ++i ) {
// UPP only: loop over rows from 4*SIMDSIZE on (loop body not visible here).
5977 else if( !LOW && UPP && N >
SIMDSIZE*4UL ) {
5978 for(
size_t i=
SIMDSIZE*4UL; i<M; ++i ) {
5980 for(
size_t j=0UL; j<jend; ++j ) {
// Small-kernel scaled assignment for a column-major target: this overload participates only
// when MT3 is column-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
// It copies one operand into its OppositeType and re-dispatches the scaled product through
// assign(), wrapping the product expression with the ForwardFunctor before scaling.
// NOTE(review): the remaining template parameters and the braces are not visible in this excerpt.
6003 template<
typename MT3
6007 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6008 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6015 const ForwardFunctor fwd;
// A is not resizable but B is: convert A.
6017 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6018 const OppositeType_t<MT4> tmp(
serial( A ) );
6019 assign( C, fwd( tmp * B ) * scalar );
// A is resizable but B is not: convert B.
6021 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6022 const OppositeType_t<MT5> tmp(
serial( B ) );
6023 assign( C, fwd( A * tmp ) * scalar );
// Otherwise decide by element count: convert A when it has no more elements than B.
6025 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6026 const OppositeType_t<MT4> tmp(
serial( A ) );
6027 assign( C, fwd( tmp * B ) * scalar );
// Remaining case (the branch line itself is not visible here): convert B.
6030 const OppositeType_t<MT5> tmp(
serial( B ) );
6031 assign( C, fwd( A * tmp ) * scalar );
// Large-matrix assignment kernel, fallback overload.
// Selected when the vectorized default kernel is NOT applicable to the given
// type combination; it simply forwards to the default assignment kernel.
6050 template<
typename MT3
6054 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6055 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6057 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, vectorized overload.
// Selected when the vectorized default kernel is applicable. Exactly one of the
// smmm / hmmm / lmmm / ummm / mmm routines is called; the conditions choosing
// between them are on lines not visible in this excerpt.
// NOTE(review): presumably the SYM / HERM / LOW / UPP flags select the routine - confirm.
6076 template<
typename MT3
6080 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6081 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6084 smmm( C, A, B, scalar );
6086 hmmm( C, A, B, scalar );
// NOTE(review): the trailing ST2(0) is presumably the factor applied to the
// previous contents of C (0 = overwrite) - confirm against the lmmm/ummm/mmm
// declarations.
6088 lmmm( C, A, B, scalar, ST2(0) );
6090 ummm( C, A, B, scalar, ST2(0) );
6092 mmm( C, A, B, scalar, ST2(0) );
// BLAS-based assignment kernel, fallback overload.
// Selected when the BLAS kernel is NOT usable for the given type combination;
// it forwards to the large-matrix assignment kernel instead.
6110 template<
typename MT3
6114 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6115 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6117 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based assignment kernel, only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero, and only selected
// when UseBlasKernel_v holds for the type combination.
// A triangular left operand is routed to trmm with CblasLeft, a triangular
// right operand to trmm with CblasRight, everything else to gemm.
6122 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6136 template<
typename MT3
6140 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6141 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target before being
// passed to the BLAS wrappers.
6143 using ET = ElementType_t<MT3>;
// NOTE(review): a statement between the condition and each trmm call is not
// visible here; since trmm receives only C and one operand, C presumably is
// initialised from the other operand first - confirm.
6145 if( IsTriangular_v<MT4> ) {
6147 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6149 else if( IsTriangular_v<MT5> ) {
6151 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm with ET(scalar) and ET(0) as the two scaling arguments.
6154 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment overload that goes through a temporary (its signature line is not
// visible in this excerpt). It is disabled when the symmetry of the operands
// can be exploited for the target type MT.
6172 template<
typename MT
6175 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// The temporary uses OppositeType when SO is true and ResultType otherwise.
6179 using TmpType = If_t< SO, OppositeType, ResultType >;
6191 const ForwardFunctor fwd;
// Evaluate the expression serially into the temporary, then assign the
// forwarded temporary to the target.
6193 const TmpType tmp(
serial( rhs ) );
6194 assign( ~lhs, fwd( tmp ) );
// Assignment overload that restructures the product (its signature line is not
// visible in this excerpt). It is enabled when CanExploitSymmetry_v holds, i.e.
// the target is column-major and at least one operand is symmetric.
// Every operand that is symmetric is wrapped in trans() before the product is
// re-assigned; transposing a symmetric matrix does not change its values.
6212 template<
typename MT >
6214 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6223 const ForwardFunctor fwd;
// Operands of the wrapped matrix/matrix multiplication.
6225 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6226 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
6228 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6229 assign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
// Only the left operand is symmetric: transpose the left operand.
6230 else if( IsSymmetric_v<MT1> )
6231 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
// Remaining case (branch header not visible here): transpose the right operand.
6233 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Addition assignment of a scaled dense matrix/dense matrix product to a dense
// matrix. This overload is disabled when the symmetry of the operands can be
// exploited for the target type MT.
6249 template<
typename MT
6251 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6252 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Operands of the wrapped matrix/matrix multiplication.
6259 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6260 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: the target has no rows or no columns, or the inner
// dimension is zero.
// NOTE(review): the body of this branch is not visible; presumably an early
// return since nothing has to be added - confirm.
6262 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are defined on lines not shown here; presumably the
// (possibly evaluated) left and right operands - confirm.
6276 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the kernel for C += ( A * B ) * scalar.
// The small kernel is used when any of the following holds:
//  - B is a diagonal matrix type,
//  - debug mode is off and B has at most SIMDSIZE*10 columns,
//  - the number of elements of C is below DMATDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel selection is used (the separating 'else' line is
// not visible in this excerpt).
6291 template<
typename MT3
6295 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6297 if( ( IsDiagonal_v<MT5> ) ||
6298 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
6299 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6300 selectSmallAddAssignKernel( C, A, B, scalar );
6302 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the case that neither A nor B is a
// diagonal matrix type. The result is added to C through a temporary.
6320 template<
typename MT3
6324 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6325 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// NOTE(review): tmp is defined on a line not visible here; presumably it holds
// the evaluated scaled product A * B * scalar - confirm.
6328 addAssign( C, tmp );
// Default addition assignment kernel for a non-diagonal A times a diagonal B.
// Only the diagonal element B(j,j) contributes to column j, so each update is
// C(i,j) += A(i,j) * B(j,j) * scalar.
6346 template<
typename MT3
6350 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6351 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6355 const size_t M( A.rows() );
6356 const size_t N( B.columns() );
6358 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed by the triangular traits of A. The
// alternatives for non-upper / non-lower A are on lines not visible here.
6360 const size_t jbegin( ( IsUpper_v<MT4> )
6361 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6363 const size_t jend( ( IsLower_v<MT4> )
6364 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos ends the part of the range that is processed two columns at a time:
// masking with size_t(-2) rounds jnum down to an even count.
6368 const size_t jnum( jend - jbegin );
6369 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6371 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6372 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6373 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Single leftover column (its guard is not visible in this excerpt).
6376 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition assignment kernel for a diagonal A times a non-diagonal B.
// Only the diagonal element A(i,i) contributes to row i, so each update is
// C(i,j) += A(i,i) * B(i,j) * scalar.
6396 template<
typename MT3
6400 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6401 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6405 const size_t M( A.rows() );
6406 const size_t N( B.columns() );
6408 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed by the triangular traits of B. The
// alternatives for non-upper / non-lower B are on lines not visible here.
6410 const size_t jbegin( ( IsUpper_v<MT5> )
6411 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
6413 const size_t jend( ( IsLower_v<MT5> )
6414 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos ends the part of the range that is processed two columns at a time:
// masking with size_t(-2) rounds jnum down to an even count.
6418 const size_t jnum( jend - jbegin );
6419 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6421 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6422 C(i,j ) += A(i,i) * B(i,j ) * scalar;
6423 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Single leftover column (its guard is not visible in this excerpt).
6426 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition assignment kernel for the product of two diagonal matrices.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar.
6446 template<
typename MT3
6450 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6451 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6455 for(
size_t i=0UL; i<A.rows(); ++i ) {
6456 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel, fallback overload.
// Selected when the vectorized default kernel is NOT applicable; it forwards
// to the default addition assignment kernel.
6475 template<
typename MT3
6479 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6480 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6482 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix addition assignment kernel, C += ( A * B ) * scalar, for the
// overload that is enabled when the target MT3 is row-major and the vectorized
// default kernel is applicable.
//
// The kernel walks over C in column blocks of decreasing width (8, 5, 4, 3, 2
// and 1 SIMD vectors, then single scalar columns). Inside a block it handles
// several rows at once where possible. Products are accumulated in the SIMD
// variables xmmN and finally added, scaled by 'factor', onto the values already
// stored in C. Many loop headers and guards of the original function are not
// visible in this excerpt.
6501 template<
typename MT3
6505 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6506 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder pass over the columns is needed unless both C and B are
// padded.
6508 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
6510 const size_t M( A.rows() );
6511 const size_t N( B.columns() );
6512 const size_t K( A.columns() );
// End of the columns handled by SIMD code: with a remainder pass N is masked
// with size_t(-SIMDSIZE).
// NOTE(review): this rounds N down to a multiple of SIMDSIZE only if SIMDSIZE
// is a power of two - confirm.
6516 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// The scalar replicated into a SIMD value, applied once per accumulator.
6519 const SIMDType factor(
set( scalar ) );
// --- Column block of 8 SIMD vectors, one row at a time. This section follows
// --- the check for integral element types; the column loop header is not
// --- visible in this excerpt.
6523 if( IsIntegral_v<ElementType> )
6526 for(
size_t i=0UL; i<M; ++i )
// The k range is narrowed with the compile-time triangular traits of A (MT4)
// and B (MT5). The same pattern is repeated in every section below.
// NOTE(review): presumably this skips terms that are structurally zero - confirm.
6528 const size_t kbegin( ( IsUpper_v<MT4> )
6529 ?( ( IsLower_v<MT5> )
6530 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6531 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6532 :( IsLower_v<MT5> ? j : 0UL ) );
6533 const size_t kend( ( IsLower_v<MT4> )
6534 ?( ( IsUpper_v<MT5> )
6535 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
6536 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
6537 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
6539 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// A(i,k) is replicated into a SIMD value once and multiplied with eight
// consecutive SIMD vectors of row k of B.
6541 for(
size_t k=kbegin; k<kend; ++k ) {
6542 const SIMDType a1(
set( A(i,k) ) );
6543 xmm1 += a1 * B.load(k,j );
6544 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6545 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6546 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
6547 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
6548 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
6549 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
6550 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Add the scaled accumulator onto the existing content of C.
6553 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Column block of 5 SIMD vectors, two rows at a time.
6569 for( ; (i+2UL) <= M; i+=2UL )
6571 const size_t kbegin( ( IsUpper_v<MT4> )
6572 ?( ( IsLower_v<MT5> )
6573 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6574 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6575 :( IsLower_v<MT5> ? j : 0UL ) );
6576 const size_t kend( ( IsLower_v<MT4> )
6577 ?( ( IsUpper_v<MT5> )
6578 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
6579 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6580 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
6582 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6584 for(
size_t k=kbegin; k<kend; ++k ) {
6585 const SIMDType a1(
set( A(i ,k) ) );
6586 const SIMDType a2(
set( A(i+1UL,k) ) );
6587 const SIMDType b1( B.load(k,j ) );
6588 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
6589 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
6590 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
6591 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
6604 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6609 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
6611 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm8 * factor );
6612 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm9 * factor );
6613 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) + xmm10 * factor );
// One-row variant of the 5-vector block (its guard is not visible here).
6618 const size_t kbegin( ( IsUpper_v<MT4> )
6619 ?( ( IsLower_v<MT5> )
6620 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6621 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6622 :( IsLower_v<MT5> ? j : 0UL ) );
6623 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
6625 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6627 for(
size_t k=kbegin; k<kend; ++k ) {
6628 const SIMDType a1(
set( A(i,k) ) );
6629 xmm1 += a1 * B.load(k,j );
6630 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6631 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6632 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
6633 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
6636 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Column block of 4 SIMD vectors, two rows at a time.
6648 for( ; (i+2UL) <= M; i+=2UL )
6650 const size_t kbegin( ( IsUpper_v<MT4> )
6651 ?( ( IsLower_v<MT5> )
6652 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6653 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6654 :( IsLower_v<MT5> ? j : 0UL ) );
6655 const size_t kend( ( IsLower_v<MT4> )
6656 ?( ( IsUpper_v<MT5> )
6657 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
6658 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6659 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
6661 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6663 for(
size_t k=kbegin; k<kend; ++k ) {
6664 const SIMDType a1(
set( A(i ,k) ) );
6665 const SIMDType a2(
set( A(i+1UL,k) ) );
6666 const SIMDType b1( B.load(k,j ) );
6667 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
6668 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
6669 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
6680 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6684 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
6686 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm7 * factor );
6687 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm8 * factor );
// One-row variant of the 4-vector block (its guard is not visible here).
6692 const size_t kbegin( ( IsUpper_v<MT4> )
6693 ?( ( IsLower_v<MT5> )
6694 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6695 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6696 :( IsLower_v<MT5> ? j : 0UL ) );
6697 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
6699 SIMDType xmm1, xmm2, xmm3, xmm4;
6701 for(
size_t k=kbegin; k<kend; ++k ) {
6702 const SIMDType a1(
set( A(i,k) ) );
6703 xmm1 += a1 * B.load(k,j );
6704 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6705 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6706 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
6709 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Column block of 3 SIMD vectors, two rows at a time.
6720 for( ; (i+2UL) <= M; i+=2UL )
6722 const size_t kbegin( ( IsUpper_v<MT4> )
6723 ?( ( IsLower_v<MT5> )
6724 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6725 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6726 :( IsLower_v<MT5> ? j : 0UL ) );
6727 const size_t kend( ( IsLower_v<MT4> )
6728 ?( ( IsUpper_v<MT5> )
6729 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
6730 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6731 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
6733 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6735 for(
size_t k=kbegin; k<kend; ++k ) {
6736 const SIMDType a1(
set( A(i ,k) ) );
6737 const SIMDType a2(
set( A(i+1UL,k) ) );
6738 const SIMDType b1( B.load(k,j ) );
6739 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
6740 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
6749 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6752 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
6754 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm6 * factor );
// One-row variant of the 3-vector block (its guard is not visible here).
6759 const size_t kbegin( ( IsUpper_v<MT4> )
6760 ?( ( IsLower_v<MT5> )
6761 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6762 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6763 :( IsLower_v<MT5> ? j : 0UL ) );
6764 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
6766 SIMDType xmm1, xmm2, xmm3;
6768 for(
size_t k=kbegin; k<kend; ++k ) {
6769 const SIMDType a1(
set( A(i,k) ) );
6770 xmm1 += a1 * B.load(k,j );
6771 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6772 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6775 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Column block of 2 SIMD vectors. From here on the row range itself is
// --- restricted: it ends at min(j+SIMDSIZE*2,M) when UPP is set and starts at
// --- row j when LOW is set. Rows are processed 4, then 3, then 2 at a time.
6783 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
6784 size_t i( LOW ? j : 0UL );
6786 for( ; (i+4UL) <= iend; i+=4UL )
6788 const size_t kbegin( ( IsUpper_v<MT4> )
6789 ?( ( IsLower_v<MT5> )
6790 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6791 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6792 :( IsLower_v<MT5> ? j : 0UL ) );
6793 const size_t kend( ( IsLower_v<MT4> )
6794 ?( ( IsUpper_v<MT5> )
6795 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
6796 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
6797 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6799 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6801 for(
size_t k=kbegin; k<kend; ++k ) {
6802 const SIMDType a1(
set( A(i ,k) ) );
6803 const SIMDType a2(
set( A(i+1UL,k) ) );
6804 const SIMDType a3(
set( A(i+2UL,k) ) );
6805 const SIMDType a4(
set( A(i+3UL,k) ) );
6806 const SIMDType b1( B.load(k,j ) );
6807 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
6818 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6820 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
6822 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
6824 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
// Three rows at a time.
6828 for( ; (i+3UL) <= iend; i+=3UL )
6830 const size_t kbegin( ( IsUpper_v<MT4> )
6831 ?( ( IsLower_v<MT5> )
6832 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6833 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6834 :( IsLower_v<MT5> ? j : 0UL ) );
6835 const size_t kend( ( IsLower_v<MT4> )
6836 ?( ( IsUpper_v<MT5> )
6837 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
6838 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
6839 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6841 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6843 for(
size_t k=kbegin; k<kend; ++k ) {
6844 const SIMDType a1(
set( A(i ,k) ) );
6845 const SIMDType a2(
set( A(i+1UL,k) ) );
6846 const SIMDType a3(
set( A(i+2UL,k) ) );
6847 const SIMDType b1( B.load(k,j ) );
6848 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
6857 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6859 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
6861 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
// Two rows at a time; the k loop is additionally unrolled by two, using a
// second set of accumulators (xmm5..xmm8) that is summed with the first set
// when storing.
6865 for( ; (i+2UL) <= iend; i+=2UL )
6867 const size_t kbegin( ( IsUpper_v<MT4> )
6868 ?( ( IsLower_v<MT5> )
6869 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6870 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6871 :( IsLower_v<MT5> ? j : 0UL ) );
6872 const size_t kend( ( IsLower_v<MT4> )
6873 ?( ( IsUpper_v<MT5> )
6874 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
6875 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6876 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6878 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6881 for( ; (k+2UL) <= kend; k+=2UL ) {
6882 const SIMDType a1(
set( A(i ,k ) ) );
6883 const SIMDType a2(
set( A(i+1UL,k ) ) );
6884 const SIMDType a3(
set( A(i ,k+1UL) ) );
6885 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
6886 const SIMDType b1( B.load(k ,j ) );
6887 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
6888 const SIMDType b3( B.load(k+1UL,j ) );
6889 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k iteration after the unrolled loop.
6900 for( ; k<kend; ++k ) {
6901 const SIMDType a1(
set( A(i ,k) ) );
6902 const SIMDType a2(
set( A(i+1UL,k) ) );
6903 const SIMDType b1( B.load(k,j ) );
6904 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
6911 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
6913 C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
6914 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) + (xmm4+xmm8) * factor );
// One-row variant of the 2-vector block (its guard is not visible here).
6919 const size_t kbegin( ( IsUpper_v<MT4> )
6920 ?( ( IsLower_v<MT5> )
6921 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6922 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6923 :( IsLower_v<MT5> ? j : 0UL ) );
6924 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
6926 SIMDType xmm1, xmm2, xmm3, xmm4;
6929 for( ; (k+2UL) <= kend; k+=2UL ) {
6930 const SIMDType a1(
set( A(i,k ) ) );
6931 const SIMDType a2(
set( A(i,k+1UL) ) );
6932 xmm1 += a1 * B.load(k ,j );
6933 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
6934 xmm3 += a2 * B.load(k+1UL,j );
6935 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
6938 for( ; k<kend; ++k ) {
6939 const SIMDType a1(
set( A(i,k) ) );
6940 xmm1 += a1 * B.load(k,j );
6944 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
// --- Column block of a single SIMD vector. Here the upper row bound is only
// --- restricted when both LOW and UPP are set. Rows are again processed 4, 3,
// --- 2 and 1 at a time, with the k loop unrolled by two.
6951 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
6952 size_t i( LOW ? j : 0UL );
6954 for( ; (i+4UL) <= iend; i+=4UL )
6956 const size_t kbegin( ( IsUpper_v<MT4> )
6957 ?( ( IsLower_v<MT5> )
6958 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6959 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6960 :( IsLower_v<MT5> ? j : 0UL ) );
6961 const size_t kend( ( IsLower_v<MT4> )
6962 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
6965 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6968 for( ; (k+2UL) <= kend; k+=2UL ) {
6969 const SIMDType b1( B.load(k ,j) );
6970 const SIMDType b2( B.load(k+1UL,j) );
6971 xmm1 +=
set( A(i ,k ) ) * b1;
6972 xmm2 +=
set( A(i+1UL,k ) ) * b1;
6973 xmm3 +=
set( A(i+2UL,k ) ) * b1;
6974 xmm4 +=
set( A(i+3UL,k ) ) * b1;
6975 xmm5 +=
set( A(i ,k+1UL) ) * b2;
6976 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
6977 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
6978 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
6981 for( ; k<kend; ++k ) {
6982 const SIMDType b1( B.load(k,j) );
6983 xmm1 +=
set( A(i ,k) ) * b1;
6984 xmm2 +=
set( A(i+1UL,k) ) * b1;
6985 xmm3 +=
set( A(i+2UL,k) ) * b1;
6986 xmm4 +=
set( A(i+3UL,k) ) * b1;
// The even-k and odd-k partial sums of each row are combined before scaling.
6989 C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
6990 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
6991 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
6992 C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
// Three rows at a time.
6995 for( ; (i+3UL) <= iend; i+=3UL )
6997 const size_t kbegin( ( IsUpper_v<MT4> )
6998 ?( ( IsLower_v<MT5> )
6999 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7000 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7001 :( IsLower_v<MT5> ? j : 0UL ) );
7002 const size_t kend( ( IsLower_v<MT4> )
7003 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
7006 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7009 for( ; (k+2UL) <= kend; k+=2UL ) {
7010 const SIMDType b1( B.load(k ,j) );
7011 const SIMDType b2( B.load(k+1UL,j) );
7012 xmm1 +=
set( A(i ,k ) ) * b1;
7013 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7014 xmm3 +=
set( A(i+2UL,k ) ) * b1;
7015 xmm4 +=
set( A(i ,k+1UL) ) * b2;
7016 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
7017 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
7020 for( ; k<kend; ++k ) {
7021 const SIMDType b1( B.load(k,j) );
7022 xmm1 +=
set( A(i ,k) ) * b1;
7023 xmm2 +=
set( A(i+1UL,k) ) * b1;
7024 xmm3 +=
set( A(i+2UL,k) ) * b1;
7027 C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
7028 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
7029 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
// Two rows at a time.
7032 for( ; (i+2UL) <= iend; i+=2UL )
7034 const size_t kbegin( ( IsUpper_v<MT4> )
7035 ?( ( IsLower_v<MT5> )
7036 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7037 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7038 :( IsLower_v<MT5> ? j : 0UL ) );
7039 const size_t kend( ( IsLower_v<MT4> )
7040 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7043 SIMDType xmm1, xmm2, xmm3, xmm4;
7046 for( ; (k+2UL) <= kend; k+=2UL ) {
7047 const SIMDType b1( B.load(k ,j) );
7048 const SIMDType b2( B.load(k+1UL,j) );
7049 xmm1 +=
set( A(i ,k ) ) * b1;
7050 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7051 xmm3 +=
set( A(i ,k+1UL) ) * b2;
7052 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
7055 for( ; k<kend; ++k ) {
7056 const SIMDType b1( B.load(k,j) );
7057 xmm1 +=
set( A(i ,k) ) * b1;
7058 xmm2 +=
set( A(i+1UL,k) ) * b1;
7061 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
7062 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
// One-row variant of the single-vector block (its guard is not visible here).
// This variant iterates k up to K directly.
7067 const size_t kbegin( ( IsUpper_v<MT4> )
7068 ?( ( IsLower_v<MT5> )
7069 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7070 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7071 :( IsLower_v<MT5> ? j : 0UL ) );
7073 SIMDType xmm1, xmm2;
7076 for( ; (k+2UL) <= K; k+=2UL ) {
7077 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
7078 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
7082 xmm1 +=
set( A(i,k) ) * B.load(k,j);
7085 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// --- Scalar remainder: columns not covered by the SIMD blocks are processed
// --- element-wise. This loop only runs when 'remainder' is true.
7089 for( ; remainder && j<N; ++j )
// Row range for column j: ends at j+1 when UPP is set, starts at j when LOW is
// set.
7091 const size_t iend( UPP ? j+1UL : M );
7092 size_t i( LOW ? j : 0UL );
7094 for( ; (i+2UL) <= iend; i+=2UL )
7096 const size_t kbegin( ( IsUpper_v<MT4> )
7097 ?( ( IsLower_v<MT5> )
7098 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7099 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7100 :( IsLower_v<MT5> ? j : 0UL ) );
7101 const size_t kend( ( IsLower_v<MT4> )
7102 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
// value1 / value2 are declared on lines not visible here; they collect the dot
// products for rows i and i+1.
7108 for(
size_t k=kbegin; k<kend; ++k ) {
7109 value1 += A(i ,k) * B(k,j);
7110 value2 += A(i+1UL,k) * B(k,j);
7113 C(i ,j) += value1 * scalar;
7114 C(i+1UL,j) += value2 * scalar;
// Single leftover row of the scalar remainder (its guard is not visible here).
7119 const size_t kbegin( ( IsUpper_v<MT4> )
7120 ?( ( IsLower_v<MT5> )
7121 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7122 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7123 :( IsLower_v<MT5> ? j : 0UL ) );
7127 for(
size_t k=kbegin; k<K; ++k ) {
7128 value += A(i,k) * B(k,j);
7131 C(i,j) += value * scalar;
// Small-matrix addition assignment kernel, C += ( A * B ) * scalar.
// This overload is enabled when the target type MT3 is column-major and the
// vectorized default kernel is applicable. It contains no hand-written loop:
// one of the two operands is copied (via serial()) into its opposite storage
// order and the scaled product is handed back to addAssign().
7152 template<
typename MT3
7156 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7157 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// NOTE(review): fwd wraps every product below; its exact effect is defined by
// ForwardFunctor, which is declared outside this excerpt.
7164 const ForwardFunctor fwd;
// A is not resizable but B is: convert A.
7166 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7167 const OppositeType_t<MT4> tmp(
serial( A ) );
7168 addAssign( C, fwd( tmp * B ) * scalar );
// A is resizable but B is not: convert B.
7170 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7171 const OppositeType_t<MT5> tmp(
serial( B ) );
7172 addAssign( C, fwd( A * tmp ) * scalar );
// Otherwise decide at run time: convert A when it has no more elements than B.
7174 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7175 const OppositeType_t<MT4> tmp(
serial( A ) );
7176 addAssign( C, fwd( tmp * B ) * scalar );
// Remaining case (its branch header is not visible in this excerpt): convert B.
7179 const OppositeType_t<MT5> tmp(
serial( B ) );
7180 addAssign( C, fwd( A * tmp ) * scalar );
// Large-matrix addition assignment kernel, fallback overload.
// Selected when the vectorized default kernel is NOT applicable; it forwards
// to the default addition assignment kernel.
7199 template<
typename MT3
7203 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7204 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7206 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel, vectorized overload.
// Selected when the vectorized default kernel is applicable. Exactly one of the
// lmmm / ummm / mmm routines is called; the conditions choosing between them
// are on lines not visible in this excerpt.
// NOTE(review): presumably the LOW / UPP flags select the routine - confirm.
7225 template<
typename MT3
7229 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7230 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// NOTE(review): the trailing ST2(1) is presumably the factor applied to the
// previous contents of C (1 = accumulate) - confirm against the lmmm/ummm/mmm
// declarations.
7233 lmmm( C, A, B, scalar, ST2(1) );
7235 ummm( C, A, B, scalar, ST2(1) );
7237 mmm( C, A, B, scalar, ST2(1) );
// BLAS-based addition assignment kernel, fallback overload.
// Selected when the BLAS kernel is NOT usable for the given type combination;
// it forwards to the large-matrix addition assignment kernel instead.
7255 template<
typename MT3
7259 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7260 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7262 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based addition assignment kernel, only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero,
// and only selected when UseBlasKernel_v holds for the type combination.
// For a triangular operand the product is first formed in a temporary with
// trmm and then added to C; otherwise gemm is called on C directly.
7267 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7281 template<
typename MT3
7285 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7286 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target before being
// passed to the BLAS wrappers.
7288 using ET = ElementType_t<MT3>;
// Triangular left operand: tmp starts as a copy of B, trmm is applied to it
// with A on the left (CblasLeft), then tmp is added to C.
7290 if( IsTriangular_v<MT4> ) {
7291 ResultType_t<MT3> tmp(
serial( B ) );
7292 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7293 addAssign( C, tmp );
// Triangular right operand: tmp starts as a copy of A, trmm is applied to it
// with B on the right (CblasRight), then tmp is added to C.
7295 else if( IsTriangular_v<MT5> ) {
7296 ResultType_t<MT3> tmp(
serial( A ) );
7297 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7298 addAssign( C, tmp );
// General case: gemm with ET(scalar) and ET(1) as the two scaling arguments.
7301 gemm( C, A, B,
ET(scalar),
ET(1) );
// Addition assignment overload that restructures the product (its signature
// line is not visible in this excerpt). It is enabled when CanExploitSymmetry_v
// holds, i.e. the target is column-major and at least one operand is symmetric.
// Every operand that is symmetric is wrapped in trans() before the product is
// re-submitted to addAssign(); transposing a symmetric matrix does not change
// its values.
7321 template<
typename MT >
7323 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7332 const ForwardFunctor fwd;
// Operands of the wrapped matrix/matrix multiplication.
7334 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7335 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
7337 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7338 addAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
// Only the left operand is symmetric: transpose the left operand.
7339 else if( IsSymmetric_v<MT1> )
7340 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
// Remaining case (branch header not visible here): transpose the right operand.
7342 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Subtraction assignment of a scaled dense matrix/dense matrix product to a
// dense matrix. This overload is disabled when the symmetry of the operands
// can be exploited for the target type MT.
7362 template<
typename MT
7364 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7365 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Operands of the wrapped matrix/matrix multiplication.
7372 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7373 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: the target has no rows or no columns, or the inner
// dimension is zero.
// NOTE(review): the body of this branch is not visible; presumably an early
// return since nothing has to be subtracted - confirm.
7375 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are defined on lines not shown here; presumably the
// (possibly evaluated) left and right operands - confirm.
7389 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the kernel for C -= ( A * B ) * scalar.
// The small kernel is used when any of the following holds:
//  - B is a diagonal matrix type,
//  - debug mode is off and B has at most SIMDSIZE*10 columns,
//  - the number of elements of C is below DMATDMATMULT_THRESHOLD.
// Otherwise the BLAS kernel selection is used (the separating 'else' line is
// not visible in this excerpt).
7404 template<
typename MT3
7408 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7410 if( ( IsDiagonal_v<MT5> ) ||
7411 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
7412 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
7413 selectSmallSubAssignKernel( C, A, B, scalar );
7415 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the case that neither A nor B is a
// diagonal matrix type. The result is subtracted from C through a temporary.
7433 template<
typename MT3
7437 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7438 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// NOTE(review): tmp is defined on a line not visible here; presumably it holds
// the evaluated scaled product A * B * scalar - confirm.
7441 subAssign( C, tmp );
// Default subtraction assignment kernel for a non-diagonal A times a diagonal
// B. Only the diagonal element B(j,j) contributes to column j, so each update
// is C(i,j) -= A(i,j) * B(j,j) * scalar.
7459 template<
typename MT3
7463 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7464 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7468 const size_t M( A.rows() );
7469 const size_t N( B.columns() );
7471 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed by the triangular traits of A. The
// alternatives for non-upper / non-lower A are on lines not visible here.
7473 const size_t jbegin( ( IsUpper_v<MT4> )
7474 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7476 const size_t jend( ( IsLower_v<MT4> )
7477 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos ends the part of the range that is processed two columns at a time:
// masking with size_t(-2) rounds jnum down to an even count.
7481 const size_t jnum( jend - jbegin );
7482 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7484 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7485 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7486 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Single leftover column (its guard is not visible in this excerpt).
7489 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction assignment kernel for a diagonal A times a non-diagonal
// B. Only the diagonal element A(i,i) contributes to row i, so each update is
// C(i,j) -= A(i,i) * B(i,j) * scalar.
7509 template<
typename MT3
7513 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7514 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7518 const size_t M( A.rows() );
7519 const size_t N( B.columns() );
7521 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed by the triangular traits of B. The
// alternatives for non-upper / non-lower B are on lines not visible here.
7523 const size_t jbegin( ( IsUpper_v<MT5> )
7524 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7526 const size_t jend( ( IsLower_v<MT5> )
7527 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos ends the part of the range that is processed two columns at a time:
// masking with size_t(-2) rounds jnum down to an even count.
7531 const size_t jnum( jend - jbegin );
7532 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7534 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7535 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
7536 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Single leftover column (its guard is not visible in this excerpt).
7539 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction assignment kernel for the product of two diagonal
// matrices. Only the diagonal of C is touched:
// C(i,i) -= A(i,i) * B(i,i) * scalar.
7559 template<
typename MT3
7563 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7564 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7568 for(
size_t i=0UL; i<A.rows(); ++i ) {
7569 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel, fallback overload.
// Selected when the vectorized default kernel is NOT applicable; it forwards
// to the default subtraction assignment kernel.
7588 template<
typename MT3
7592 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7593 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7595 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for the scaled subtraction assignment, enabled for a
// row-major target C when UseVectorizedDefaultKernel_v holds. This part is the setup.
// NOTE(review): the listing drops many lines of this function (braces, the column-loop
// headers, most accumulator updates and stores); consult the full header for details.
7614 template<
typename MT3
7618 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7619 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder pass is needed unless both C and B are padded.
7621 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7623 const size_t M( A.rows() );
7624 const size_t N( B.columns() );
7625 const size_t K( A.columns() );
// End of the SIMD-processed columns: N masked with -SIMDSIZE when a remainder pass is
// needed (rounds down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two),
// otherwise N itself.
7629 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// SIMD value built from the scalar; multiplied into every accumulator before storing.
7632 const SIMDType factor(
set( scalar ) );
// Strip of 8*SIMDSIZE columns, one row of C at a time, with eight accumulators
// (xmm1..xmm8). It follows the IsIntegral_v<ElementType> check below.
// NOTE(review): the column-loop header and seven of the eight stores are not visible.
7636 if( IsIntegral_v<ElementType> )
7639 for(
size_t i=0UL; i<M; ++i )
// k range restricted by the triangular structure of A (via row i) and of B (via column j).
7641 const size_t kbegin( ( IsUpper_v<MT4> )
7642 ?( ( IsLower_v<MT5> )
7643 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7644 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7645 :( IsLower_v<MT5> ? j : 0UL ) );
7646 const size_t kend( ( IsLower_v<MT4> )
7647 ?( ( IsUpper_v<MT5> )
7648 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
7649 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7650 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
7652 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Accumulate A(i,k) times eight consecutive SIMD packs of row k of B.
7654 for(
size_t k=kbegin; k<kend; ++k ) {
7655 const SIMDType a1(
set( A(i,k) ) );
7656 xmm1 += a1 * B.load(k,j );
7657 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7658 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7659 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7660 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7661 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
7662 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
7663 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Subtract the scaled accumulator from the current content of C.
7666 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Strip of 5*SIMDSIZE columns: rows are handled in pairs (ten accumulators), followed by
// a single-row part (five accumulators).
// NOTE(review): the enclosing column loop, the accumulator updates of the paired loop
// and several stores are not visible in this listing.
7682 for( ; (i+2UL) <= M; i+=2UL )
7684 const size_t kbegin( ( IsUpper_v<MT4> )
7685 ?( ( IsLower_v<MT5> )
7686 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7687 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7688 :( IsLower_v<MT5> ? j : 0UL ) );
// Upper k bound accounts for the second row of the pair (i+1) when A is lower.
7689 const size_t kend( ( IsLower_v<MT4> )
7690 ?( ( IsUpper_v<MT5> )
7691 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
7692 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7693 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
7695 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7697 for(
size_t k=kbegin; k<kend; ++k ) {
7698 const SIMDType a1(
set( A(i ,k) ) );
7699 const SIMDType a2(
set( A(i+1UL,k) ) );
7700 const SIMDType b1( B.load(k,j ) );
7701 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7702 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7703 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
7704 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
// xmm1.. belong to row i, xmm6..xmm10 to row i+1 (as the visible stores show).
7717 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7722 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
7724 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm8 * factor );
7725 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm9 * factor );
7726 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) - xmm10 * factor );
// Single-row part of the 5*SIMDSIZE strip (presumably guarded by an i < M check that is
// not visible here - TODO confirm).
7731 const size_t kbegin( ( IsUpper_v<MT4> )
7732 ?( ( IsLower_v<MT5> )
7733 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7734 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7735 :( IsLower_v<MT5> ? j : 0UL ) );
7736 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
7738 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7740 for(
size_t k=kbegin; k<kend; ++k ) {
7741 const SIMDType a1(
set( A(i,k) ) );
7742 xmm1 += a1 * B.load(k,j );
7743 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7744 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7745 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7746 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7749 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Strip of 4*SIMDSIZE columns: rows are handled in pairs (eight accumulators), followed
// by a single-row part (four accumulators).
// NOTE(review): the enclosing column loop, the accumulator updates of the paired loop
// and several stores are not visible in this listing.
7761 for( ; (i+2UL) <= M; i+=2UL )
7763 const size_t kbegin( ( IsUpper_v<MT4> )
7764 ?( ( IsLower_v<MT5> )
7765 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7766 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7767 :( IsLower_v<MT5> ? j : 0UL ) );
7768 const size_t kend( ( IsLower_v<MT4> )
7769 ?( ( IsUpper_v<MT5> )
7770 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
7771 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7772 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
7774 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7776 for(
size_t k=kbegin; k<kend; ++k ) {
7777 const SIMDType a1(
set( A(i ,k) ) );
7778 const SIMDType a2(
set( A(i+1UL,k) ) );
7779 const SIMDType b1( B.load(k,j ) );
7780 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7781 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7782 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
// xmm1.. belong to row i, xmm5..xmm8 to row i+1 (as the visible stores show).
7793 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7797 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
7799 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm7 * factor );
7800 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm8 * factor );
// Single-row part of the 4*SIMDSIZE strip (presumably guarded by an i < M check that is
// not visible here - TODO confirm).
7805 const size_t kbegin( ( IsUpper_v<MT4> )
7806 ?( ( IsLower_v<MT5> )
7807 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7808 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7809 :( IsLower_v<MT5> ? j : 0UL ) );
7810 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
7812 SIMDType xmm1, xmm2, xmm3, xmm4;
7814 for(
size_t k=kbegin; k<kend; ++k ) {
7815 const SIMDType a1(
set( A(i,k) ) );
7816 xmm1 += a1 * B.load(k,j );
7817 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7818 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7819 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7822 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Strip of 3*SIMDSIZE columns: rows are handled in pairs (six accumulators), followed by
// a single-row part (three accumulators).
// NOTE(review): the enclosing column loop, the accumulator updates of the paired loop
// and several stores are not visible in this listing.
7833 for( ; (i+2UL) <= M; i+=2UL )
7835 const size_t kbegin( ( IsUpper_v<MT4> )
7836 ?( ( IsLower_v<MT5> )
7837 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7838 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7839 :( IsLower_v<MT5> ? j : 0UL ) );
7840 const size_t kend( ( IsLower_v<MT4> )
7841 ?( ( IsUpper_v<MT5> )
7842 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
7843 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7844 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
7846 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7848 for(
size_t k=kbegin; k<kend; ++k ) {
7849 const SIMDType a1(
set( A(i ,k) ) );
7850 const SIMDType a2(
set( A(i+1UL,k) ) );
7851 const SIMDType b1( B.load(k,j ) );
7852 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7853 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
// xmm1.. belong to row i, xmm4..xmm6 to row i+1 (as the visible stores show).
7862 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7865 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
7867 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm6 * factor );
// Single-row part of the 3*SIMDSIZE strip (presumably guarded by an i < M check that is
// not visible here - TODO confirm).
7872 const size_t kbegin( ( IsUpper_v<MT4> )
7873 ?( ( IsLower_v<MT5> )
7874 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7875 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7876 :( IsLower_v<MT5> ? j : 0UL ) );
7877 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
7879 SIMDType xmm1, xmm2, xmm3;
7881 for(
size_t k=kbegin; k<kend; ++k ) {
7882 const SIMDType a1(
set( A(i,k) ) );
7883 xmm1 += a1 * B.load(k,j );
7884 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7885 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7888 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// Strip of 2*SIMDSIZE columns. The row range is narrowed by the declared structure of
// the product: it ends at min(j+2*SIMDSIZE, M) when UPP is set and starts at j when LOW
// is set. Rows are handled in groups of four, three and two, then a single-row part.
// NOTE(review): the enclosing column loop, most accumulator updates and several stores
// are not visible in this listing.
7896 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
7897 size_t i( LOW ? j : 0UL );
// Four rows at a time (eight accumulators).
7899 for( ; (i+4UL) <= iend; i+=4UL )
7901 const size_t kbegin( ( IsUpper_v<MT4> )
7902 ?( ( IsLower_v<MT5> )
7903 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7904 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7905 :( IsLower_v<MT5> ? j : 0UL ) );
7906 const size_t kend( ( IsLower_v<MT4> )
7907 ?( ( IsUpper_v<MT5> )
7908 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
7909 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
7910 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7912 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7914 for(
size_t k=kbegin; k<kend; ++k ) {
7915 const SIMDType a1(
set( A(i ,k) ) );
7916 const SIMDType a2(
set( A(i+1UL,k) ) );
7917 const SIMDType a3(
set( A(i+2UL,k) ) );
7918 const SIMDType a4(
set( A(i+3UL,k) ) );
7919 const SIMDType b1( B.load(k,j ) );
7920 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
7931 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7933 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
7935 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
7937 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
// Three rows at a time (six accumulators).
7941 for( ; (i+3UL) <= iend; i+=3UL )
7943 const size_t kbegin( ( IsUpper_v<MT4> )
7944 ?( ( IsLower_v<MT5> )
7945 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7946 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7947 :( IsLower_v<MT5> ? j : 0UL ) );
7948 const size_t kend( ( IsLower_v<MT4> )
7949 ?( ( IsUpper_v<MT5> )
7950 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
7951 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
7952 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7954 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7956 for(
size_t k=kbegin; k<kend; ++k ) {
7957 const SIMDType a1(
set( A(i ,k) ) );
7958 const SIMDType a2(
set( A(i+1UL,k) ) );
7959 const SIMDType a3(
set( A(i+2UL,k) ) );
7960 const SIMDType b1( B.load(k,j ) );
7961 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
7970 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7972 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
7974 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
// Two rows at a time; here the k loop is additionally unrolled by two, with a second
// set of accumulators (xmm5..xmm8) that is summed into the first before storing.
7978 for( ; (i+2UL) <= iend; i+=2UL )
7980 const size_t kbegin( ( IsUpper_v<MT4> )
7981 ?( ( IsLower_v<MT5> )
7982 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7983 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7984 :( IsLower_v<MT5> ? j : 0UL ) );
7985 const size_t kend( ( IsLower_v<MT4> )
7986 ?( ( IsUpper_v<MT5> )
7987 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
7988 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7989 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7991 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7994 for( ; (k+2UL) <= kend; k+=2UL ) {
7995 const SIMDType a1(
set( A(i ,k ) ) );
7996 const SIMDType a2(
set( A(i+1UL,k ) ) );
7997 const SIMDType a3(
set( A(i ,k+1UL) ) );
7998 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
7999 const SIMDType b1( B.load(k ,j ) );
8000 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
8001 const SIMDType b3( B.load(k+1UL,j ) );
8002 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Remaining k iterations after the 2-way unrolled k loop.
8013 for( ; k<kend; ++k ) {
8014 const SIMDType a1(
set( A(i ,k) ) );
8015 const SIMDType a2(
set( A(i+1UL,k) ) );
8016 const SIMDType b1( B.load(k,j ) );
8017 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8024 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8026 C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
8027 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) - (xmm4+xmm8) * factor );
// Single-row part of the 2*SIMDSIZE strip (presumably guarded by an i < iend check that
// is not visible here - TODO confirm); k is again unrolled by two.
8032 const size_t kbegin( ( IsUpper_v<MT4> )
8033 ?( ( IsLower_v<MT5> )
8034 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8035 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8036 :( IsLower_v<MT5> ? j : 0UL ) );
8037 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
8039 SIMDType xmm1, xmm2, xmm3, xmm4;
8042 for( ; (k+2UL) <= kend; k+=2UL ) {
8043 const SIMDType a1(
set( A(i,k ) ) );
8044 const SIMDType a2(
set( A(i,k+1UL) ) );
8045 xmm1 += a1 * B.load(k ,j );
8046 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
8047 xmm3 += a2 * B.load(k+1UL,j );
8048 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
8051 for( ; k<kend; ++k ) {
8052 const SIMDType a1(
set( A(i,k) ) );
8053 xmm1 += a1 * B.load(k,j );
8057 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
// Strip of a single SIMD pack (SIMDSIZE columns). The row range ends at min(j+SIMDSIZE, M)
// only when both LOW and UPP are set, otherwise at M; it starts at j when LOW is set.
// Rows are handled in groups of four, three and two, then a single-row part. In every
// group the k loop is unrolled by two, using a second accumulator set that is added to
// the first before the store.
// NOTE(review): the enclosing column loop and the non-lower branches of the kend
// conditionals are not visible in this listing.
8064 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
8065 size_t i( LOW ? j : 0UL );
// Four rows at a time.
8067 for( ; (i+4UL) <= iend; i+=4UL )
8069 const size_t kbegin( ( IsUpper_v<MT4> )
8070 ?( ( IsLower_v<MT5> )
8071 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8072 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8073 :( IsLower_v<MT5> ? j : 0UL ) );
8074 const size_t kend( ( IsLower_v<MT4> )
8075 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8078 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8081 for( ; (k+2UL) <= kend; k+=2UL ) {
8082 const SIMDType b1( B.load(k ,j) );
8083 const SIMDType b2( B.load(k+1UL,j) );
8084 xmm1 +=
set( A(i ,k ) ) * b1;
8085 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8086 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8087 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8088 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8089 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8090 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8091 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
// Remaining k iterations after the unrolled k loop.
8094 for( ; k<kend; ++k ) {
8095 const SIMDType b1( B.load(k,j) );
8096 xmm1 +=
set( A(i ,k) ) * b1;
8097 xmm2 +=
set( A(i+1UL,k) ) * b1;
8098 xmm3 +=
set( A(i+2UL,k) ) * b1;
8099 xmm4 +=
set( A(i+3UL,k) ) * b1;
8102 C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
8103 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
8104 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
8105 C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
// Three rows at a time.
8108 for( ; (i+3UL) <= iend; i+=3UL )
8110 const size_t kbegin( ( IsUpper_v<MT4> )
8111 ?( ( IsLower_v<MT5> )
8112 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8113 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8114 :( IsLower_v<MT5> ? j : 0UL ) );
8115 const size_t kend( ( IsLower_v<MT4> )
8116 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8119 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8122 for( ; (k+2UL) <= kend; k+=2UL ) {
8123 const SIMDType b1( B.load(k ,j) );
8124 const SIMDType b2( B.load(k+1UL,j) );
8125 xmm1 +=
set( A(i ,k ) ) * b1;
8126 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8127 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8128 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8129 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8130 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
8133 for( ; k<kend; ++k ) {
8134 const SIMDType b1( B.load(k,j) );
8135 xmm1 +=
set( A(i ,k) ) * b1;
8136 xmm2 +=
set( A(i+1UL,k) ) * b1;
8137 xmm3 +=
set( A(i+2UL,k) ) * b1;
8140 C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
8141 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
8142 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
// Two rows at a time.
8145 for( ; (i+2UL) <= iend; i+=2UL )
8147 const size_t kbegin( ( IsUpper_v<MT4> )
8148 ?( ( IsLower_v<MT5> )
8149 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8150 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8151 :( IsLower_v<MT5> ? j : 0UL ) );
8152 const size_t kend( ( IsLower_v<MT4> )
8153 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8156 SIMDType xmm1, xmm2, xmm3, xmm4;
8159 for( ; (k+2UL) <= kend; k+=2UL ) {
8160 const SIMDType b1( B.load(k ,j) );
8161 const SIMDType b2( B.load(k+1UL,j) );
8162 xmm1 +=
set( A(i ,k ) ) * b1;
8163 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8164 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8165 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
8168 for( ; k<kend; ++k ) {
8169 const SIMDType b1( B.load(k,j) );
8170 xmm1 +=
set( A(i ,k) ) * b1;
8171 xmm2 +=
set( A(i+1UL,k) ) * b1;
8174 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8175 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
// Single-row part (presumably guarded by an i < iend check that is not visible here -
// TODO confirm). No kend is computed in the visible lines; the loop runs up to K.
8180 const size_t kbegin( ( IsUpper_v<MT4> )
8181 ?( ( IsLower_v<MT5> )
8182 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8183 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8184 :( IsLower_v<MT5> ? j : 0UL ) );
8186 SIMDType xmm1, xmm2;
8189 for( ; (k+2UL) <= K; k+=2UL ) {
8190 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8191 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
8195 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8198 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
// Scalar remainder pass: when 'remainder' is set, the columns from the current j up to N
// are processed element-wise instead of with SIMD packs. Rows end at j+1 when UPP is set
// and start at j when LOW is set; they are handled in pairs, then a single-row part.
// NOTE(review): the declarations of value1/value2/value and the non-lower branch of the
// kend conditional are not visible in this listing.
8202 for( ; remainder && j<N; ++j )
8204 const size_t iend( UPP ? j+1UL : M );
8205 size_t i( LOW ? j : 0UL );
8207 for( ; (i+2UL) <= iend; i+=2UL )
8209 const size_t kbegin( ( IsUpper_v<MT4> )
8210 ?( ( IsLower_v<MT5> )
8211 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8212 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8213 :( IsLower_v<MT5> ? j : 0UL ) );
8214 const size_t kend( ( IsLower_v<MT4> )
8215 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
// Dot products of rows i and i+1 of A with column j of B.
8221 for(
size_t k=kbegin; k<kend; ++k ) {
8222 value1 += A(i ,k) * B(k,j);
8223 value2 += A(i+1UL,k) * B(k,j);
8226 C(i ,j) -= value1 * scalar;
8227 C(i+1UL,j) -= value2 * scalar;
// Single-row part (presumably guarded by an i < iend check that is not visible here -
// TODO confirm); the k loop runs up to K.
8232 const size_t kbegin( ( IsUpper_v<MT4> )
8233 ?( ( IsLower_v<MT5> )
8234 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8235 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8236 :( IsLower_v<MT5> ? j : 0UL ) );
8240 for(
size_t k=kbegin; k<K; ++k ) {
8241 value += A(i,k) * B(k,j);
8244 C(i,j) -= value * scalar;
// Vectorized small-matrix kernel overload for a column-major target C. It does not
// compute the product itself: one operand is copied (via serial()) into a temporary of
// its opposite storage order and the scaled product is re-dispatched through subAssign().
// Choice of the operand to convert:
//  - A, if A is not resizable but B is;
//  - B, if A is resizable but B is not;
//  - otherwise A when it has no more elements than B (rows*columns), else B.
// NOTE(review): remaining template parameters, braces and the final 'else' line are not
// visible in this listing.
8265 template<
typename MT3
8269 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8270 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8277 const ForwardFunctor fwd;
8279 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
8280 const OppositeType_t<MT4> tmp(
serial( A ) );
8281 subAssign( C, fwd( tmp * B ) * scalar );
8283 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
8284 const OppositeType_t<MT5> tmp(
serial( B ) );
8285 subAssign( C, fwd( A * tmp ) * scalar );
8287 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
8288 const OppositeType_t<MT4> tmp(
serial( A ) );
8289 subAssign( C, fwd( tmp * B ) * scalar );
8292 const OppositeType_t<MT5> tmp(
serial( B ) );
8293 subAssign( C, fwd( A * tmp ) * scalar );
// Large-matrix kernel overload used when UseVectorizedDefaultKernel_v is false for the
// given types: it simply forwards to selectDefaultSubAssignKernel with the same arguments.
8312 template<
typename MT3
8316 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8317 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8319 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix kernel overload used when UseVectorizedDefaultKernel_v holds. It delegates
// to one of the lmmm/ummm/mmm kernels, passing -scalar and ST2(1) so that the call
// subtracts the scaled product from C.
// NOTE(review): the conditions selecting between lmmm, ummm and mmm are not visible in
// this listing (presumably the LOW / UPP flags - TODO confirm).
8338 template<
typename MT3
8342 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8343 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8346 lmmm( C, A, B, -scalar, ST2(1) );
8348 ummm( C, A, B, -scalar, ST2(1) );
8350 mmm( C, A, B, -scalar, ST2(1) );
// BLAS kernel overload used when UseBlasKernel_v is false for the given types: it simply
// forwards to selectLargeSubAssignKernel with the same arguments.
8368 template<
typename MT3
8372 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8373 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8375 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-based kernel for the scaled subtraction assignment; only compiled when
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both set.
//  - Triangular A: B is copied into a temporary, trmm() multiplies it from the left by A
//    with factor ET(scalar), and the temporary is subtracted from C via subAssign().
//  - Triangular B: the same with A copied and trmm() applied from the right by B.
//  - Otherwise: a single gemm() call with ET(-scalar) and ET(1).
// NOTE(review): remaining template parameters, braces and the final 'else' line are not
// visible in this listing.
8380 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8394 template<
typename MT3
8398 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8399 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8401 using ET = ElementType_t<MT3>;
8403 if( IsTriangular_v<MT4> ) {
8404 ResultType_t<MT3> tmp(
serial( B ) );
8405 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8406 subAssign( C, tmp );
8408 else if( IsTriangular_v<MT5> ) {
8409 ResultType_t<MT3> tmp(
serial( A ) );
8410 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8411 subAssign( C, tmp );
8414 gemm( C, A, B,
ET(-scalar),
ET(1) );
// Subtraction-assignment overload enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
// The product is rewritten with transposed operand(s) and re-dispatched via subAssign():
// both operands transposed if both are symmetric, only the left one if just MT1 is
// symmetric, otherwise only the right one. The scalar rhs.scalar_ is re-applied.
// NOTE(review): the signature line is missing from this listing; the function name is
// inferred from the body's subAssign() calls - TODO confirm against the full header.
8434 template<
typename MT >
8436 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8445 const ForwardFunctor fwd;
8447 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8448 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8450 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8451 subAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
8452 else if( IsSymmetric_v<MT1> )
8453 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
8455 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Schur-product assignment of a scaled matrix product to a dense matrix. The visible
// code forwards to schurAssign() with a temporary 'tmp'.
// NOTE(review): the creation of 'tmp' is not visible in this listing (presumably the
// evaluated right-hand side - TODO confirm).
8475 template<
typename MT
8477 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8489 schurAssign( ~lhs, tmp );
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> holds.
// NOTE(review): the signature line is missing from this listing; by its position this is
// presumably the SMP assignment to dense matrices - TODO confirm.
// The operands are taken from the wrapped product rhs.matrix_. An empty target (zero rows
// or columns) and a zero inner dimension (left.columns() == 0) are handled as separate
// early cases; the branch bodies are not visible here.
8520 template<
typename MT
8523 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8530 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8531 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8533 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8536 else if( left.columns() == 0UL ) {
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> holds. It evaluates the whole
// expression into a temporary whose type is OppositeType when SO is true and ResultType
// otherwise.
// NOTE(review): the signature line and the use of 'tmp' and 'fwd' are missing from this
// listing; presumably the SMP assignment to sparse matrices - TODO confirm.
8570 template<
typename MT
8573 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8577 using TmpType = If_t< SO, OppositeType, ResultType >;
8589 const ForwardFunctor fwd;
8591 const TmpType tmp( rhs );
// Overload enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds. It fetches both operands
// of the wrapped product and branches on which of MT1/MT2 is symmetric.
// NOTE(review): the signature line and all branch bodies are missing from this listing;
// presumably the symmetry-exploiting SMP assignment - TODO confirm.
8610 template<
typename MT >
8612 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8621 const ForwardFunctor fwd;
8623 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8624 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8626 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8628 else if( IsSymmetric_v<MT1> )
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> holds. An empty target (zero
// rows or columns) and a zero inner dimension are handled by one combined early check.
// NOTE(review): the signature line and the rest of the body are missing from this
// listing; presumably the SMP addition assignment - TODO confirm.
8650 template<
typename MT
8653 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8660 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8661 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8663 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Overload enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds. It fetches both operands
// of the wrapped product and branches on which of MT1/MT2 is symmetric.
// NOTE(review): the signature line and all branch bodies are missing from this listing;
// presumably the symmetry-exploiting SMP addition assignment - TODO confirm.
8695 template<
typename MT >
8697 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8706 const ForwardFunctor fwd;
8708 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8709 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8711 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8713 else if( IsSymmetric_v<MT1> )
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> holds. An empty target (zero
// rows or columns) and a zero inner dimension are handled by one combined early check.
// NOTE(review): the signature line and the rest of the body are missing from this
// listing; presumably the SMP subtraction assignment - TODO confirm.
8739 template<
typename MT
8742 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8749 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8750 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8752 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Overload enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds. It fetches both operands
// of the wrapped product and branches on which of MT1/MT2 is symmetric.
// NOTE(review): the signature line and all branch bodies are missing from this listing;
// presumably the symmetry-exploiting SMP subtraction assignment - TODO confirm.
8784 template<
typename MT >
8786 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8795 const ForwardFunctor fwd;
8797 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8798 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8800 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8802 else if( IsSymmetric_v<MT1> )
8825 template<
typename MT
8905 template<
typename MT1
8907 inline decltype(
auto)
// Declares the given matrix product as symmetric: returns a new DMatDMatMultExpr over
// the same two operands with the SF template flag set to true and HF, LF, UF unchanged.
// NOTE(review): remaining template parameters and any argument checks between the
// signature and the return are not visible in this listing.
8953 template<
typename MT1
8959 inline decltype(
auto)
declsym( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8967 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8968 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix product as Hermitian: returns a new DMatDMatMultExpr over
// the same two operands with the HF template flag set to true and SF, LF, UF unchanged.
// NOTE(review): remaining template parameters and any argument checks between the
// signature and the return are not visible in this listing.
8997 template<
typename MT1
9003 inline decltype(
auto)
declherm( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9011 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9012 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix product as lower: returns a new DMatDMatMultExpr over the
// same two operands with the LF template flag set to true and SF, HF, UF unchanged.
// NOTE(review): remaining template parameters and any argument checks between the
// signature and the return are not visible in this listing.
9041 template<
typename MT1
9047 inline decltype(
auto)
decllow( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9055 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9056 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix product as upper: returns a new DMatDMatMultExpr over the
// same two operands with the UF template flag set to true and SF, HF, LF unchanged.
// NOTE(review): remaining template parameters and any argument checks between the
// signature and the return are not visible in this listing.
9085 template<
typename MT1
9091 inline decltype(
auto)
declupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9099 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9100 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix product as diagonal: returns a new DMatDMatMultExpr over the
// same two operands with both the LF and UF template flags set to true (SF and HF are
// kept unchanged).
// NOTE(review): remaining template parameters and any argument checks between the
// signature and the return are not visible in this listing.
9129 template<
typename MT1
9135 inline decltype(
auto)
decldiag( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9143 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9144 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for dimension 0 of a DMatDMatMultExpr: inherited from
// Size<MT1,0UL>, i.e. taken from the left-hand side operand type.
// NOTE(review): the (presumably empty) struct body is not visible in this listing.
9160 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9161 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9162 :
public Size<MT1,0UL>
// Size specialization for dimension 1 of a DMatDMatMultExpr: inherited from
// Size<MT2,1UL>, i.e. taken from the right-hand side operand type.
// NOTE(review): the (presumably empty) struct body is not visible in this listing.
9165 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9166 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9167 :
public Size<MT2,1UL>
// IsAligned specialization for DMatDMatMultExpr: the expression counts as aligned only
// if both operand types MT1 and MT2 are aligned.
// NOTE(review): the (presumably empty) struct body is not visible in this listing.
9183 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9184 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9185 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:332
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:422
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:292
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDMatMultExpr.h:317
Header file for basic type definitions.
Header file for the SparseVector base class.
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template. The If_t alias declaration provides a convenien...
Definition: If.h:109
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:498
Header file for the declherm trait.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:173
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:288
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDMatMultExpr.h:310
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:289
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:466
Header file for the IsIntegral type trait.
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
Header file for the DenseVector base class.
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:177
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:163
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications. The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:152
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:442
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
Header file for the IsComplexDouble type trait.
Constraint on the data type.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:168
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:302
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:179
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:486
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:305
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:347
Header file for the IsAligned type trait.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:291
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDMatMultExpr.h:323
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:396
Header file for run time assertion macros.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:290
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:178
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:286
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:299
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:293
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:454
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:432
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:765
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:476
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:296
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
Header file for BLAS general matrix/matrix multiplication functions (gemm).
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:412
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:180
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
typename DisableIf< Condition, T >::Type DisableIf_t
Auxiliary type for the DisableIf class template. The DisableIf_t alias declaration provides a convenie...
Definition: DisableIf.h:138
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
Header file for the IsResizable type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:499
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the DeclSym functor.
Header file for the TrueType type/value trait base class.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.