35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 137 template<
typename MT1
144 :
public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Compile-time switches of the multiplication expression.
// evaluateLeft: true when the left operand type MT1 is itself a computation
// or reports that it requires evaluation.
159 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// evaluateRight: the same test applied to the right operand type MT2.
164 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// The four flags below are derived from the template flags SF, HF, LF and UF
// (presumably symmetric / Hermitian / lower / upper -- confirm against the
// class template documentation, which is not visible here).
// SYM: SF is set and none of HF, LF, UF is set.
168 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set.
169 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is combined with SF or HF.
170 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is combined with SF or HF.
171 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// NOTE(review): the declaration belonging to this first template header is
// not visible in this extract.
180 template<
typename T1,
typename T2,
typename T3 >
// UseBlasKernel_v<T1,T2,T3>: compile-time predicate for the BLAS kernel.
// The visible part of the condition requires
//  - contiguous storage for all three types, mutable data access for T1 and
//    const data access for T2 and T3,
//  - T2 and T3 not to be diagonal,
//  - simdEnabled on all three types,
//  - BLAS-compatible element types for all three.
// The expression starts before and continues after the visible lines.
190 template<
typename T1,
typename T2,
typename T3 >
191 static constexpr
bool UseBlasKernel_v =
194 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
195 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
196 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
197 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
198 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
199 IsBLASCompatible_v< ElementType_t<T1> > &&
200 IsBLASCompatible_v< ElementType_t<T2> > &&
201 IsBLASCompatible_v< ElementType_t<T3> > &&
// UseVectorizedDefaultKernel_v<T1,T2,T3>: compile-time predicate selecting the
// vectorized default kernels. The visible part requires useOptimizedKernels,
// non-diagonal T2 and T3, simdEnabled on all three types and a SIMD-combinable
// element type check; the remainder of the expression is not visible here.
212 template<
typename T1,
typename T2,
typename T3 >
213 static constexpr
bool UseVectorizedDefaultKernel_v =
214 ( useOptimizedKernels &&
215 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
216 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
217 IsSIMDCombinable_v< ElementType_t<T1>
// Right-hand side of a compile-time flag whose name is not visible in this
// extract (presumably the expression's own simdEnabled -- confirm). It holds
// when neither operand is diagonal, both operands are SIMD enabled, and SIMD
// addition and multiplication exist for the element types ET1 and ET2.
288 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
289 MT1::simdEnabled && MT2::simdEnabled &&
290 HasSIMDAdd_v<ET1,ET2> &&
291 HasSIMDMult_v<ET1,ET2> );
// Fragment of an element-access routine for indices (i,j); its signature is
// not visible (presumably operator() -- confirm). Diagonal operands get their
// own branches (bodies not visible). For triangular operands the inner index
// range [begin,end) is narrowed to the part where both operands can be
// non-zero.
328 if( IsDiagonal_v<MT1> ) {
331 else if( IsDiagonal_v<MT2> ) {
334 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
// begin: row i of an upper MT1 starts at column i (i+1 if strictly upper);
// column j of a lower MT2 starts at row j (j+1 if strictly lower); when both
// apply the larger bound is used.
335 const size_t begin( ( IsUpper_v<MT1> )
336 ?( ( IsLower_v<MT2> )
337 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
338 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
339 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
340 :( ( IsLower_v<MT2> )
341 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
// end: mirror image of begin for a lower MT1 / upper MT2; falls back to
// lhs_.columns() when neither restriction applies.
343 const size_t end( ( IsLower_v<MT1> )
344 ?( ( IsUpper_v<MT2> )
345 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
346 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
347 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
348 :( ( IsUpper_v<MT2> )
349 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
350 :(
lhs_.columns() ) ) );
// Bounds checks of an access routine whose signature is not visible
// (presumably a checked at(i,j) -- confirm): the row index is tested against
// lhs_.rows() and the column index against rhs_.columns(). The bodies of both
// branches are not visible in this extract.
374 if( i >=
lhs_.rows() ) {
377 if( j >=
rhs_.columns() ) {
// rows(): size accessor; its body is not visible in this extract.
389 inline size_t rows() const noexcept {
// Return statement of a second accessor whose header is not visible
// (presumably columns()): forwards to rhs_.columns().
400 return rhs_.columns();
// canAlias(): returns true if either operand reports being aliased with the
// given address.
// \param alias The address to check.
430 template<
typename T >
431 inline bool canAlias(
const T* alias )
const noexcept {
432 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// isAliased(): returns true if either operand reports being aliased with the
// given address. The visible return expression is identical to canAlias().
// \param alias The address to check.
442 template<
typename T >
443 inline bool isAliased(
const T* alias )
const noexcept {
444 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Return statement of a function whose header is not visible (presumably
// isAligned()): true only if both operands report being aligned.
454 return lhs_.isAligned() &&
rhs_.isAligned();
// Part of a boolean return expression whose function header is not visible
// (presumably canSMPAssign() -- confirm). The visible terms require that
//  - BLAS matrix/matrix multiplication is disabled or the result has fewer
//    than DMATTDMATMULT_THRESHOLD elements (further alternatives of this
//    disjunction are not visible),
//  - the result has at least SMP_DMATTDMATMULT_THRESHOLD elements, and
//  - neither operand is diagonal.
465 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
467 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
468 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
469 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
// Fragment of the assignment of the product to a target accessed as (~lhs).
// The function signature is not visible in this extract.
492 template<
typename MT
// Special case: the target has no rows or no columns (branch body not visible).
501 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Special case: the inner dimension is zero (branch body not visible).
504 else if( rhs.
lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B (set up on lines not
// visible here) to the kernel selection.
519 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// selectAssignKernel(): chooses the kernel for C = A * B.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
535 template<
typename MT3
538 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is used when either operand is diagonal or the target has
// fewer than DMATTDMATMULT_THRESHOLD elements.
540 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
541 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
542 selectSmallAssignKernel( C, A, B );
// Alternative path (the line joining the two calls is not visible here).
544 selectBlasAssignKernel( C, A, B );
// selectDefaultAssignKernel(): default kernel for C = A * B, enabled for a
// row-major target when neither operand is diagonal (see EnableIf_t below).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces, several loop bodies and some ternary branches are
// missing from this extract.
563 template<
typename MT3
566 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
567 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
569 const size_t M( A.rows() );
570 const size_t N( B.columns() );
571 const size_t K( A.columns() );
// Row range [ibegin,iend) handled by the main loop: a strictly lower A skips
// the first row (two rows if B is strictly lower as well and M > 1), a
// strictly upper A skips the last row(s) analogously.
575 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
576 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
578 const size_t iend( ( IsStrictlyUpper_v<MT4> )
579 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
// Rows before ibegin are visited over all columns (loop body not visible).
583 for(
size_t i=0UL; i<ibegin; ++i ) {
584 for(
size_t j=0UL; j<N; ++j ) {
588 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin,jend) of row i that is actually computed.
590 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
591 ?( ( IsStrictlyUpper_v<MT4> )
592 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
593 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
594 :( ( IsStrictlyUpper_v<MT5> )
597 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
598 ?( ( IsStrictlyLower_v<MT4> )
599 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
600 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
601 :( ( IsStrictlyLower_v<MT5> )
602 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
603 :(
LOW ? i+1UL : N ) ) );
606 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of jbegin; for SYM/HERM the loop starts at column i.
614 for(
size_t j=(
SYM ||
HERM ? i : 0UL ); j<jbegin; ++j ) {
617 for(
size_t j=jbegin; j<jend; ++j )
// Inner index range [kbegin,kend) restricted by the triangular structure of
// A (row i) and B (column j).
619 const size_t kbegin( ( IsUpper_v<MT4> )
620 ?( ( IsLower_v<MT5> )
621 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
622 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
623 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
624 :( ( IsLower_v<MT5> )
625 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
627 const size_t kend( ( IsLower_v<MT4> )
628 ?( ( IsUpper_v<MT5> )
629 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
630 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
631 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
632 :( ( IsUpper_v<MT5> )
633 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initialises C(i,j); the remaining ones are accumulated.
637 C(i,j) = A(i,kbegin) * B(kbegin,j);
638 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
639 C(i,j) += A(i,k) * B(k,j);
// Columns from jend onwards (loop body not visible).
642 for(
size_t j=jend; j<N; ++j ) {
// Rows from iend onwards are visited over all columns (loop body not visible).
646 for(
size_t i=iend; i<M; ++i ) {
647 for(
size_t j=0UL; j<N; ++j ) {
// Fills the elements below the diagonal from their mirrored counterparts,
// conjugated when HERM is set. The condition guarding this pass is not
// visible in this extract.
653 for(
size_t i=1UL; i<M; ++i ) {
654 for(
size_t j=0UL; j<i; ++j ) {
655 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// selectDefaultAssignKernel(): default kernel for C = A * B, enabled for a
// column-major target when neither operand is diagonal (see EnableIf_t
// below). It traverses C column by column.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces, several loop bodies and some ternary branches are
// missing from this extract.
677 template<
typename MT3
680 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
681 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
683 const size_t M( A.rows() );
684 const size_t N( B.columns() );
685 const size_t K( A.columns() );
// Column range [jbegin,jend) handled by the main loop: a strictly upper B
// skips the first column (two if A is strictly upper as well and N > 1), a
// strictly lower B skips the last column(s) analogously.
689 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
690 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
692 const size_t jend( ( IsStrictlyLower_v<MT5> )
693 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
// Columns before jbegin are visited over all rows (loop body not visible).
697 for(
size_t j=0UL; j<jbegin; ++j ) {
698 for(
size_t i=0UL; i<M; ++i ) {
702 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin,iend) of column j that is actually computed.
704 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
705 ?( ( IsStrictlyLower_v<MT4> )
706 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
707 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
708 :( ( IsStrictlyLower_v<MT4> )
711 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
712 ?( ( IsStrictlyUpper_v<MT4> )
713 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
714 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
715 :( ( IsStrictlyUpper_v<MT4> )
716 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
717 :(
UPP ? j+1UL : M ) ) );
720 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of ibegin; for SYM/HERM the loop starts at row j.
728 for(
size_t i=(
SYM ||
HERM ? j : 0UL ); i<ibegin; ++i ) {
731 for(
size_t i=ibegin; i<iend; ++i )
// Inner index range [kbegin,kend) restricted by the triangular structure of
// A (row i) and B (column j).
733 const size_t kbegin( ( IsUpper_v<MT4> )
734 ?( ( IsLower_v<MT5> )
735 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
736 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
737 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
738 :( ( IsLower_v<MT5> )
739 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
741 const size_t kend( ( IsLower_v<MT4> )
742 ?( ( IsUpper_v<MT5> )
743 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
744 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
745 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
746 :( ( IsUpper_v<MT5> )
747 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initialises C(i,j); the remaining ones are accumulated.
751 C(i,j) = A(i,kbegin) * B(kbegin,j);
752 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
753 C(i,j) += A(i,k) * B(k,j);
// Rows from iend onwards (loop body not visible).
756 for(
size_t i=iend; i<M; ++i ) {
// Columns from jend onwards are visited over all rows (loop body not visible).
760 for(
size_t j=jend; j<N; ++j ) {
761 for(
size_t i=0UL; i<M; ++i ) {
// Fills the elements above the diagonal from their mirrored counterparts,
// conjugated when HERM is set. The condition guarding this pass is not
// visible in this extract.
767 for(
size_t j=1UL; j<N; ++j ) {
768 for(
size_t i=0UL; i<j; ++i ) {
769 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// selectDefaultAssignKernel(): default kernel for C = A * B, enabled for a
// row-major target with a non-diagonal A and a diagonal B. Each element is
// the single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side (diagonal) operand.
// NOTE(review): braces and some branches are missing from this extract.
791 template<
typename MT3
794 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
795 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
797 const size_t M( A.rows() );
798 const size_t N( B.columns() );
800 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i, derived from the triangular structure
// of A.
802 const size_t jbegin( ( IsUpper_v<MT4> )
803 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
805 const size_t jend( ( IsLower_v<MT4> )
806 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// For an upper A the columns in front of jbegin are handled separately
// (loop body not visible).
810 if( IsUpper_v<MT4> ) {
811 for(
size_t j=0UL; j<jbegin; ++j ) {
815 for(
size_t j=jbegin; j<jend; ++j ) {
816 C(i,j) = A(i,j) * B(j,j);
// For a lower A the columns from jend onwards are handled separately
// (loop body not visible).
818 if( IsLower_v<MT4> ) {
819 for(
size_t j=jend; j<N; ++j ) {
// selectDefaultAssignKernel(): default kernel for C = A * B, enabled for a
// column-major target with a non-diagonal A and a diagonal B. The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements; each element is the
// single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side (diagonal) operand.
// NOTE(review): braces and some branches are missing from this extract.
842 template<
typename MT3
845 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
846 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
848 constexpr
size_t block( BLOCK_SIZE );
850 const size_t M( A.rows() );
851 const size_t N( B.columns() );
// Tile loops: jj/ii are the tile origins, jend/iend the clamped tile ends.
853 for(
size_t jj=0UL; jj<N; jj+=block ) {
854 const size_t jend(
min( N, jj+block ) );
855 for(
size_t ii=0UL; ii<M; ii+=block ) {
856 const size_t iend(
min( M, ii+block ) );
857 for(
size_t j=jj; j<jend; ++j )
// Row range [ibegin,ipos) of column j inside the tile, derived from the
// triangular structure of A.
859 const size_t ibegin( ( IsLower_v<MT4> )
860 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
862 const size_t ipos( ( IsUpper_v<MT4> )
863 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// For a lower A the rows in front of ibegin are handled separately
// (loop body not visible).
866 if( IsLower_v<MT4> ) {
867 for(
size_t i=ii; i<ibegin; ++i ) {
871 for(
size_t i=ibegin; i<ipos; ++i ) {
872 C(i,j) = A(i,j) * B(j,j);
// For an upper A the rows from ipos onwards are handled separately
// (loop body not visible).
874 if( IsUpper_v<MT4> ) {
875 for(
size_t i=ipos; i<iend; ++i ) {
// selectDefaultAssignKernel(): default kernel for C = A * B, enabled for a
// row-major target with a diagonal A and a non-diagonal B. The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements; each element is the
// single product A(i,i) * B(i,j).
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side operand.
// NOTE(review): braces and some branches are missing from this extract.
900 template<
typename MT3
903 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
904 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
906 constexpr
size_t block( BLOCK_SIZE );
908 const size_t M( A.rows() );
909 const size_t N( B.columns() );
// Tile loops: ii/jj are the tile origins, iend/jend the clamped tile ends.
911 for(
size_t ii=0UL; ii<M; ii+=block ) {
912 const size_t iend(
min( M, ii+block ) );
913 for(
size_t jj=0UL; jj<N; jj+=block ) {
914 const size_t jend(
min( N, jj+block ) );
915 for(
size_t i=ii; i<iend; ++i )
// Column range [jbegin,jpos) of row i inside the tile, derived from the
// triangular structure of B.
917 const size_t jbegin( ( IsUpper_v<MT5> )
918 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
920 const size_t jpos( ( IsLower_v<MT5> )
921 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// For an upper B the columns in front of jbegin are handled separately
// (loop body not visible).
924 if( IsUpper_v<MT5> ) {
925 for(
size_t j=jj; j<jbegin; ++j ) {
929 for(
size_t j=jbegin; j<jpos; ++j ) {
930 C(i,j) = A(i,i) * B(i,j);
// For a lower B the columns from jpos onwards are handled separately
// (loop body not visible).
932 if( IsLower_v<MT5> ) {
933 for(
size_t j=jpos; j<jend; ++j ) {
// selectDefaultAssignKernel(): default kernel for C = A * B, enabled for a
// column-major target with a diagonal A and a non-diagonal B. Each element is
// the single product A(i,i) * B(i,j).
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side operand.
// NOTE(review): braces and some branches are missing from this extract.
958 template<
typename MT3
961 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
962 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
964 const size_t M( A.rows() );
965 const size_t N( B.columns() );
967 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j, derived from the triangular structure
// of B.
969 const size_t ibegin( ( IsLower_v<MT5> )
970 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
972 const size_t iend( ( IsUpper_v<MT5> )
973 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// For a lower B the rows in front of ibegin are handled separately
// (loop body not visible).
977 if( IsLower_v<MT5> ) {
978 for(
size_t i=0UL; i<ibegin; ++i ) {
982 for(
size_t i=ibegin; i<iend; ++i ) {
983 C(i,j) = A(i,i) * B(i,j);
// For an upper B the rows from iend onwards are handled separately
// (loop body not visible).
985 if( IsUpper_v<MT5> ) {
986 for(
size_t i=iend; i<M; ++i ) {
// selectDefaultAssignKernel(): default kernel for C = A * B when both
// operands are diagonal. Only the diagonal of C is written by the visible
// loop; any statement between the signature and the loop is not visible in
// this extract.
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side (diagonal) operand.
1009 template<
typename MT3
1012 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1013 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1017 for(
size_t i=0UL; i<A.rows(); ++i ) {
1018 C(i,i) = A(i,i) * B(i,i);
// selectSmallAssignKernel(): overload chosen when the vectorized default
// kernel is not applicable (DisableIf_t on UseVectorizedDefaultKernel_v).
// It forwards to the default assignment kernel.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1038 template<
typename MT3
1041 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1042 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1044 selectDefaultAssignKernel( C, A, B );
// selectSmallAssignKernel(): vectorized kernel for C = A * B, enabled for a
// row-major target when UseVectorizedDefaultKernel_v holds. C is computed in
// register tiles (2x4, 2x2, 2x1, then 1x4, 1x2, 1x1 elements); each tile
// accumulates SIMD products along k and reduces them with sum().
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces, the SIMD k-loop headers, several loads and the
// multiply-accumulate statements of the larger tiles are missing from this
// extract.
1064 template<
typename MT3
1067 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1068 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// remainder: true when at least one operand is not padded; it enables the
// scalar tail loops over k below.
1070 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1072 const size_t M( A.rows() );
1073 const size_t N( B.columns() );
1074 const size_t K( A.columns() );
// Two rows of C per iteration; the loop is disabled when both LOW and UPP
// are set.
1080 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
1082 const size_t jend(
LOW ? i+2UL : N );
// Elements taken from the mirrored position, conjugated when HERM is set
// (the enclosing condition and loop are not visible).
1087 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
1088 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
1094 reset( C(i+1UL,j) );
// 2x4 tile.
1098 for( ; (j+4UL) <= jend; j+=4UL )
// kbegin is masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of
// SIMDSIZE (assuming SIMDSIZE is a power of two -- confirm).
1100 const size_t kbegin( ( IsUpper_v<MT4> )
1101 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1102 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1103 const size_t kend( ( IsLower_v<MT4> )
1104 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
1105 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos: end of the SIMD part of the k range; with remainder handling it is
// kend rounded down by the same mask, otherwise kend itself.
1107 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1110 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1115 const SIMDType a2( A.load(i+1UL,k) );
1117 const SIMDType b2( B.load(k,j+1UL) );
1118 const SIMDType b3( B.load(k,j+2UL) );
1119 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of the eight accumulators into the 2x4 tile.
1130 C(i ,j ) =
sum( xmm1 );
1131 C(i ,j+1UL) =
sum( xmm2 );
1132 C(i ,j+2UL) =
sum( xmm3 );
1133 C(i ,j+3UL) =
sum( xmm4 );
1134 C(i+1UL,j ) =
sum( xmm5 );
1135 C(i+1UL,j+1UL) =
sum( xmm6 );
1136 C(i+1UL,j+2UL) =
sum( xmm7 );
1137 C(i+1UL,j+3UL) =
sum( xmm8 );
// Scalar tail over the remaining k values.
1139 for( ; remainder && k<kend; ++k ) {
1140 C(i ,j ) += A(i ,k) * B(k,j );
1141 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1142 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1143 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1144 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1145 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1146 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1147 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile.
1151 for( ; (j+2UL) <= jend; j+=2UL )
1153 const size_t kbegin( ( IsUpper_v<MT4> )
1154 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1155 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1156 const size_t kend( ( IsLower_v<MT4> )
1157 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1158 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1160 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1168 const SIMDType a2( A.load(i+1UL,k) );
1170 const SIMDType b2( B.load(k,j+1UL) );
1177 C(i ,j ) =
sum( xmm1 );
1178 C(i ,j+1UL) =
sum( xmm2 );
1179 C(i+1UL,j ) =
sum( xmm3 );
1180 C(i+1UL,j+1UL) =
sum( xmm4 );
1182 for( ; remainder && k<kend; ++k ) {
1183 C(i ,j ) += A(i ,k) * B(k,j );
1184 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1185 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1186 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile (the enclosing condition is not visible).
1192 const size_t kbegin( ( IsUpper_v<MT4> )
1193 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1194 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1195 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1197 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1205 xmm1 += A.load(i ,k) * b1;
1206 xmm2 += A.load(i+1UL,k) * b1;
1209 C(i ,j) =
sum( xmm1 );
1210 C(i+1UL,j) =
sum( xmm2 );
1212 for( ; remainder && k<kend; ++k ) {
1213 C(i ,j) += A(i ,k) * B(k,j);
1214 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1223 reset( C(i+1UL,j) );
// Single remaining row (the enclosing loop or condition is not visible).
1230 const size_t jend(
LOW ? i+1UL : N );
1235 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// 1x4 tile.
1244 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
1246 const size_t kbegin( ( IsUpper_v<MT4> )
1247 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1248 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1249 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
1251 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1259 xmm1 += a1 * B.load(k,j );
1260 xmm2 += a1 * B.load(k,j+1UL);
1261 xmm3 += a1 * B.load(k,j+2UL);
1262 xmm4 += a1 * B.load(k,j+3UL);
1265 C(i,j ) =
sum( xmm1 );
1266 C(i,j+1UL) =
sum( xmm2 );
1267 C(i,j+2UL) =
sum( xmm3 );
1268 C(i,j+3UL) =
sum( xmm4 );
1270 for( ; remainder && k<kend; ++k ) {
1271 C(i,j ) += A(i,k) * B(k,j );
1272 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1273 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
1274 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tile.
1278 for( ; !(
LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1280 const size_t kbegin( ( IsUpper_v<MT4> )
1281 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1282 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1283 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1285 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1293 xmm1 += a1 * B.load(k,j );
1294 xmm2 += a1 * B.load(k,j+1UL);
1297 C(i,j ) =
sum( xmm1 );
1298 C(i,j+1UL) =
sum( xmm2 );
1300 for( ; remainder && k<kend; ++k ) {
1301 C(i,j ) += A(i,k) * B(k,j );
1302 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile: the k range ends at K (the enclosing loop is not visible).
1308 const size_t kbegin( ( IsUpper_v<MT4> )
1309 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1310 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1312 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
1319 xmm1 += A.load(i,k) * B.load(k,j);
1322 C(i,j) =
sum( xmm1 );
1324 for( ; remainder && k<K; ++k ) {
1325 C(i,j) += A(i,k) * B(k,j);
// selectSmallAssignKernel(): vectorized kernel for C = A * B, enabled for a
// column-major target when UseVectorizedDefaultKernel_v holds. C is computed
// in register tiles (4x2, 4x1, 2x2, 2x1, then 1x2, 1x1 elements); each tile
// accumulates SIMD products along k and reduces them with sum().
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces, the SIMD k-loop headers, several loads and the
// multiply-accumulate statements of the larger tiles are missing from this
// extract.
1356 template<
typename MT3
1359 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1360 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// remainder: true when at least one operand is not padded; it enables the
// scalar tail loops over k below.
1362 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1364 const size_t M( A.rows() );
1365 const size_t N( B.columns() );
1366 const size_t K( A.columns() );
// Four rows of C per iteration; the loop is disabled when both LOW and UPP
// are set.
1372 for( ; !(
LOW &&
UPP ) && (i+4UL) <= M; i+=4UL )
1374 const size_t jend(
LOW ? i+4UL : N );
// Elements taken from the mirrored position, conjugated when HERM is set
// (the enclosing condition and loop are not visible).
1379 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
1380 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
1381 C(i+2UL,j) =
HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
1382 C(i+3UL,j) =
HERM ?
conj( C(j,i+3UL) ) : C(j,i+3UL);
1388 reset( C(i+1UL,j) );
1389 reset( C(i+2UL,j) );
1390 reset( C(i+3UL,j) );
// 4x2 tile.
1394 for( ; (j+2UL) <= jend; j+=2UL )
// kbegin is masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of
// SIMDSIZE (assuming SIMDSIZE is a power of two -- confirm).
1396 const size_t kbegin( ( IsUpper_v<MT4> )
1397 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1398 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1399 const size_t kend( ( IsLower_v<MT4> )
1400 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
1401 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos: end of the SIMD part of the k range; with remainder handling it is
// kend rounded down by the same mask, otherwise kend itself.
1403 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1406 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1411 const SIMDType a2( A.load(i+1UL,k) );
1412 const SIMDType a3( A.load(i+2UL,k) );
1413 const SIMDType a4( A.load(i+3UL,k) );
1415 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of the eight accumulators into the 4x2 tile.
1426 C(i ,j ) =
sum( xmm1 );
1427 C(i ,j+1UL) =
sum( xmm2 );
1428 C(i+1UL,j ) =
sum( xmm3 );
1429 C(i+1UL,j+1UL) =
sum( xmm4 );
1430 C(i+2UL,j ) =
sum( xmm5 );
1431 C(i+2UL,j+1UL) =
sum( xmm6 );
1432 C(i+3UL,j ) =
sum( xmm7 );
1433 C(i+3UL,j+1UL) =
sum( xmm8 );
// Scalar tail over the remaining k values.
1435 for( ; remainder && k<kend; ++k ) {
1436 C(i ,j ) += A(i ,k) * B(k,j );
1437 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1438 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1439 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1440 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1441 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1442 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1443 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile (the enclosing condition is not visible).
1449 const size_t kbegin( ( IsUpper_v<MT4> )
1450 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1451 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1452 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
1454 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1462 xmm1 += A.load(i ,k) * b1;
1463 xmm2 += A.load(i+1UL,k) * b1;
1464 xmm3 += A.load(i+2UL,k) * b1;
1465 xmm4 += A.load(i+3UL,k) * b1;
1468 C(i ,j) =
sum( xmm1 );
1469 C(i+1UL,j) =
sum( xmm2 );
1470 C(i+2UL,j) =
sum( xmm3 );
1471 C(i+3UL,j) =
sum( xmm4 );
1473 for( ; remainder && k<kend; ++k ) {
1474 C(i ,j) += A(i ,k) * B(k,j);
1475 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1476 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
1477 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
1486 reset( C(i+1UL,j) );
1487 reset( C(i+2UL,j) );
1488 reset( C(i+3UL,j) );
// Two rows of C per iteration for what the 4-row loop left over.
1493 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
1495 const size_t jend(
LOW ? i+2UL : N );
1500 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
1501 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
1507 reset( C(i+1UL,j) );
// 2x2 tile.
1511 for( ; (j+2UL) <= jend; j+=2UL )
1513 const size_t kbegin( ( IsUpper_v<MT4> )
1514 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1515 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1516 const size_t kend( ( IsLower_v<MT4> )
1517 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1518 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1520 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1528 const SIMDType a2( A.load(i+1UL,k) );
1530 const SIMDType b2( B.load(k,j+1UL) );
1537 C(i ,j ) =
sum( xmm1 );
1538 C(i ,j+1UL) =
sum( xmm2 );
1539 C(i+1UL,j ) =
sum( xmm3 );
1540 C(i+1UL,j+1UL) =
sum( xmm4 );
1542 for( ; remainder && k<kend; ++k ) {
1543 C(i ,j ) += A(i ,k) * B(k,j );
1544 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1545 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1546 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile (the enclosing condition is not visible).
1552 const size_t kbegin( ( IsUpper_v<MT4> )
1553 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1554 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1555 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1557 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1565 xmm1 += A.load(i ,k) * b1;
1566 xmm2 += A.load(i+1UL,k) * b1;
1569 C(i ,j) =
sum( xmm1 );
1570 C(i+1UL,j) =
sum( xmm2 );
1572 for( ; remainder && k<kend; ++k ) {
1573 C(i ,j) += A(i ,k) * B(k,j);
1574 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1583 reset( C(i+1UL,j) );
// Single remaining row (the enclosing loop or condition is not visible).
1590 const size_t jend(
LOW ? i+1UL : N );
1595 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// 1x2 tile.
1604 for( ; !(
LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1606 const size_t kbegin( ( IsUpper_v<MT4> )
1607 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1608 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1609 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1611 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1619 xmm1 += a1 * B.load(k,j );
1620 xmm2 += a1 * B.load(k,j+1UL);
1623 C(i,j ) =
sum( xmm1 );
1624 C(i,j+1UL) =
sum( xmm2 );
1626 for( ; remainder && k<kend; ++k ) {
1627 C(i,j ) += A(i,k) * B(k,j );
1628 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile: the k range ends at K (the enclosing loop is not visible).
1634 const size_t kbegin( ( IsUpper_v<MT4> )
1635 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1636 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1638 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
1645 xmm1 += A.load(i,k) * B.load(k,j);
1648 C(i,j) =
sum( xmm1 );
1650 for( ; remainder && k<K; ++k ) {
1651 C(i,j) += A(i,k) * B(k,j);
// selectLargeAssignKernel(): overload chosen when the vectorized default
// kernel is not applicable (DisableIf_t on UseVectorizedDefaultKernel_v).
// It forwards to the default assignment kernel.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1681 template<
typename MT3
1684 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1685 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1687 selectDefaultAssignKernel( C, A, B );
// selectLargeAssignKernel(): overload enabled when
// UseVectorizedDefaultKernel_v holds. Only the signature is visible in this
// extract; the body is not shown.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1707 template<
typename MT3
1710 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1711 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// selectBlasAssignKernel(): overload chosen when the BLAS kernel is not
// applicable (DisableIf_t on UseBlasKernel_v). It forwards to the large
// assignment kernel.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1741 template<
typename MT3
1744 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1745 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1747 selectLargeAssignKernel( C, A, B );
// selectBlasAssignKernel(): BLAS-based overload, compiled only when
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set and
// enabled when UseBlasKernel_v holds.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): the statements preceding each trmm() call and the final
// branch keyword are not visible in this extract.
1753 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1767 template<
typename MT3
1770 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1771 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1773 using ET = ElementType_t<MT3>;
// Triangular A: triangular multiply from the left, with the lower/upper flag
// chosen from IsLower_v<MT4>.
1775 if( IsTriangular_v<MT4> ) {
1777 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular B: triangular multiply from the right.
1779 else if( IsTriangular_v<MT5> ) {
1781 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// Remaining case: general multiply with the scalar arguments ET(1) and ET(0).
1784 gemm( C, A, B, ET(1), ET(0) );
// assign(): assignment of the product to a sparse matrix target. The
// expression is first evaluated serially into a temporary, which is then
// assigned to the target through the forwarding functor.
// \param lhs The target sparse matrix.
// \param rhs The multiplication expression to be assigned.
1804 template<
typename MT
1806 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Temporary type: OppositeType when SO is true, ResultType otherwise.
1810 using TmpType = If_t< SO, OppositeType, ResultType >;
1822 const ForwardFunctor fwd;
1824 const TmpType tmp(
serial( rhs ) );
1825 assign( ~lhs, fwd( tmp ) );
// addAssign(): addition assignment of the product to a dense matrix target.
// \param lhs The target dense matrix.
// \param rhs The multiplication expression to be added.
1843 template<
typename MT
1845 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Special case: the target has no rows or columns, or the inner dimension is
// zero (branch body not visible).
1852 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// General case: hand the target and the operands A and B (set up on lines not
// visible here) to the kernel selection.
1866 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// selectAddAssignKernel(): chooses the kernel for C += A * B.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1882 template<
typename MT3
1885 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is used when either operand is diagonal or the target has
// fewer than DMATTDMATMULT_THRESHOLD elements.
1887 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
1888 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1889 selectSmallAddAssignKernel( C, A, B );
// Alternative path (the line joining the two calls is not visible here).
1891 selectBlasAddAssignKernel( C, A, B );
// selectDefaultAddAssignKernel(): default kernel for C += A * B, enabled for
// a row-major target when neither operand is diagonal (see EnableIf_t below).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces and some ternary branches are missing from this
// extract.
1910 template<
typename MT3
1913 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1914 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
1916 const size_t M( A.rows() );
1917 const size_t N( B.columns() );
1918 const size_t K( A.columns() );
// Row range [ibegin,iend): a strictly lower A skips the first row (two rows
// if B is strictly lower as well and M > 1), a strictly upper A skips the
// last row(s) analogously.
1922 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
1923 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
1925 const size_t iend( ( IsStrictlyUpper_v<MT4> )
1926 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
1930 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin,jend) of row i, narrowed by the triangular structure
// of the operands and by the UPP/LOW flags.
1932 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
1933 ?( ( IsStrictlyUpper_v<MT4> )
1934 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
1935 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
1936 :( ( IsStrictlyUpper_v<MT5> )
1937 ?(
UPP ?
max( i, 1UL ) : 1UL )
1938 :(
UPP ? i : 0UL ) ) );
1939 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
1940 ?( ( IsStrictlyLower_v<MT4> )
1941 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
1942 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
1943 :( ( IsStrictlyLower_v<MT5> )
1944 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
1945 :(
LOW ? i+1UL : N ) ) );
// With LOW or UPP set the column range can be inverted; such rows are skipped.
1947 if( (
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
1950 for(
size_t j=jbegin; j<jend; ++j )
// Inner index range [kbegin,kend) restricted by the triangular structure of
// A (row i) and B (column j).
1952 const size_t kbegin( ( IsUpper_v<MT4> )
1953 ?( ( IsLower_v<MT5> )
1954 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1955 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1956 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1957 :( ( IsLower_v<MT5> )
1958 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
1960 const size_t kend( ( IsLower_v<MT4> )
1961 ?( ( IsUpper_v<MT5> )
1962 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
1963 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1964 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1965 :( ( IsUpper_v<MT5> )
1966 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The k loop is unrolled by two: kpos is kbegin plus knum with its lowest
// bit cleared.
1970 const size_t knum( kend - kbegin );
1971 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1973 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1974 C(i,j) += A(i,k ) * B(k ,j);
1975 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Product at index kpos (the condition guarding it is not visible in this
// extract; presumably it applies only when knum is odd).
1978 C(i,j) += A(i,kpos) * B(kpos,j);
// selectDefaultAddAssignKernel(): default kernel for C += A * B, enabled for
// a column-major target when neither operand is diagonal (see EnableIf_t
// below). It traverses C column by column.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
// NOTE(review): braces and some ternary branches are missing from this
// extract.
2000 template<
typename MT3
2003 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2004 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2006 const size_t M( A.rows() );
2007 const size_t N( B.columns() );
2008 const size_t K( A.columns() );
// Column range [jbegin,jend): a strictly upper B skips the first column (two
// if A is strictly upper as well and N > 1), a strictly lower B skips the
// last column(s) analogously.
2012 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
2013 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
2015 const size_t jend( ( IsStrictlyLower_v<MT5> )
2016 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
2020 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin,iend) of column j, narrowed by the triangular structure
// of the operands and by the LOW/UPP flags.
2022 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
2023 ?( ( IsStrictlyLower_v<MT4> )
2024 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
2025 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2026 :( ( IsStrictlyLower_v<MT4> )
2027 ?(
LOW ?
max( j, 1UL ) : 1UL )
2028 :(
LOW ? j : 0UL ) ) );
2029 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
2030 ?( ( IsStrictlyUpper_v<MT4> )
2031 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
2032 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
2033 :( ( IsStrictlyUpper_v<MT4> )
2034 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
2035 :(
UPP ? j+1UL : M ) ) );
// With LOW or UPP set the row range can be inverted; such columns are skipped.
2037 if( (
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
2040 for(
size_t i=ibegin; i<iend; ++i )
// Inner index range [kbegin,kend) restricted by the triangular structure of
// A (row i) and B (column j).
2042 const size_t kbegin( ( IsUpper_v<MT4> )
2043 ?( ( IsLower_v<MT5> )
2044 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2045 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2046 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2047 :( ( IsLower_v<MT5> )
2048 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2050 const size_t kend( ( IsLower_v<MT4> )
2051 ?( ( IsUpper_v<MT5> )
2052 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
2053 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2054 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2055 :( ( IsUpper_v<MT5> )
2056 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The k loop is unrolled by two: kpos is kbegin plus knum with its lowest
// bit cleared.
2060 const size_t knum( kend - kbegin );
2061 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
2063 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2064 C(i,j) += A(i,k ) * B(k ,j);
2065 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Product at index kpos (the condition guarding it is not visible in this
// extract; presumably it applies only when knum is odd).
2068 C(i,j) += A(i,kpos) * B(kpos,j);
// selectDefaultAddAssignKernel(): default kernel for C += A * B, enabled for
// a row-major target with a non-diagonal A and a diagonal B. Each update adds
// the single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side (diagonal) operand.
// NOTE(review): braces and some ternary branches are missing from this
// extract.
2090 template<
typename MT3
2093 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2094 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2096 const size_t M( A.rows() );
2097 const size_t N( B.columns() );
2099 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i, derived from the triangular structure
// of A.
2101 const size_t jbegin( ( IsUpper_v<MT4> )
2102 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2104 const size_t jend( ( IsLower_v<MT4> )
2105 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// The j loop is unrolled by two: jpos is jbegin plus jnum with its lowest
// bit cleared.
2109 const size_t jnum( jend - jbegin );
2110 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2112 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2113 C(i,j ) += A(i,j ) * B(j ,j );
2114 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Update at column jpos (the condition guarding it is not visible in this
// extract; presumably it applies only when jnum is odd).
2117 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// selectDefaultAddAssignKernel(): default kernel for C += A * B, enabled for
// a column-major target with a non-diagonal A and a diagonal B. The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements; each update adds
// the single product A(i,j) * B(j,j).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side (diagonal) operand.
// NOTE(review): braces and some ternary branches are missing from this
// extract.
2138 template<
typename MT3
2141 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2142 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2144 constexpr
size_t block( BLOCK_SIZE );
2146 const size_t M( A.rows() );
2147 const size_t N( B.columns() );
// Tile loops: jj/ii are the tile origins, jend/iend the clamped tile ends.
2149 for(
size_t jj=0UL; jj<N; jj+=block ) {
2150 const size_t jend(
min( N, jj+block ) );
2151 for(
size_t ii=0UL; ii<M; ii+=block ) {
2152 const size_t iend(
min( M, ii+block ) );
2153 for(
size_t j=jj; j<jend; ++j )
// Row range [ibegin,ipos) of column j inside the tile, derived from the
// triangular structure of A.
2155 const size_t ibegin( ( IsLower_v<MT4> )
2156 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
2158 const size_t ipos( ( IsUpper_v<MT4> )
2159 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
2162 for(
size_t i=ibegin; i<ipos; ++i ) {
2163 C(i,j) += A(i,j) * B(j,j);
// selectDefaultAddAssignKernel(): default kernel for C += A * B, enabled for
// a row-major target with a diagonal A and a non-diagonal B. The target is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements; each update adds
// the single product A(i,i) * B(i,j).
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side operand.
// NOTE(review): braces and some ternary branches are missing from this
// extract.
2186 template<
typename MT3
2189 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2190 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2192 constexpr
size_t block( BLOCK_SIZE );
2194 const size_t M( A.rows() );
2195 const size_t N( B.columns() );
// Tile loops: ii/jj are the tile origins, iend/jend the clamped tile ends.
2197 for(
size_t ii=0UL; ii<M; ii+=block ) {
2198 const size_t iend(
min( M, ii+block ) );
2199 for(
size_t jj=0UL; jj<N; jj+=block ) {
2200 const size_t jend(
min( N, jj+block ) );
2201 for(
size_t i=ii; i<iend; ++i )
// Column range [jbegin,jpos) of row i inside the tile, derived from the
// triangular structure of B.
2203 const size_t jbegin( ( IsUpper_v<MT5> )
2204 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
2206 const size_t jpos( ( IsLower_v<MT5> )
2207 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
2210 for(
size_t j=jbegin; j<jpos; ++j ) {
2211 C(i,j) += A(i,i) * B(i,j);
// selectDefaultAddAssignKernel(): default kernel for C += A * B, enabled for
// a column-major target with a diagonal A and a non-diagonal B. Each update
// adds the single product A(i,i) * B(i,j).
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side operand.
// NOTE(review): braces and some ternary branches are missing from this
// extract.
2234 template<
typename MT3
2237 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2238 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2240 const size_t M( A.rows() );
2241 const size_t N( B.columns() );
2243 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j, derived from the triangular structure
// of B.
2245 const size_t ibegin( ( IsLower_v<MT5> )
2246 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2248 const size_t iend( ( IsUpper_v<MT5> )
2249 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The i loop is unrolled by two: ipos is ibegin plus inum with its lowest
// bit cleared.
2253 const size_t inum( iend - ibegin );
2254 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2256 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2257 C(i ,j) += A(i ,i ) * B(i ,j);
2258 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Update at row ipos (the condition guarding it is not visible in this
// extract; presumably it applies only when inum is odd).
2261 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// selectDefaultAddAssignKernel(): default kernel for C += A * B when both
// operands are diagonal. Only the diagonal of C is updated.
// \param C The target matrix.
// \param A The left-hand side (diagonal) operand.
// \param B The right-hand side (diagonal) operand.
2282 template<
typename MT3
2285 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2286 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2288 for(
size_t i=0UL; i<A.rows(); ++i ) {
2289 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition-assignment kernel, fallback overload: enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It simply forwards to the
// default kernel.
2309 template<
typename MT3
2312 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2313 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2315 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel. The EnableIf_t condition below
// selects this overload when C is row-major and UseVectorizedDefaultKernel_v holds.
// Visible structure: C is processed in 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 element blocks;
// for each block the products along k are accumulated in SIMD registers, reduced with
// sum() and added to C, followed by a scalar loop over the remaining k values.
// NOTE(review): this excerpt elides many lines (remaining template parameters, braces,
// declarations of i and k, the SIMD loop headers and part of the accumulation
// statements); comments below describe only what is visible.
2335 template<
typename MT3
2338 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2339 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass over k is needed unless both operands are padded.
2341 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2343 const size_t M( A.rows() );
2344 const size_t N( B.columns() );
2345 const size_t K( A.columns() );
// ---- Two rows of C per iteration ----
2351 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the row pair: ends at i+2 when LOW is set, starts at i when UPP is set.
2353 const size_t jend(
LOW ? i+2UL : N );
2354 size_t j(
UPP ? i : 0UL );
// 2x4 blocks; skipped entirely when both LOW and UPP are set.
2356 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// First k index for upper A / lower B; "& size_t(-SIMDSIZE)" rounds it down to a
// multiple of SIMDSIZE (assuming SIMDSIZE is a power of two - TODO confirm).
2358 const size_t kbegin( ( IsUpper_v<MT4> )
2359 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2360 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// One-past-last k index for lower A / upper B, otherwise the full inner dimension K.
2361 const size_t kend( ( IsLower_v<MT4> )
2362 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
2363 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// End of the SIMD part: kend rounded down the same way when a remainder pass exists.
2365 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight accumulators, one per element of the 2x4 block.
2368 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2373 const SIMDType a2( A.load(i+1UL,k) );
2375 const SIMDType b2( B.load(k,j+1UL) );
2376 const SIMDType b3( B.load(k,j+2UL) );
2377 const SIMDType b4( B.load(k,j+3UL) );
// Reduce each accumulator and add the result to the matching element of C.
2388 C(i ,j ) +=
sum( xmm1 );
2389 C(i ,j+1UL) +=
sum( xmm2 );
2390 C(i ,j+2UL) +=
sum( xmm3 );
2391 C(i ,j+3UL) +=
sum( xmm4 );
2392 C(i+1UL,j ) +=
sum( xmm5 );
2393 C(i+1UL,j+1UL) +=
sum( xmm6 );
2394 C(i+1UL,j+2UL) +=
sum( xmm7 );
2395 C(i+1UL,j+3UL) +=
sum( xmm8 );
// Scalar pass over the k values up to kend not handled by the SIMD part.
2397 for( ; remainder && k<kend; ++k ) {
2398 C(i ,j ) += A(i ,k) * B(k,j );
2399 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2400 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2401 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2402 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2403 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2404 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2405 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 blocks for the columns left over by the 2x4 loop.
2409 for( ; (j+2UL) <= jend; j+=2UL )
2411 const size_t kbegin( ( IsUpper_v<MT4> )
2412 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2413 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2414 const size_t kend( ( IsLower_v<MT4> )
2415 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2416 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2418 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2426 const SIMDType a2( A.load(i+1UL,k) );
2428 const SIMDType b2( B.load(k,j+1UL) );
2435 C(i ,j ) +=
sum( xmm1 );
2436 C(i ,j+1UL) +=
sum( xmm2 );
2437 C(i+1UL,j ) +=
sum( xmm3 );
2438 C(i+1UL,j+1UL) +=
sum( xmm4 );
2440 for( ; remainder && k<kend; ++k ) {
2441 C(i ,j ) += A(i ,k) * B(k,j );
2442 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2443 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2444 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 block: a single column for the current row pair.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
2450 const size_t kbegin( ( IsUpper_v<MT4> )
2451 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2452 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2453 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2455 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2463 xmm1 += A.load(i ,k) * b1;
2464 xmm2 += A.load(i+1UL,k) * b1;
2467 C(i ,j) +=
sum( xmm1 );
2468 C(i+1UL,j) +=
sum( xmm2 );
2470 for( ; remainder && k<kend; ++k ) {
2471 C(i ,j) += A(i ,k) * B(k,j);
2472 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single row of C (1x4, 1x2, 1x1 blocks) ----
// NOTE(review): the statement that opens the single-row case is not visible in this excerpt.
2479 const size_t jend(
LOW ? i+1UL : N );
2480 size_t j(
UPP ? i : 0UL );
// 1x4 blocks; skipped entirely when both LOW and UPP are set.
2482 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
2484 const size_t kbegin( ( IsUpper_v<MT4> )
2485 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2486 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2487 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
2489 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2497 xmm1 += a1 * B.load(k,j );
2498 xmm2 += a1 * B.load(k,j+1UL);
2499 xmm3 += a1 * B.load(k,j+2UL);
2500 xmm4 += a1 * B.load(k,j+3UL);
2503 C(i,j ) +=
sum( xmm1 );
2504 C(i,j+1UL) +=
sum( xmm2 );
2505 C(i,j+2UL) +=
sum( xmm3 );
2506 C(i,j+3UL) +=
sum( xmm4 );
2508 for( ; remainder && k<kend; ++k ) {
2509 C(i,j ) += A(i,k) * B(k,j );
2510 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2511 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
2512 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 blocks.
2516 for( ; (j+2UL) <= jend; j+=2UL )
2518 const size_t kbegin( ( IsUpper_v<MT4> )
2519 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2520 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2521 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2523 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2531 xmm1 += a1 * B.load(k,j );
2532 xmm2 += a1 * B.load(k,j+1UL);
2535 C(i,j ) +=
sum( xmm1 );
2536 C(i,j+1UL) +=
sum( xmm2 );
2538 for( ; remainder && k<kend; ++k ) {
2539 C(i,j ) += A(i,k) * B(k,j );
2540 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 block: the k range runs up to the full inner dimension K.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
2546 const size_t kbegin( ( IsUpper_v<MT4> )
2547 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2548 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2550 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
2557 xmm1 += A.load(i,k) * B.load(k,j);
2560 C(i,j) +=
sum( xmm1 );
2562 for( ; remainder && k<K; ++k ) {
2563 C(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel. The EnableIf_t condition below
// selects this overload when C is column-major and UseVectorizedDefaultKernel_v holds.
// Visible structure: C is processed in 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 element blocks;
// for each block the products along k are accumulated in SIMD registers, reduced with
// sum() and added to C, followed by a scalar loop over the remaining k values.
// NOTE(review): this excerpt elides many lines (remaining template parameters, braces,
// declarations of i, j and k, the SIMD loop headers and part of the accumulation
// statements); comments below describe only what is visible.
2586 template<
typename MT3
2589 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2590 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass over k is needed unless both operands are padded.
2592 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2594 const size_t M( A.rows() );
2595 const size_t N( B.columns() );
2596 const size_t K( A.columns() );
// ---- Four rows of C per iteration; only used when neither LOW nor UPP is set ----
2602 for( ; !
LOW && !
UPP && (i+4UL) <= M; i+=4UL )
// 4x2 blocks over the full column range.
2606 for( ; (j+2UL) <= N; j+=2UL )
// First k index for upper A / lower B; "& size_t(-SIMDSIZE)" rounds it down to a
// multiple of SIMDSIZE (assuming SIMDSIZE is a power of two - TODO confirm).
2608 const size_t kbegin( ( IsUpper_v<MT4> )
2609 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2610 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// One-past-last k index for lower A / upper B, otherwise the full inner dimension K.
2611 const size_t kend( ( IsLower_v<MT4> )
2612 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
2613 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// End of the SIMD part: kend rounded down the same way when a remainder pass exists.
2615 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight accumulators, one per element of the 4x2 block.
2618 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2623 const SIMDType a2( A.load(i+1UL,k) );
2624 const SIMDType a3( A.load(i+2UL,k) );
2625 const SIMDType a4( A.load(i+3UL,k) );
2627 const SIMDType b2( B.load(k,j+1UL) );
// Reduce each accumulator and add the result to the matching element of C.
2638 C(i ,j ) +=
sum( xmm1 );
2639 C(i ,j+1UL) +=
sum( xmm2 );
2640 C(i+1UL,j ) +=
sum( xmm3 );
2641 C(i+1UL,j+1UL) +=
sum( xmm4 );
2642 C(i+2UL,j ) +=
sum( xmm5 );
2643 C(i+2UL,j+1UL) +=
sum( xmm6 );
2644 C(i+3UL,j ) +=
sum( xmm7 );
2645 C(i+3UL,j+1UL) +=
sum( xmm8 );
// Scalar pass over the k values up to kend not handled by the SIMD part.
2647 for( ; remainder && k<kend; ++k ) {
2648 C(i ,j ) += A(i ,k) * B(k,j );
2649 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2650 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2651 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2652 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2653 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2654 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2655 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 block: a single column for the current four rows.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
2661 const size_t kbegin( ( IsUpper_v<MT4> )
2662 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2663 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2664 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
2666 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2674 xmm1 += A.load(i ,k) * b1;
2675 xmm2 += A.load(i+1UL,k) * b1;
2676 xmm3 += A.load(i+2UL,k) * b1;
2677 xmm4 += A.load(i+3UL,k) * b1;
2680 C(i ,j) +=
sum( xmm1 );
2681 C(i+1UL,j) +=
sum( xmm2 );
2682 C(i+2UL,j) +=
sum( xmm3 );
2683 C(i+3UL,j) +=
sum( xmm4 );
2685 for( ; remainder && k<kend; ++k ) {
2686 C(i ,j) += A(i ,k) * B(k,j);
2687 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2688 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2689 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
// ---- Two rows of C per iteration ----
2694 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the row pair: ends at i+2 when LOW is set, starts at i when UPP is set.
2696 const size_t jend(
LOW ? i+2UL : N );
2697 size_t j(
UPP ? i : 0UL );
// 2x2 blocks.
2699 for( ; (j+2UL) <= jend; j+=2UL )
2701 const size_t kbegin( ( IsUpper_v<MT4> )
2702 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2703 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2704 const size_t kend( ( IsLower_v<MT4> )
2705 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2706 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2708 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2716 const SIMDType a2( A.load(i+1UL,k) );
2718 const SIMDType b2( B.load(k,j+1UL) );
2725 C(i ,j ) +=
sum( xmm1 );
2726 C(i ,j+1UL) +=
sum( xmm2 );
2727 C(i+1UL,j ) +=
sum( xmm3 );
2728 C(i+1UL,j+1UL) +=
sum( xmm4 );
2730 for( ; remainder && k<kend; ++k ) {
2731 C(i ,j ) += A(i ,k) * B(k,j );
2732 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2733 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2734 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 block: a single column for the current row pair.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
2740 const size_t kbegin( ( IsUpper_v<MT4> )
2741 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2742 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2743 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2745 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2753 xmm1 += A.load(i ,k) * b1;
2754 xmm2 += A.load(i+1UL,k) * b1;
2757 C(i ,j) +=
sum( xmm1 );
2758 C(i+1UL,j) +=
sum( xmm2 );
2760 for( ; remainder && k<kend; ++k ) {
2761 C(i ,j) += A(i ,k) * B(k,j);
2762 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single row of C (1x2, 1x1 blocks) ----
// NOTE(review): the statement that opens the single-row case is not visible in this excerpt.
2769 const size_t jend(
LOW ? i+1UL : N );
2770 size_t j(
UPP ? i : 0UL );
// 1x2 blocks.
2772 for( ; (j+2UL) <= jend; j+=2UL )
2774 const size_t kbegin( ( IsUpper_v<MT4> )
2775 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2776 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2777 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2779 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2787 xmm1 += a1 * B.load(k,j );
2788 xmm2 += a1 * B.load(k,j+1UL);
2791 C(i,j ) +=
sum( xmm1 );
2792 C(i,j+1UL) +=
sum( xmm2 );
2794 for( ; remainder && k<kend; ++k ) {
2795 C(i,j ) += A(i,k) * B(k,j );
2796 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 block: the k range runs up to the full inner dimension K.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
2802 const size_t kbegin( ( IsUpper_v<MT4> )
2803 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2804 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2806 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
2813 xmm1 += A.load(i,k) * B.load(k,j);
2816 C(i,j) +=
sum( xmm1 );
2818 for( ; remainder && k<K; ++k ) {
2819 C(i,j) += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel, fallback overload: enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It simply forwards to the
// default kernel.
2841 template<
typename MT3
2844 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2845 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2847 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel, vectorized overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the body of this overload is not visible in this excerpt.
2867 template<
typename MT3
2870 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2871 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS addition-assignment kernel, fallback overload: enabled (via DisableIf_t) when
// UseBlasKernel_v<MT3,MT4,MT5> is false. It forwards to the large-matrix kernel.
2897 template<
typename MT3
2900 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2901 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2903 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and selected when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): braces, the final else and the closing #endif are not visible in this excerpt.
2909 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2923 template<
typename MT3
2926 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2927 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Element type of the target; used for the scalar factors passed to trmm/gemm.
2929 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, apply A from the left via trmm (scalar ET(1)),
// then add the temporary to C.
2931 if( IsTriangular_v<MT4> ) {
2932 ResultType_t<MT3> tmp(
serial( B ) );
2933 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2934 addAssign( C, tmp );
// Triangular B: copy A into a temporary, apply B from the right via trmm (scalar ET(1)),
// then add the temporary to C.
2936 else if( IsTriangular_v<MT5> ) {
2937 ResultType_t<MT3> tmp(
serial( A ) );
2938 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2939 addAssign( C, tmp );
// General case: gemm with both scalar factors ET(1).
// NOTE(review): presumably computes C = 1*A*B + 1*C - confirm against the gemm wrapper.
2942 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of the matrix product expression rhs to the dense matrix lhs.
// NOTE(review): most of the body (including the definitions of A and B and the action
// taken in the empty-matrix case) is not visible in this excerpt.
2966 template<
typename MT
2968 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Guard for empty targets or an empty inner dimension (lhs operand has zero columns).
2975 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch to the kernel selection with the target and the two operands A and B.
2989 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel dispatch for the subtraction assignment C -= A * B.
// NOTE(review): remaining template parameters, braces and the `else` between the two
// calls are not visible in this excerpt.
3005 template<
typename MT3
3008 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// Small kernel if either operand is diagonal or the element count of C is below
// DMATTDMATMULT_THRESHOLD ...
3010 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
3011 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
3012 selectSmallSubAssignKernel( C, A, B );
// ... BLAS kernel selection in the other case (presumably the else branch).
3014 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel. The EnableIf_t condition below selects
// this overload when C is row-major and neither A nor B is diagonal.
// Visible update: C(i,j) -= A(i,k) * B(k,j), with the i/j/k ranges narrowed according to
// the triangular structure of A and B and the LOW/UPP flags; k is unrolled by two.
// NOTE(review): this excerpt elides several lines (remaining template parameters,
// braces, else-branches of some bound computations).
3033 template<
typename MT3
3036 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3037 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3039 const size_t M( A.rows() );
3040 const size_t N( B.columns() );
3041 const size_t K( A.columns() );
// Row range: for strictly lower A the first row (two rows if B is strictly lower as well
// and M > 1) is skipped; the non-strict branch is not visible here.
3045 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
3046 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
// For strictly upper A the last row (two rows if B is strictly upper as well and M > 1)
// is skipped; the non-strict branch is not visible here.
3048 const size_t iend( ( IsStrictlyUpper_v<MT4> )
3049 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
3053 for(
size_t i=ibegin; i<iend; ++i )
// First column of row i: offset from the diagonal when both operands are upper,
// otherwise governed by UPP and by whether B is strictly upper.
3055 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3056 ?( ( IsStrictlyUpper_v<MT4> )
3057 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
3058 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
3059 :( ( IsStrictlyUpper_v<MT5> )
3060 ?(
UPP ?
max( i, 1UL ) : 1UL )
3061 :(
UPP ? i : 0UL ) ) );
// One-past-last column of row i: offset from the diagonal when both operands are lower,
// otherwise governed by LOW and by whether B is strictly lower.
3062 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
3063 ?( ( IsStrictlyLower_v<MT4> )
3064 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
3065 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
3066 :( ( IsStrictlyLower_v<MT5> )
3067 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
3068 :(
LOW ? i+1UL : N ) ) );
// With LOW or UPP set the column range of a row may be inverted; skip such rows.
3070 if( (
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
3073 for(
size_t j=jbegin; j<jend; ++j )
// First k index contributing to C(i,j), derived from upper A and/or lower B.
3075 const size_t kbegin( ( IsUpper_v<MT4> )
3076 ?( ( IsLower_v<MT5> )
3077 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3078 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3079 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3080 :( ( IsLower_v<MT5> )
3081 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// One-past-last k index contributing to C(i,j), derived from lower A and/or upper B.
3083 const size_t kend( ( IsLower_v<MT4> )
3084 ?( ( IsUpper_v<MT5> )
3085 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3086 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3087 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3088 :( ( IsUpper_v<MT5> )
3089 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// knum & size_t(-2) clears the lowest bit, so [kbegin,kpos) holds an even number of terms.
3093 const size_t knum( kend - kbegin );
3094 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two terms of the inner product per iteration.
3096 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3097 C(i,j) -= A(i,k ) * B(k ,j);
3098 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Term at kpos, i.e. the leftover term of an odd-sized range.
// NOTE(review): the guard that opens this statement is not visible in this excerpt.
3101 C(i,j) -= A(i,kpos) * B(kpos,j);
// Default (scalar) subtraction-assignment kernel. The EnableIf_t condition below selects
// this overload when C is column-major and neither A nor B is diagonal.
// Visible update: C(i,j) -= A(i,k) * B(k,j), iterating columns in the outer loop, with
// the j/i/k ranges narrowed according to the triangular structure of A and B and the
// LOW/UPP flags; k is unrolled by two.
// NOTE(review): this excerpt elides several lines (remaining template parameters,
// braces, else-branches of some bound computations).
3123 template<
typename MT3
3126 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3127 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3129 const size_t M( A.rows() );
3130 const size_t N( B.columns() );
3131 const size_t K( A.columns() );
// Column range: for strictly upper B the first column (two columns if A is strictly
// upper as well and N > 1) is skipped; the non-strict branch is not visible here.
3135 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
3136 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
// For strictly lower B the last column (two columns if A is strictly lower as well and
// N > 1) is skipped; the non-strict branch is not visible here.
3138 const size_t jend( ( IsStrictlyLower_v<MT5> )
3139 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
3143 for(
size_t j=jbegin; j<jend; ++j )
// First row of column j: offset from the diagonal when both operands are lower,
// otherwise governed by LOW and by whether A is strictly lower.
3145 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
3146 ?( ( IsStrictlyLower_v<MT4> )
3147 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
3148 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3149 :( ( IsStrictlyLower_v<MT4> )
3150 ?(
LOW ?
max( j, 1UL ) : 1UL )
3151 :(
LOW ? j : 0UL ) ) );
// One-past-last row of column j: offset from the diagonal when both operands are upper,
// otherwise governed by UPP and by whether A is strictly upper.
3152 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3153 ?( ( IsStrictlyUpper_v<MT4> )
3154 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
3155 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
3156 :( ( IsStrictlyUpper_v<MT4> )
3157 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
3158 :(
UPP ? j+1UL : M ) ) );
// With LOW or UPP set the row range of a column may be inverted; skip such columns.
3160 if( (
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
3163 for(
size_t i=ibegin; i<iend; ++i )
// First k index contributing to C(i,j), derived from upper A and/or lower B.
3165 const size_t kbegin( ( IsUpper_v<MT4> )
3166 ?( ( IsLower_v<MT5> )
3167 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3168 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3169 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3170 :( ( IsLower_v<MT5> )
3171 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// One-past-last k index contributing to C(i,j), derived from lower A and/or upper B.
3173 const size_t kend( ( IsLower_v<MT4> )
3174 ?( ( IsUpper_v<MT5> )
3175 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3176 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3177 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3178 :( ( IsUpper_v<MT5> )
3179 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// knum & size_t(-2) clears the lowest bit, so [kbegin,kpos) holds an even number of terms.
3183 const size_t knum( kend - kbegin );
3184 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two terms of the inner product per iteration.
3186 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3187 C(i,j) -= A(i,k ) * B(k ,j);
3188 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Term at kpos, i.e. the leftover term of an odd-sized range.
// NOTE(review): the guard that opens this statement is not visible in this excerpt.
3191 C(i,j) -= A(i,kpos) * B(kpos,j);
// Default (scalar) subtraction-assignment kernel. The EnableIf_t condition below selects
// this overload when C is row-major, A is not diagonal and B is diagonal.
// Visible update: C(i,j) -= A(i,j) * B(j,j), row by row, columns unrolled by two.
// NOTE(review): this excerpt elides several lines (remaining template parameters,
// braces, else-branches of the bound computations).
3213 template<
typename MT3
3216 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3217 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3219 const size_t M( A.rows() );
3220 const size_t N( B.columns() );
3222 for(
size_t i=0UL; i<M; ++i )
// For upper A the column range of row i starts at the diagonal (one past it when strictly upper).
3224 const size_t jbegin( ( IsUpper_v<MT4> )
3225 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
// For lower A the column range of row i ends at the diagonal (exclusive bound i+1, or i when strictly lower).
3227 const size_t jend( ( IsLower_v<MT4> )
3228 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, so [jbegin,jpos) holds an even number of columns.
3232 const size_t jnum( jend - jbegin );
3233 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two columns per iteration; only the diagonal element B(j,j) of B contributes to column j.
3235 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3236 C(i,j ) -= A(i,j ) * B(j ,j );
3237 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Update for the column at jpos, i.e. the leftover column of an odd-sized range.
// NOTE(review): the guard that opens this statement is not visible in this excerpt.
3240 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default (scalar) subtraction-assignment kernel. The EnableIf_t condition below selects
// this overload when C is column-major, A is not diagonal and B is diagonal.
// Visible update: C(i,j) -= A(i,j) * B(j,j), traversed in block x block tiles.
// NOTE(review): this excerpt elides several lines (remaining template parameters,
// braces, else-branches of the bound computations).
3261 template<
typename MT3
3264 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3265 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Tile edge length used for both the column and the row blocking.
3267 constexpr
size_t block( BLOCK_SIZE );
3269 const size_t M( A.rows() );
3270 const size_t N( B.columns() );
// Outer tiling over columns [jj,jend) ...
3272 for(
size_t jj=0UL; jj<N; jj+=block ) {
3273 const size_t jend(
min( N, jj+block ) );
// ... and over rows [ii,iend).
3274 for(
size_t ii=0UL; ii<M; ii+=block ) {
3275 const size_t iend(
min( M, ii+block ) );
3276 for(
size_t j=jj; j<jend; ++j )
// For lower A the row range of column j starts at the diagonal (one past it when
// strictly lower), clamped to the tile start ii. The non-lower branch is not visible here.
3278 const size_t ibegin( ( IsLower_v<MT4> )
3279 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
// For upper A the row range of column j ends at the diagonal (exclusive bound j+1, or j
// when strictly upper), clamped to the tile end iend. The non-upper branch is not visible here.
3281 const size_t ipos( ( IsUpper_v<MT4> )
3282 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
3285 for(
size_t i=ibegin; i<ipos; ++i ) {
// Only the diagonal element B(j,j) of B contributes to column j.
3286 C(i,j) -= A(i,j) * B(j,j);
// Default (scalar) subtraction-assignment kernel. The EnableIf_t condition below selects
// this overload when C is row-major, A is diagonal and B is not diagonal.
// Visible update: C(i,j) -= A(i,i) * B(i,j), traversed in block x block tiles.
// NOTE(review): this excerpt elides several lines (remaining template parameters,
// braces, else-branches of the bound computations).
3309 template<
typename MT3
3312 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3313 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Tile edge length used for both the row and the column blocking.
3315 constexpr
size_t block( BLOCK_SIZE );
3317 const size_t M( A.rows() );
3318 const size_t N( B.columns() );
// Outer tiling over rows [ii,iend) ...
3320 for(
size_t ii=0UL; ii<M; ii+=block ) {
3321 const size_t iend(
min( M, ii+block ) );
// ... and over columns [jj,jend).
3322 for(
size_t jj=0UL; jj<N; jj+=block ) {
3323 const size_t jend(
min( N, jj+block ) );
3324 for(
size_t i=ii; i<iend; ++i )
// For upper B the column range of row i starts at the diagonal (one past it when
// strictly upper), clamped to the tile start jj. The non-upper branch is not visible here.
3326 const size_t jbegin( ( IsUpper_v<MT5> )
3327 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
// For lower B the column range of row i ends at the diagonal (exclusive bound i+1, or i
// when strictly lower), clamped to the tile end jend. The non-lower branch is not visible here.
3329 const size_t jpos( ( IsLower_v<MT5> )
3330 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
3333 for(
size_t j=jbegin; j<jpos; ++j ) {
// Only the diagonal element A(i,i) of A contributes to row i.
3334 C(i,j) -= A(i,i) * B(i,j);
// Default (scalar) subtraction-assignment kernel. The EnableIf_t condition below selects
// this overload when C is column-major, A is diagonal and B is not diagonal.
// Visible update: C(i,j) -= A(i,i) * B(i,j), column by column, rows unrolled by two.
// NOTE(review): this excerpt elides several lines (remaining template parameters,
// braces, else-branches of the bound computations).
3357 template<
typename MT3
3360 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3361 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3363 const size_t M( A.rows() );
3364 const size_t N( B.columns() );
3366 for(
size_t j=0UL; j<N; ++j )
// For lower B the row range of column j starts at the diagonal (one past it when strictly lower).
3368 const size_t ibegin( ( IsLower_v<MT5> )
3369 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// For upper B the row range of column j ends at the diagonal (exclusive bound j+1, or j when strictly upper).
3371 const size_t iend( ( IsUpper_v<MT5> )
3372 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) holds an even number of rows.
3376 const size_t inum( iend - ibegin );
3377 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
3379 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3380 C(i ,j) -= A(i ,i ) * B(i ,j);
3381 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Update for the row at ipos, i.e. the leftover row of an odd-sized range.
// NOTE(review): the guard that opens this statement is not visible in this excerpt.
3384 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel. The EnableIf_t condition below selects this
// overload when both A and B are diagonal; only the diagonal of C is updated.
// NOTE(review): the remaining template parameters and the braces are elided in this excerpt.
3405 template<
typename MT3
3408 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3409 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// One multiply-subtract per diagonal element, iterating over the rows of A.
3411 for(
size_t i=0UL; i<A.rows(); ++i ) {
3412 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel, fallback overload: enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It simply forwards to the
// default kernel.
3432 template<
typename MT3
3435 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3436 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3438 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel. The EnableIf_t condition below
// selects this overload when C is row-major and UseVectorizedDefaultKernel_v holds.
// Visible structure: C is processed in 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 element blocks;
// for each block the products along k are accumulated (with +=) in SIMD registers, and
// the reduced sum() is then subtracted from C, followed by a scalar loop over the
// remaining k values.
// NOTE(review): this excerpt elides many lines (remaining template parameters, braces,
// declarations of i and k, the SIMD loop headers and part of the accumulation
// statements); comments below describe only what is visible.
3458 template<
typename MT3
3461 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3462 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass over k is needed unless both operands are padded.
3464 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3466 const size_t M( A.rows() );
3467 const size_t N( B.columns() );
3468 const size_t K( A.columns() );
// ---- Two rows of C per iteration ----
3474 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the row pair: ends at i+2 when LOW is set, starts at i when UPP is set.
3476 const size_t jend(
LOW ? i+2UL : N );
3477 size_t j(
UPP ? i : 0UL );
// 2x4 blocks; skipped entirely when both LOW and UPP are set.
3479 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// First k index for upper A / lower B; "& size_t(-SIMDSIZE)" rounds it down to a
// multiple of SIMDSIZE (assuming SIMDSIZE is a power of two - TODO confirm).
3481 const size_t kbegin( ( IsUpper_v<MT4> )
3482 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3483 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// One-past-last k index for lower A / upper B, otherwise the full inner dimension K.
3484 const size_t kend( ( IsLower_v<MT4> )
3485 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
3486 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// End of the SIMD part: kend rounded down the same way when a remainder pass exists.
3488 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight accumulators, one per element of the 2x4 block.
3491 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3496 const SIMDType a2( A.load(i+1UL,k) );
3498 const SIMDType b2( B.load(k,j+1UL) );
3499 const SIMDType b3( B.load(k,j+2UL) );
3500 const SIMDType b4( B.load(k,j+3UL) );
// Reduce each accumulator and subtract the result from the matching element of C.
3511 C(i ,j ) -=
sum( xmm1 );
3512 C(i ,j+1UL) -=
sum( xmm2 );
3513 C(i ,j+2UL) -=
sum( xmm3 );
3514 C(i ,j+3UL) -=
sum( xmm4 );
3515 C(i+1UL,j ) -=
sum( xmm5 );
3516 C(i+1UL,j+1UL) -=
sum( xmm6 );
3517 C(i+1UL,j+2UL) -=
sum( xmm7 );
3518 C(i+1UL,j+3UL) -=
sum( xmm8 );
// Scalar pass over the k values up to kend not handled by the SIMD part.
3520 for( ; remainder && k<kend; ++k ) {
3521 C(i ,j ) -= A(i ,k) * B(k,j );
3522 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3523 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3524 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3525 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3526 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3527 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3528 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2x2 blocks for the columns left over by the 2x4 loop.
3532 for( ; (j+2UL) <= jend; j+=2UL )
3534 const size_t kbegin( ( IsUpper_v<MT4> )
3535 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3536 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3537 const size_t kend( ( IsLower_v<MT4> )
3538 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3539 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3541 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3549 const SIMDType a2( A.load(i+1UL,k) );
3551 const SIMDType b2( B.load(k,j+1UL) );
3558 C(i ,j ) -=
sum( xmm1 );
3559 C(i ,j+1UL) -=
sum( xmm2 );
3560 C(i+1UL,j ) -=
sum( xmm3 );
3561 C(i+1UL,j+1UL) -=
sum( xmm4 );
3563 for( ; remainder && k<kend; ++k ) {
3564 C(i ,j ) -= A(i ,k) * B(k,j );
3565 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3566 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3567 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 block: a single column for the current row pair.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
3573 const size_t kbegin( ( IsUpper_v<MT4> )
3574 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3575 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3576 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3578 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Accumulation stays additive; the subtraction happens when the sums are applied to C.
3586 xmm1 += A.load(i ,k) * b1;
3587 xmm2 += A.load(i+1UL,k) * b1;
3590 C(i ,j) -=
sum( xmm1 );
3591 C(i+1UL,j) -=
sum( xmm2 );
3593 for( ; remainder && k<kend; ++k ) {
3594 C(i ,j) -= A(i ,k) * B(k,j);
3595 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Single row of C (1x4, 1x2, 1x1 blocks) ----
// NOTE(review): the statement that opens the single-row case is not visible in this excerpt.
3602 const size_t jend(
LOW ? i+1UL : N );
3603 size_t j(
UPP ? i : 0UL );
// 1x4 blocks; skipped entirely when both LOW and UPP are set.
3605 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
3607 const size_t kbegin( ( IsUpper_v<MT4> )
3608 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3609 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3610 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
3612 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3620 xmm1 += a1 * B.load(k,j );
3621 xmm2 += a1 * B.load(k,j+1UL);
3622 xmm3 += a1 * B.load(k,j+2UL);
3623 xmm4 += a1 * B.load(k,j+3UL);
3626 C(i,j ) -=
sum( xmm1 );
3627 C(i,j+1UL) -=
sum( xmm2 );
3628 C(i,j+2UL) -=
sum( xmm3 );
3629 C(i,j+3UL) -=
sum( xmm4 );
3631 for( ; remainder && k<kend; ++k ) {
3632 C(i,j ) -= A(i,k) * B(k,j );
3633 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3634 C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3635 C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1x2 blocks.
3639 for( ; (j+2UL) <= jend; j+=2UL )
3641 const size_t kbegin( ( IsUpper_v<MT4> )
3642 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3643 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3644 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3646 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3654 xmm1 += a1 * B.load(k,j );
3655 xmm2 += a1 * B.load(k,j+1UL);
3658 C(i,j ) -=
sum( xmm1 );
3659 C(i,j+1UL) -=
sum( xmm2 );
3661 for( ; remainder && k<kend; ++k ) {
3662 C(i,j ) -= A(i,k) * B(k,j );
3663 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 block: the k range runs up to the full inner dimension K.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
3669 const size_t kbegin( ( IsUpper_v<MT4> )
3670 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3671 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3673 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
3680 xmm1 += A.load(i,k) * B.load(k,j);
3683 C(i,j) -=
sum( xmm1 );
3685 for( ; remainder && k<K; ++k ) {
3686 C(i,j) -= A(i,k) * B(k,j);
// Vectorized small-matrix subtraction-assignment kernel. The EnableIf_t condition below
// selects this overload when C is column-major and UseVectorizedDefaultKernel_v holds.
// Visible structure: C is processed in 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 element blocks;
// for each block the products along k are accumulated (with +=) in SIMD registers, and
// the reduced sum() is then subtracted from C, followed by a scalar loop over the
// remaining k values.
// NOTE(review): this excerpt elides many lines (remaining template parameters, braces,
// declarations of i, j and k, the SIMD loop headers and part of the accumulation
// statements); comments below describe only what is visible.
3709 template<
typename MT3
3712 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3713 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass over k is needed unless both operands are padded.
3715 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3717 const size_t M( A.rows() );
3718 const size_t N( B.columns() );
3719 const size_t K( A.columns() );
// ---- Four rows of C per iteration; only used when neither LOW nor UPP is set ----
3725 for( ; !
LOW && !
UPP && (i+4UL) <= M; i+=4UL )
// 4x2 blocks over the full column range.
3729 for( ; (j+2UL) <= N; j+=2UL )
// First k index for upper A / lower B; "& size_t(-SIMDSIZE)" rounds it down to a
// multiple of SIMDSIZE (assuming SIMDSIZE is a power of two - TODO confirm).
3731 const size_t kbegin( ( IsUpper_v<MT4> )
3732 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3733 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// One-past-last k index for lower A / upper B, otherwise the full inner dimension K.
3734 const size_t kend( ( IsLower_v<MT4> )
3735 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
3736 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// End of the SIMD part: kend rounded down the same way when a remainder pass exists.
3738 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight accumulators, one per element of the 4x2 block.
3741 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3746 const SIMDType a2( A.load(i+1UL,k) );
3747 const SIMDType a3( A.load(i+2UL,k) );
3748 const SIMDType a4( A.load(i+3UL,k) );
3750 const SIMDType b2( B.load(k,j+1UL) );
// Reduce each accumulator and subtract the result from the matching element of C.
3761 C(i ,j ) -=
sum( xmm1 );
3762 C(i ,j+1UL) -=
sum( xmm2 );
3763 C(i+1UL,j ) -=
sum( xmm3 );
3764 C(i+1UL,j+1UL) -=
sum( xmm4 );
3765 C(i+2UL,j ) -=
sum( xmm5 );
3766 C(i+2UL,j+1UL) -=
sum( xmm6 );
3767 C(i+3UL,j ) -=
sum( xmm7 );
3768 C(i+3UL,j+1UL) -=
sum( xmm8 );
// Scalar pass over the k values up to kend not handled by the SIMD part.
3770 for( ; remainder && k<kend; ++k ) {
3771 C(i ,j ) -= A(i ,k) * B(k,j );
3772 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3773 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3774 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3775 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3776 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3777 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3778 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1 block: a single column for the current four rows.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
3784 const size_t kbegin( ( IsUpper_v<MT4> )
3785 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3786 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3787 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
3789 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Accumulation stays additive; the subtraction happens when the sums are applied to C.
3797 xmm1 += A.load(i ,k) * b1;
3798 xmm2 += A.load(i+1UL,k) * b1;
3799 xmm3 += A.load(i+2UL,k) * b1;
3800 xmm4 += A.load(i+3UL,k) * b1;
3803 C(i ,j) -=
sum( xmm1 );
3804 C(i+1UL,j) -=
sum( xmm2 );
3805 C(i+2UL,j) -=
sum( xmm3 );
3806 C(i+3UL,j) -=
sum( xmm4 );
3808 for( ; remainder && k<kend; ++k ) {
3809 C(i ,j ) -= A(i ,k) * B(k,j );
3810 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3811 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3812 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
// ---- Two rows of C per iteration ----
3817 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the row pair: ends at i+2 when LOW is set, starts at i when UPP is set.
3819 const size_t jend(
LOW ? i+2UL : N );
3820 size_t j(
UPP ? i : 0UL );
// 2x2 blocks.
3822 for( ; (j+2UL) <= jend; j+=2UL )
3824 const size_t kbegin( ( IsUpper_v<MT4> )
3825 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3826 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3827 const size_t kend( ( IsLower_v<MT4> )
3828 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3829 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3831 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3839 const SIMDType a2( A.load(i+1UL,k) );
3841 const SIMDType b2( B.load(k,j+1UL) );
3848 C(i ,j ) -=
sum( xmm1 );
3849 C(i ,j+1UL) -=
sum( xmm2 );
3850 C(i+1UL,j ) -=
sum( xmm3 );
3851 C(i+1UL,j+1UL) -=
sum( xmm4 );
3853 for( ; remainder && k<kend; ++k ) {
3854 C(i ,j ) -= A(i ,k) * B(k,j );
3855 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3856 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3857 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 block: a single column for the current row pair.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
3863 const size_t kbegin( ( IsUpper_v<MT4> )
3864 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3865 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3866 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3868 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3876 xmm1 += A.load(i ,k) * b1;
3877 xmm2 += A.load(i+1UL,k) * b1;
3880 C(i ,j) -=
sum( xmm1 );
3881 C(i+1UL,j) -=
sum( xmm2 );
3883 for( ; remainder && k<kend; ++k ) {
3884 C(i ,j) -= A(i ,k) * B(k,j);
3885 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Single row of C (1x2, 1x1 blocks) ----
// NOTE(review): the statement that opens the single-row case is not visible in this excerpt.
3892 const size_t jend(
LOW ? i+1UL : N );
3893 size_t j(
UPP ? i : 0UL );
// 1x2 blocks.
3895 for( ; (j+2UL) <= jend; j+=2UL )
3897 const size_t kbegin( ( IsUpper_v<MT4> )
3898 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3899 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3900 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3902 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3910 xmm1 += a1 * B.load(k,j );
3911 xmm2 += a1 * B.load(k,j+1UL);
3914 C(i,j ) -=
sum( xmm1 );
3915 C(i,j+1UL) -=
sum( xmm2 );
3917 for( ; remainder && k<kend; ++k ) {
3918 C(i,j ) -= A(i,k) * B(k,j );
3919 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 block: the k range runs up to the full inner dimension K.
// NOTE(review): the statement that opens this case is not visible in this excerpt.
3925 const size_t kbegin( ( IsUpper_v<MT4> )
3926 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3927 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3929 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
3936 xmm1 += A.load(i,k) * B.load(k,j);
3939 C(i,j) -=
sum( xmm1 );
3941 for( ; remainder && k<K; ++k ) {
3942 C(i,j) -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel, fallback overload: enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false. It simply forwards to the
// default kernel.
3964 template<
typename MT3
3967 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3968 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3970 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction-assignment kernel, vectorized overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the body of this overload is not visible in this excerpt.
3990 template<
typename MT3
3993 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3994 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction-assignment kernel, fallback overload: enabled (via DisableIf_t) when
// UseBlasKernel_v<MT3,MT4,MT5> is false. It forwards to the large-matrix kernel.
4020 template<
typename MT3
4023 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4024 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4026 selectLargeSubAssignKernel( C, A, B );
// BLAS-based subtraction-assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and selected when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): braces, the final else and the closing #endif are not visible in this excerpt.
4032 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4046 template<
typename MT3
4049 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4050 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Element type of the target; used for the scalar factors passed to trmm/gemm.
4052 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, apply A from the left via trmm (scalar ET(1)),
// then subtract the temporary from C.
4054 if( IsTriangular_v<MT4> ) {
4055 ResultType_t<MT3> tmp(
serial( B ) );
4056 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4057 subAssign( C, tmp );
// Triangular B: copy A into a temporary, apply B from the right via trmm (scalar ET(1)),
// then subtract the temporary from C.
4059 else if( IsTriangular_v<MT5> ) {
4060 ResultType_t<MT3> tmp(
serial( A ) );
4061 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4062 subAssign( C, tmp );
// General case: gemm with scalar factors ET(-1) and ET(1).
// NOTE(review): presumably computes C = -1*A*B + 1*C - confirm against the gemm wrapper.
4065 gemm( C, A, B, ET(-1), ET(1) );
// Schur-product assignment of the matrix product expression rhs to the dense matrix lhs.
// The visible code forwards to schurAssign() on ~lhs with a temporary named tmp.
// NOTE(review): the construction of tmp and the rest of the body are not visible in
// this excerpt.
4089 template<
typename MT
4091 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
4103 schurAssign( ~lhs, tmp );
4135 template<
typename MT
4138 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4145 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4148 else if( rhs.lhs_.columns() == 0UL ) {
4183 template<
typename MT
4186 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4190 using TmpType = If_t< SO, OppositeType, ResultType >;
4202 const ForwardFunctor fwd;
4204 const TmpType tmp( rhs );
4226 template<
typename MT
4229 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4236 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4275 template<
typename MT
4278 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4285 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4321 template<
typename MT
// Partial specialization of DMatScalarMultExpr for the case where the matrix
// operand is a DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, the scalar has type ST
// and the third template argument is false. It derives publicly from
// MatScalarMultExpr< DenseMatrix< ..., false > > and privately from
// Computation.
4381 template<
typename MT1
4388 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4389 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4390 ,
private Computation
// Shorthand aliases: MMM is the wrapped multiplication expression type and
// RES its result type; RT1/RT2 are the result types of the operands MT1/MT2,
// ET1/ET2 the element types of those result types, and CT1/CT2 the composite
// types of MT1/MT2.
4395 using MMM = DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4397 using RES = ResultType_t<MMM>;
4398 using RT1 = ResultType_t<MT1>;
4399 using RT2 = ResultType_t<MT2>;
4400 using ET1 = ElementType_t<RT1>;
4401 using ET2 = ElementType_t<RT2>;
4402 using CT1 = CompositeType_t<MT1>;
4403 using CT2 = CompositeType_t<MT2>;
// evaluateLeft: true when the left operand type MT1 is a computation or
// requires evaluation.
4408 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// evaluateRight: the same test for the right operand type MT2.
4413 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure flags derived from the template flags SF, HF, LF, UF:
// SYM holds only if SF is set and none of HF/LF/UF is; HERM only if HF is set
// and neither LF nor UF is; LOW (UPP) holds if LF (UF) is set, or if SF or HF
// is combined with the opposite triangular flag UF (LF).
4417 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
4418 static constexpr
bool HERM = ( HF && !( LF || UF ) );
4419 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4420 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Compile-time switch that is true when either operand has to be evaluated
// (evaluateLeft || evaluateRight). The template parameters T1, T2, T3 do not
// appear in the visible expression.
4428 template<
typename T1,
typename T2,
typename T3 >
4429 static constexpr
bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
// Compile-time switch for selecting the BLAS kernels; T1 is the target type,
// T2/T3 the operand types and T4 the scalar type. The visible conjunction
// requires:
//  - none of the structure flags SYM, HERM, LOW, UPP,
//  - contiguous storage for all three matrices, mutable data access for T1
//    and const data access for T2 and T3,
//  - non-diagonal operands and simdEnabled on all three types,
//  - BLAS-compatible and pairwise identical element types,
//  - not the combination "builtin element type of T1 with complex T4".
// NOTE(review): the first line of the expression is not visible in this
// excerpt, so further leading conditions may exist.
4436 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4437 static constexpr
bool UseBlasKernel_v =
4439 !SYM && !HERM && !LOW && !UPP &&
4440 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4441 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4442 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4443 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4444 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4445 IsBLASCompatible_v< ElementType_t<T1> > &&
4446 IsBLASCompatible_v< ElementType_t<T2> > &&
4447 IsBLASCompatible_v< ElementType_t<T3> > &&
4448 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4449 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4450 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch for selecting the vectorized default kernels. The
// visible conditions require useOptimizedKernels, non-diagonal operands T2
// and T3, simdEnabled on all three matrix types, a SIMD-combinable set of
// element types, and SIMD addition and multiplication for the element types
// of T2 and T3.
// NOTE(review): the argument list of IsSIMDCombinable_v is truncated in this
// excerpt; its remaining arguments cannot be confirmed from here.
4457 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4458 static constexpr
bool UseVectorizedDefaultKernel_v =
4459 ( useOptimizedKernels &&
4460 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4461 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4462 IsSIMDCombinable_v< ElementType_t<T1>
4466 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4467 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Functor type chosen at compile time; the first visible selector is HERM.
// NOTE(review): the alternatives of this If_t chain are not visible in this
// excerpt.
4474 using ForwardFunctor =
If_t< HERM
// This: the type of this specialization; BaseType: its DenseMatrix base.
4490 using This = DMatScalarMultExpr<MMM,ST,false>;
4493 using BaseType = DenseMatrix<This,false>;
// Tail of a trait selection built from MultTrait_t<RES,ST> wrapped in the
// DeclHerm/DeclSym/DeclDiag/DeclLow/DeclUpp traits, with MultTrait<RES,ST> as
// the last alternative.
// NOTE(review): the alias name and the selecting conditions are not visible
// in this excerpt.
4497 , DeclHermTrait< MultTrait_t<RES,ST> >
4499 , DeclSymTrait< MultTrait_t<RES,ST> >
4502 , DeclDiagTrait< MultTrait_t<RES,ST> >
4503 , DeclLowTrait< MultTrait_t<RES,ST> > >
4505 , DeclUppTrait< MultTrait_t<RES,ST> >
4506 , MultTrait<RES,ST> > > > >::Type;
// SIMD type associated with ElementType.
4511 using SIMDType = SIMDTrait_t<ElementType>;
// The left-hand operand of the scaling is the (const) multiplication
// expression itself.
4516 using LeftOperand =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// LT / RT: an operand is held as its const result type when it has to be
// evaluated, otherwise as its composite type.
4522 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4525 using RT = If_t< evaluateRight, const RT2, CT2 >;
// Initializer of a compile-time flag: non-diagonal operands, simdEnabled on
// both operand types, SIMD-combinable ET1/ET2/ST, and SIMD add and multiply
// for ET1/ET2.
// NOTE(review): the declaration this initializer belongs to is not visible.
4531 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
4532 MT1::simdEnabled && MT2::simdEnabled &&
4533 IsSIMDCombinable_v<ET1,ET2,ST> &&
4534 HasSIMDAdd_v<ET1,ET2> &&
4535 HasSIMDMult_v<ET1,ET2> );
// Initializer of a second flag: neither operand needs evaluation and both
// operand types are smpAssignable.
// NOTE(review): the declaration line is not visible here either.
4539 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4585 if( j >=
matrix_.columns() ) {
4588 return (*
this)(i,j);
// Const accessor returning a size_t.
// NOTE(review): the body is not visible in this excerpt; presumably it
// reports the row count of the wrapped expression; confirm.
4597 inline size_t rows()
const {
// Const accessor returning a size_t.
// NOTE(review): the body is not visible in this excerpt; presumably it
// reports the column count of the wrapped expression; confirm.
4607 inline size_t columns()
const {
// Alias query: forwards the given address to matrix_.canAlias() and returns
// its result unchanged.
4638 template<
typename T >
4639 inline bool canAlias(
const T* alias )
const {
4640 return matrix_.canAlias( alias );
// Alias query: forwards the given address to matrix_.isAliased() and returns
// its result unchanged.
4650 template<
typename T >
4651 inline bool isAliased(
const T* alias )
const {
4652 return matrix_.isAliased( alias );
4673 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4675 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4676 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
4698 template<
typename MT
4707 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4708 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
4710 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4713 else if( left.columns() == 0UL ) {
4728 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the scaled assignment C = scalar * A * B. The small
// kernel is chosen when either operand is diagonal or when
// C.rows() * C.columns() is below DMATTDMATMULT_THRESHOLD; the other visible
// call goes to the BLAS kernel selector.
// NOTE(review): the `else` separating the two calls is not visible in this
// excerpt.
4743 template<
typename MT3
4747 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4749 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
4750 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4751 selectSmallAssignKernel( C, A, B, scalar );
4753 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) scaled assignment kernel for a row-major target C
// and two non-diagonal operands A and B. It iterates row by row and restricts
// the column range [jbegin,jend) and the summation range [kbegin,kend) using
// the triangular properties of A and B and the flags SYM/HERM/LOW/UPP.
// NOTE(review): several lines (else branches, loop bodies outside the
// computed range, and the place where `scalar` is applied) are not visible in
// this excerpt.
4771 template<
typename MT3
4775 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4776 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4778 const size_t M( A.rows() );
4779 const size_t N( B.columns() );
4780 const size_t K( A.columns() );
// Row range that is actually computed: leading rows are excluded for a
// strictly lower A, trailing rows for a strictly upper A.
4784 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
4785 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
4787 const size_t iend( ( IsStrictlyUpper_v<MT4> )
4788 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
// Rows in front of the computed range: every column is visited.
4792 for(
size_t i=0UL; i<ibegin; ++i ) {
4793 for(
size_t j=0UL; j<N; ++j ) {
4797 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i, narrowed by the upper/lower structure of A and B and
// by the SYM/HERM/UPP (begin) and LOW (end) flags.
4799 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4800 ?( ( IsStrictlyUpper_v<MT4> )
4801 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
4802 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
4803 :( ( IsStrictlyUpper_v<MT5> )
4804 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
4805 :( SYM || HERM || UPP ? i : 0UL ) ) );
4806 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
4807 ?( ( IsStrictlyLower_v<MT4> )
4808 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
4809 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
4810 :( ( IsStrictlyLower_v<MT5> )
4811 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
4812 :( LOW ? i+1UL : N ) ) );
// With a structure flag set the column range can come out empty; the whole
// row is then visited by the loop below instead.
4814 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
4815 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of jbegin (starting at i for SYM/HERM, otherwise at 0).
4823 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
4826 for(
size_t j=jbegin; j<jend; ++j )
// Summation range for element (i,j), narrowed by the structure of A and B.
4828 const size_t kbegin( ( IsUpper_v<MT4> )
4829 ?( ( IsLower_v<MT5> )
4830 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4831 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4832 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4833 :( ( IsLower_v<MT5> )
4834 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4836 const size_t kend( ( IsLower_v<MT4> )
4837 ?( ( IsUpper_v<MT5> )
4838 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4839 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4840 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4841 :( ( IsUpper_v<MT5> )
4842 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initializes C(i,j); the remaining ones are accumulated.
4846 C(i,j) = A(i,kbegin) * B(kbegin,j);
4847 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4848 C(i,j) += A(i,k) * B(k,j);
// Columns behind the computed range.
4852 for(
size_t j=jend; j<N; ++j ) {
// Rows behind the computed range.
4856 for(
size_t i=iend; i<M; ++i ) {
4857 for(
size_t j=0UL; j<N; ++j ) {
// Fill the part below the diagonal from the transposed position, conjugated
// in the HERM case.
// NOTE(review): the condition guarding this pass is not visible here.
4863 for(
size_t i=1UL; i<M; ++i ) {
4864 for(
size_t j=0UL; j<i; ++j ) {
4865 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default (non-vectorized) scaled assignment kernel for a column-major target
// C and two non-diagonal operands. It iterates column by column and restricts
// the row range [ibegin,iend) and the summation range [kbegin,kend) using the
// triangular properties of A and B and the flags SYM/HERM/LOW/UPP.
// NOTE(review): several lines (else branches, loop bodies outside the
// computed range, and the place where `scalar` is applied) are not visible in
// this excerpt.
4886 template<
typename MT3
4890 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4891 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4893 const size_t M( A.rows() );
4894 const size_t N( B.columns() );
4895 const size_t K( A.columns() );
// Column range that is actually computed: leading columns are excluded for a
// strictly upper B, trailing columns for a strictly lower B.
4899 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
4900 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
4902 const size_t jend( ( IsStrictlyLower_v<MT5> )
4903 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
// Columns in front of the computed range: every row is visited.
4907 for(
size_t j=0UL; j<jbegin; ++j ) {
4908 for(
size_t i=0UL; i<M; ++i ) {
4912 for(
size_t j=jbegin; j<jend; ++j )
// Row range of column j, narrowed by the lower/upper structure of A and B and
// by the SYM/HERM/LOW (begin) and UPP (end) flags.
4914 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
4915 ?( ( IsStrictlyLower_v<MT4> )
4916 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
4917 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4918 :( ( IsStrictlyLower_v<MT4> )
4919 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
4920 :( SYM || HERM || LOW ? j : 0UL ) ) );
4921 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4922 ?( ( IsStrictlyUpper_v<MT4> )
4923 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
4924 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
4925 :( ( IsStrictlyUpper_v<MT4> )
4926 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
4927 :( UPP ? j+1UL : M ) ) );
// With a structure flag set the row range can come out empty; the whole
// column is then visited by the loop below instead.
4929 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
4930 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of ibegin (starting at j for SYM/HERM, otherwise at 0).
4938 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
4941 for(
size_t i=ibegin; i<iend; ++i )
// Summation range for element (i,j), narrowed by the structure of A and B.
4943 const size_t kbegin( ( IsUpper_v<MT4> )
4944 ?( ( IsLower_v<MT5> )
4945 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4946 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4947 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4948 :( ( IsLower_v<MT5> )
4949 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4951 const size_t kend( ( IsLower_v<MT4> )
4952 ?( ( IsUpper_v<MT5> )
4953 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4954 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4955 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4956 :( ( IsUpper_v<MT5> )
4957 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initializes C(i,j); the remaining ones are accumulated.
4961 C(i,j) = A(i,kbegin) * B(kbegin,j);
4962 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4963 C(i,j) += A(i,k) * B(k,j);
// Rows behind the computed range.
4967 for(
size_t i=iend; i<M; ++i ) {
// Columns behind the computed range.
4971 for(
size_t j=jend; j<N; ++j ) {
4972 for(
size_t i=0UL; i<M; ++i ) {
// Fill the part above the diagonal from the transposed position, conjugated
// in the HERM case.
// NOTE(review): the condition guarding this pass is not visible here.
4978 for(
size_t j=1UL; j<N; ++j ) {
4979 for(
size_t i=0UL; i<j; ++i ) {
4980 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default scaled assignment kernel for a row-major target, a non-diagonal A
// and a diagonal B. Only the diagonal of B contributes, so each element is
// C(i,j) = A(i,j) * B(j,j) * scalar within the column range given by the
// triangular structure of A.
// NOTE(review): the else branches of jbegin/jend and the bodies of the two
// outer-range loops are not visible in this excerpt.
5001 template<
typename MT3
5005 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5006 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5008 const size_t M( A.rows() );
5009 const size_t N( B.columns() );
5011 for(
size_t i=0UL; i<M; ++i )
5013 const size_t jbegin( ( IsUpper_v<MT4> )
5014 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5016 const size_t jend( ( IsLower_v<MT4> )
5017 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Upper A: columns in front of jbegin are handled separately.
5021 if( IsUpper_v<MT4> ) {
5022 for(
size_t j=0UL; j<jbegin; ++j ) {
5026 for(
size_t j=jbegin; j<jend; ++j ) {
5027 C(i,j) = A(i,j) * B(j,j) * scalar;
// Lower A: columns from jend on are handled separately.
5029 if( IsLower_v<MT4> ) {
5030 for(
size_t j=jend; j<N; ++j ) {
// Default scaled assignment kernel for a column-major target, a non-diagonal
// A and a diagonal B. The index space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE; inside a tile each element is
// C(i,j) = A(i,j) * B(j,j) * scalar within the row range given by the
// triangular structure of A, clamped to the tile.
// NOTE(review): the else branches of ibegin/ipos and the bodies of the two
// outer-range loops are not visible in this excerpt.
5052 template<
typename MT3
5056 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5057 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5059 constexpr
size_t block( BLOCK_SIZE );
5061 const size_t M( A.rows() );
5062 const size_t N( B.columns() );
5064 for(
size_t jj=0UL; jj<N; jj+=block ) {
5065 const size_t jend(
min( N, jj+block ) );
5066 for(
size_t ii=0UL; ii<M; ii+=block ) {
5067 const size_t iend(
min( M, ii+block ) );
5068 for(
size_t j=jj; j<jend; ++j )
5070 const size_t ibegin( ( IsLower_v<MT4> )
5071 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
5073 const size_t ipos( ( IsUpper_v<MT4> )
5074 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// Lower A: rows of the tile in front of ibegin are handled separately.
5077 if( IsLower_v<MT4> ) {
5078 for(
size_t i=ii; i<ibegin; ++i ) {
5082 for(
size_t i=ibegin; i<ipos; ++i ) {
5083 C(i,j) = A(i,j) * B(j,j) * scalar;
// Upper A: rows of the tile from ipos on are handled separately.
5085 if( IsUpper_v<MT4> ) {
5086 for(
size_t i=ipos; i<iend; ++i ) {
// Default scaled assignment kernel for a row-major target, a diagonal A and a
// non-diagonal B. The index space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE; inside a tile each element is
// C(i,j) = A(i,i) * B(i,j) * scalar within the column range given by the
// triangular structure of B, clamped to the tile.
// NOTE(review): the else branches of jbegin/jpos and the bodies of the two
// outer-range loops are not visible in this excerpt.
5110 template<
typename MT3
5114 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5115 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5117 constexpr
size_t block( BLOCK_SIZE );
5119 const size_t M( A.rows() );
5120 const size_t N( B.columns() );
5122 for(
size_t ii=0UL; ii<M; ii+=block ) {
5123 const size_t iend(
min( M, ii+block ) );
5124 for(
size_t jj=0UL; jj<N; jj+=block ) {
5125 const size_t jend(
min( N, jj+block ) );
5126 for(
size_t i=ii; i<iend; ++i )
5128 const size_t jbegin( ( IsUpper_v<MT5> )
5129 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
5131 const size_t jpos( ( IsLower_v<MT5> )
5132 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// Upper B: columns of the tile in front of jbegin are handled separately.
5135 if( IsUpper_v<MT5> ) {
5136 for(
size_t j=jj; j<jbegin; ++j ) {
5140 for(
size_t j=jbegin; j<jpos; ++j ) {
5141 C(i,j) = A(i,i) * B(i,j) * scalar;
// Lower B: columns of the tile from jpos on are handled separately.
5143 if( IsLower_v<MT5> ) {
5144 for(
size_t j=jpos; j<jend; ++j ) {
// Default scaled assignment kernel for a column-major target, a diagonal A
// and a non-diagonal B. Each element is C(i,j) = A(i,i) * B(i,j) * scalar
// within the row range given by the triangular structure of B.
// NOTE(review): the else branches of ibegin/iend and the bodies of the two
// outer-range loops are not visible in this excerpt.
5168 template<
typename MT3
5172 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5173 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5175 const size_t M( A.rows() );
5176 const size_t N( B.columns() );
5178 for(
size_t j=0UL; j<N; ++j )
5180 const size_t ibegin( ( IsLower_v<MT5> )
5181 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5183 const size_t iend( ( IsUpper_v<MT5> )
5184 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Lower B: rows in front of ibegin are handled separately.
5188 if( IsLower_v<MT5> ) {
5189 for(
size_t i=0UL; i<ibegin; ++i ) {
5193 for(
size_t i=ibegin; i<iend; ++i ) {
5194 C(i,j) = A(i,i) * B(i,j) * scalar;
// Upper B: rows from iend on are handled separately.
5196 if( IsUpper_v<MT5> ) {
5197 for(
size_t i=iend; i<M; ++i ) {
// Default scaled assignment kernel for two diagonal operands: the visible
// loop writes only the diagonal, C(i,i) = A(i,i) * B(i,i) * scalar, for
// i in [0, A.rows()).
// NOTE(review): any statement preceding the loop (e.g. clearing C) is not
// visible in this excerpt.
5219 template<
typename MT3
5223 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5224 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5228 for(
size_t i=0UL; i<A.rows(); ++i ) {
5229 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix scaled assignment kernel, fallback overload: disabled whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true; the visible body
// forwards all arguments to selectDefaultAssignKernel().
5248 template<
typename MT3
5252 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5253 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5255 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix scaled assignment kernel for a row-major target C.
// C is processed in register tiles (2x4, 2x2, 2x1 for pairs of rows, then
// 1x4, 1x2, 1x1 for single rows). For every tile the dot products over k are
// accumulated in SIMD registers via A.load()/B.load(), reduced with sum(),
// multiplied by `scalar`, and completed by a scalar loop over the remaining
// k when `remainder` is true.
// NOTE(review): the declarations of the loop counters i, j, k, the SIMD
// k-loop headers and the accumulation statements of the 2x4 and 2x2 tiles are
// not visible in this excerpt.
5274 template<
typename MT3
5278 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5279 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both operands are padded.
5281 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5283 const size_t M( A.rows() );
5284 const size_t N( B.columns() );
5285 const size_t K( A.columns() );
// Two rows per iteration; skipped entirely when both LOW and UPP are set.
5291 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
5293 const size_t jend( LOW ? i+2UL : N );
// Entries taken from the transposed position, conjugated in the HERM case.
5298 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
5299 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
5305 reset( C(i+1UL,j) );
// 2x4 tile.
5309 for( ; (j+4UL) <= jend; j+=4UL )
// kbegin is masked with size_t(-SIMDSIZE); for a power-of-two SIMDSIZE this
// rounds the structural start index down to a multiple of SIMDSIZE.
5311 const size_t kbegin( ( IsUpper_v<MT4> )
5312 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5313 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5314 const size_t kend( ( IsLower_v<MT4> )
5315 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
5316 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos: kend masked the same way when a remainder loop exists, else kend.
5318 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5321 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5325 const SIMDType a1( A.load(i ,k) );
5326 const SIMDType a2( A.load(i+1UL,k) );
5327 const SIMDType b1( B.load(k,j ) );
5328 const SIMDType b2( B.load(k,j+1UL) );
5329 const SIMDType b3( B.load(k,j+2UL) );
5330 const SIMDType b4( B.load(k,j+3UL) );
// Reduce each accumulator and apply the scalar factor.
5341 C(i ,j ) =
sum( xmm1 ) * scalar;
5342 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5343 C(i ,j+2UL) =
sum( xmm3 ) * scalar;
5344 C(i ,j+3UL) =
sum( xmm4 ) * scalar;
5345 C(i+1UL,j ) =
sum( xmm5 ) * scalar;
5346 C(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
5347 C(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
5348 C(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar tail over the remaining k, each product scaled individually.
5350 for( ; remainder && k<kend; ++k ) {
5351 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5352 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5353 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5354 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5355 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5356 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5357 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
5358 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 tile.
5362 for( ; (j+2UL) <= jend; j+=2UL )
5364 const size_t kbegin( ( IsUpper_v<MT4> )
5365 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5366 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5367 const size_t kend( ( IsLower_v<MT4> )
5368 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5369 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5371 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5374 SIMDType xmm1, xmm2, xmm3, xmm4;
5378 const SIMDType a1( A.load(i ,k) );
5379 const SIMDType a2( A.load(i+1UL,k) );
5380 const SIMDType b1( B.load(k,j ) );
5381 const SIMDType b2( B.load(k,j+1UL) );
5388 C(i ,j ) =
sum( xmm1 ) * scalar;
5389 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5390 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
5391 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5393 for( ; remainder && k<kend; ++k ) {
5394 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5395 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5396 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5397 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile (single remaining column of the row pair).
5403 const size_t kbegin( ( IsUpper_v<MT4> )
5404 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5405 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5406 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5408 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5411 SIMDType xmm1, xmm2;
5415 const SIMDType b1( B.load(k,j) );
5416 xmm1 += A.load(i ,k) * b1;
5417 xmm2 += A.load(i+1UL,k) * b1;
5420 C(i ,j) =
sum( xmm1 ) * scalar;
5421 C(i+1UL,j) =
sum( xmm2 ) * scalar;
5423 for( ; remainder && k<kend; ++k ) {
5424 C(i ,j) += A(i ,k) * B(k,j) * scalar;
5425 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5434 reset( C(i+1UL,j) );
// Single remaining row.
5441 const size_t jend( LOW ? i+1UL : N );
5446 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// 1x4 tile; skipped when both LOW and UPP are set.
5455 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
5457 const size_t kbegin( ( IsUpper_v<MT4> )
5458 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5459 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5460 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
5462 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5465 SIMDType xmm1, xmm2, xmm3, xmm4;
5469 const SIMDType a1( A.load(i,k) );
5470 xmm1 += a1 * B.load(k,j );
5471 xmm2 += a1 * B.load(k,j+1UL);
5472 xmm3 += a1 * B.load(k,j+2UL);
5473 xmm4 += a1 * B.load(k,j+3UL);
5476 C(i,j ) =
sum( xmm1 ) * scalar;
5477 C(i,j+1UL) =
sum( xmm2 ) * scalar;
5478 C(i,j+2UL) =
sum( xmm3 ) * scalar;
5479 C(i,j+3UL) =
sum( xmm4 ) * scalar;
5481 for( ; remainder && k<kend; ++k ) {
5482 C(i,j ) += A(i,k) * B(k,j ) * scalar;
5483 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5484 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5485 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 tile; skipped when both LOW and UPP are set.
5489 for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
5491 const size_t kbegin( ( IsUpper_v<MT4> )
5492 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5493 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5494 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5496 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5499 SIMDType xmm1, xmm2;
5503 const SIMDType a1( A.load(i,k) );
5504 xmm1 += a1 * B.load(k,j );
5505 xmm2 += a1 * B.load(k,j+1UL);
5508 C(i,j ) =
sum( xmm1 ) * scalar;
5509 C(i,j+1UL) =
sum( xmm2 ) * scalar;
5511 for( ; remainder && k<kend; ++k ) {
5512 C(i,j ) += A(i,k) * B(k,j ) * scalar;
5513 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the summation runs up to K here.
5519 const size_t kbegin( ( IsUpper_v<MT4> )
5520 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5521 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5523 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
5530 xmm1 += A.load(i,k) * B.load(k,j);
5533 C(i,j) =
sum( xmm1 ) * scalar;
5535 for( ; remainder && k<K; ++k ) {
5536 C(i,j) += A(i,k) * B(k,j) * scalar;
// Vectorized small-matrix scaled assignment kernel for a column-major target
// C. C is processed in register tiles (4x2, 4x1 for groups of four rows; 2x2,
// 2x1 for pairs of rows; 1x2, 1x1 for single rows). For every tile the dot
// products over k are accumulated in SIMD registers via A.load()/B.load(),
// reduced with sum(), multiplied by `scalar`, and completed by a scalar loop
// over the remaining k when `remainder` is true.
// NOTE(review): the declarations of the loop counters i, j, k, the SIMD
// k-loop headers and the accumulation statements of the 4x2 and 2x2 tiles are
// not visible in this excerpt.
5566 template<
typename MT3
5570 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5571 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both operands are padded.
5573 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5575 const size_t M( A.rows() );
5576 const size_t N( B.columns() );
5577 const size_t K( A.columns() );
// Four rows per iteration; skipped entirely when both LOW and UPP are set.
5583 for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
5585 const size_t jend( LOW ? i+4UL : N );
// Entries taken from the transposed position, conjugated in the HERM case.
5590 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
5591 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
5592 C(i+2UL,j) = HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
5593 C(i+3UL,j) = HERM ?
conj( C(j,i+3UL) ) : C(j,i+3UL);
5599 reset( C(i+1UL,j) );
5600 reset( C(i+2UL,j) );
5601 reset( C(i+3UL,j) );
// 4x2 tile.
5605 for( ; (j+2UL) <= jend; j+=2UL )
// kbegin is masked with size_t(-SIMDSIZE); for a power-of-two SIMDSIZE this
// rounds the structural start index down to a multiple of SIMDSIZE.
5607 const size_t kbegin( ( IsUpper_v<MT4> )
5608 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5609 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5610 const size_t kend( ( IsLower_v<MT4> )
5611 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
5612 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos: kend masked the same way when a remainder loop exists, else kend.
5614 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5617 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5621 const SIMDType a1( A.load(i ,k) );
5622 const SIMDType a2( A.load(i+1UL,k) );
5623 const SIMDType a3( A.load(i+2UL,k) );
5624 const SIMDType a4( A.load(i+3UL,k) );
5625 const SIMDType b1( B.load(k,j ) );
5626 const SIMDType b2( B.load(k,j+1UL) );
// Reduce each accumulator and apply the scalar factor.
5637 C(i ,j ) =
sum( xmm1 ) * scalar;
5638 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5639 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
5640 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5641 C(i+2UL,j ) =
sum( xmm5 ) * scalar;
5642 C(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5643 C(i+3UL,j ) =
sum( xmm7 ) * scalar;
5644 C(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar tail over the remaining k, each product scaled individually.
5646 for( ; remainder && k<kend; ++k ) {
5647 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5648 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5649 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5650 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5651 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5652 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5653 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5654 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 tile (single remaining column of the four-row group).
5660 const size_t kbegin( ( IsUpper_v<MT4> )
5661 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5662 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5663 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
5665 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5668 SIMDType xmm1, xmm2, xmm3, xmm4;
5672 const SIMDType b1( B.load(k,j) );
5673 xmm1 += A.load(i ,k) * b1;
5674 xmm2 += A.load(i+1UL,k) * b1;
5675 xmm3 += A.load(i+2UL,k) * b1;
5676 xmm4 += A.load(i+3UL,k) * b1;
5679 C(i ,j) =
sum( xmm1 ) * scalar;
5680 C(i+1UL,j) =
sum( xmm2 ) * scalar;
5681 C(i+2UL,j) =
sum( xmm3 ) * scalar;
5682 C(i+3UL,j) =
sum( xmm4 ) * scalar;
5684 for( ; remainder && k<kend; ++k ) {
5685 C(i ,j) += A(i ,k) * B(k,j) * scalar;
5686 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5687 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5688 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
5697 reset( C(i+1UL,j) );
5698 reset( C(i+2UL,j) );
5699 reset( C(i+3UL,j) );
// Two rows per iteration for what the four-row loop left over.
5704 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
5706 const size_t jend( LOW ? i+2UL : N );
5711 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
5712 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
5718 reset( C(i+1UL,j) );
// 2x2 tile.
5722 for( ; (j+2UL) <= jend; j+=2UL )
5724 const size_t kbegin( ( IsUpper_v<MT4> )
5725 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5726 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5727 const size_t kend( ( IsLower_v<MT4> )
5728 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5729 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5731 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5734 SIMDType xmm1, xmm2, xmm3, xmm4;
5738 const SIMDType a1( A.load(i ,k) );
5739 const SIMDType a2( A.load(i+1UL,k) );
5740 const SIMDType b1( B.load(k,j ) );
5741 const SIMDType b2( B.load(k,j+1UL) );
5748 C(i ,j ) =
sum( xmm1 ) * scalar;
5749 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5750 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
5751 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5753 for( ; remainder && k<kend; ++k ) {
5754 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5755 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5756 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5757 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 tile.
5763 const size_t kbegin( ( IsUpper_v<MT4> )
5764 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5765 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5766 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5768 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5771 SIMDType xmm1, xmm2;
5775 const SIMDType b1( B.load(k,j) );
5776 xmm1 += A.load(i ,k) * b1;
5777 xmm2 += A.load(i+1UL,k) * b1;
5780 C(i ,j) =
sum( xmm1 ) * scalar;
5781 C(i+1UL,j) =
sum( xmm2 ) * scalar;
5783 for( ; remainder && k<kend; ++k ) {
5784 C(i ,j) += A(i ,k) * B(k,j) * scalar;
5785 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5794 reset( C(i+1UL,j) );
// Single remaining row.
5801 const size_t jend( LOW ? i+1UL : N );
5806 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// 1x2 tile.
5815 for( ; (j+2UL) <= jend; j+=2UL )
5817 const size_t kbegin( ( IsUpper_v<MT4> )
5818 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5819 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5820 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5822 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5825 SIMDType xmm1, xmm2;
5829 const SIMDType a1( A.load(i,k) );
5830 xmm1 += a1 * B.load(k,j );
5831 xmm2 += a1 * B.load(k,j+1UL);
5834 C(i,j ) =
sum( xmm1 ) * scalar;
5835 C(i,j+1UL) =
sum( xmm2 ) * scalar;
5837 for( ; remainder && k<kend; ++k ) {
5838 C(i,j ) += A(i,k) * B(k,j ) * scalar;
5839 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 tile: the summation runs up to K here.
5845 const size_t kbegin( ( IsUpper_v<MT4> )
5846 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5847 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5849 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
5856 xmm1 += A.load(i,k) * B.load(k,j);
5859 C(i,j) =
sum( xmm1 ) * scalar;
5861 for( ; remainder && k<K; ++k ) {
5862 C(i,j) += A(i,k) * B(k,j) * scalar;
// Large-matrix scaled assignment kernel, fallback overload: disabled whenever
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true; the visible body
// forwards all arguments to selectDefaultAssignKernel().
5891 template<
typename MT3
5895 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5896 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5898 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix scaled assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. It calls one of the
// helpers smmm, hmmm, lmmm, ummm or mmm with C, A, B and `scalar`; the last
// three additionally receive ST2(0).
// NOTE(review): the conditions selecting among the five calls are not visible
// in this excerpt (presumably SYM, HERM, LOW, UPP and the general case, in
// that order); the meaning of the ST2(0) argument should be confirmed against
// the helper signatures.
5917 template<
typename MT3
5921 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5922 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5925 smmm( C, A, B, scalar );
5927 hmmm( C, A, B, scalar );
5929 lmmm( C, A, B, scalar, ST2(0) );
5931 ummm( C, A, B, scalar, ST2(0) );
5933 mmm( C, A, B, scalar, ST2(0) );
// BLAS scaled assignment kernel, fallback overload: disabled whenever
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true; the visible body forwards all
// arguments to selectLargeAssignKernel().
5951 template<
typename MT3
5955 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5956 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5958 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-based scaled assignment kernel. It is compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero
// and is enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. The scalar is
// converted to the element type ET of the target before being passed on.
5963 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5977 template<
typename MT3
5981 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5982 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5984 using ET = ElementType_t<MT3>;
// Triangular A: trmm is applied to C in place with A on the left side.
// NOTE(review): the statement that first fills C is not visible in this
// excerpt; presumably C is assigned B beforehand; confirm.
5986 if( IsTriangular_v<MT4> ) {
5988 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular B: trmm is applied to C in place with B on the right side
// (same caveat regarding the preceding fill of C).
5990 else if( IsTriangular_v<MT5> ) {
5992 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm with the factors ET(scalar) and ET(0).
// NOTE(review): presumably alpha=scalar and beta=0, i.e. C = scalar*A*B;
// confirm against the gemm wrapper's parameter order.
5995 gemm( C, A, B,
ET(scalar),
ET(0) );
6013 template<
typename MT
6019 using TmpType = If_t< SO, OppositeType, ResultType >;
6031 const ForwardFunctor fwd;
6033 const TmpType tmp(
serial( rhs ) );
6034 assign( ~lhs, fwd( tmp ) );
// Addition assignment of a scaled matrix product to a dense matrix. The
// operands of the wrapped multiplication are fetched from rhs.matrix_, a
// check for an empty target or an empty inner dimension follows, and the
// work is dispatched to selectAddAssignKernel() with the unwrapped target
// (~lhs), A, B and rhs.scalar_.
// NOTE(review): the body of the emptiness check and the definitions of A and
// B are not visible in this excerpt.
6050 template<
typename MT
6052 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6059 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6060 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6062 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6076 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the scaled addition assignment. The small kernel is
// chosen when either operand is diagonal or when C.rows() * C.columns() is
// below DMATTDMATMULT_THRESHOLD; the other visible call goes to the BLAS
// kernel selector.
// NOTE(review): the `else` separating the two calls is not visible in this
// excerpt.
6091 template<
typename MT3
6095 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6097 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
6098 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6099 selectSmallAddAssignKernel( C, A, B, scalar );
6101 selectBlasAddAssignKernel( C, A, B, scalar );
// Default scaled addition assignment kernel for two non-diagonal operands.
// The visible statement adds a local object `tmp` to C via addAssign().
// NOTE(review): the construction of `tmp` is not visible in this excerpt;
// presumably it holds the evaluated scaled product; confirm.
6119 template<
typename MT3
6123 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6124 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6127 addAssign( C, tmp );
// Default scaled addition assignment kernel for a row-major target, a
// non-diagonal A and a diagonal B: C(i,j) += A(i,j) * B(j,j) * scalar within
// the column range given by the triangular structure of A. The column loop is
// unrolled by two; jpos marks the end of the unrolled part.
// NOTE(review): the else branches of jbegin/jend and the condition guarding
// the final single-element update are not visible in this excerpt.
6145 template<
typename MT3
6149 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6150 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6152 const size_t M( A.rows() );
6153 const size_t N( B.columns() );
6155 for(
size_t i=0UL; i<M; ++i )
6157 const size_t jbegin( ( IsUpper_v<MT4> )
6158 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6160 const size_t jend( ( IsLower_v<MT4> )
6161 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, i.e. the largest even count.
6165 const size_t jnum( jend - jbegin );
6166 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6168 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6169 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6170 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Leftover element at jpos.
6173 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default scaled addition assignment kernel for a column-major target, a
// non-diagonal A and a diagonal B. The index space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE; inside a tile C(i,j) += A(i,j) * B(j,j) * scalar
// for the rows in [ibegin,ipos), a range derived from the triangular
// structure of A and clamped to the tile.
// NOTE(review): the else branches of ibegin/ipos are not visible in this
// excerpt.
6193 template<
typename MT3
6197 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6198 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6200 constexpr
size_t block( BLOCK_SIZE );
6202 const size_t M( A.rows() );
6203 const size_t N( B.columns() );
6205 for(
size_t jj=0UL; jj<N; jj+=block ) {
6206 const size_t jend(
min( N, jj+block ) );
6207 for(
size_t ii=0UL; ii<M; ii+=block ) {
6208 const size_t iend(
min( M, ii+block ) );
6209 for(
size_t j=jj; j<jend; ++j )
6211 const size_t ibegin( ( IsLower_v<MT4> )
6212 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
6214 const size_t ipos( ( IsUpper_v<MT4> )
6215 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
6218 for(
size_t i=ibegin; i<ipos; ++i ) {
6219 C(i,j) += A(i,j) * B(j,j) * scalar;
// Default scaled addition assignment kernel for a row-major target, a
// diagonal A and a non-diagonal B. The index space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE; inside a tile C(i,j) += A(i,i) * B(i,j) * scalar
// for the columns in [jbegin,jpos), a range derived from the triangular
// structure of B and clamped to the tile.
// NOTE(review): the else branches of jbegin/jpos are not visible in this
// excerpt.
6241 template<
typename MT3
6245 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6246 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6248 constexpr
size_t block( BLOCK_SIZE );
6250 const size_t M( A.rows() );
6251 const size_t N( B.columns() );
6253 for(
size_t ii=0UL; ii<M; ii+=block ) {
6254 const size_t iend(
min( M, ii+block ) );
6255 for(
size_t jj=0UL; jj<N; jj+=block ) {
6256 const size_t jend(
min( N, jj+block ) );
6257 for(
size_t i=ii; i<iend; ++i )
6259 const size_t jbegin( ( IsUpper_v<MT5> )
6260 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
6262 const size_t jpos( ( IsLower_v<MT5> )
6263 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
6266 for(
size_t j=jbegin; j<jpos; ++j ) {
6267 C(i,j) += A(i,i) * B(i,j) * scalar;
// Default scaled addition assignment kernel for a column-major target, a
// diagonal A and a non-diagonal B: C(i,j) += A(i,i) * B(i,j) * scalar within
// the row range given by the triangular structure of B. The row loop is
// unrolled by two; ipos marks the end of the unrolled part.
// NOTE(review): the else branches of ibegin/iend and the condition guarding
// the final single-element update are not visible in this excerpt.
6289 template<
typename MT3
6293 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6294 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6296 const size_t M( A.rows() );
6297 const size_t N( B.columns() );
6299 for(
size_t j=0UL; j<N; ++j )
6301 const size_t ibegin( ( IsLower_v<MT5> )
6302 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6304 const size_t iend( ( IsUpper_v<MT5> )
6305 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, i.e. the largest even count.
6309 const size_t inum( iend - ibegin );
6310 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6312 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6313 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6314 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Leftover element at ipos.
6317 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when both operands MT4 and MT5 are diagonal.
// Only the diagonal of the target is touched: C(i,i) += A(i,i) * B(i,i) * scalar.
6337 template<
typename MT3
6341 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6342 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// A.rows() is used as the number of diagonal elements to process.
6344 for(
size_t i=0UL; i<A.rows(); ++i ) {
6345 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition-assignment kernel overload that is enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It simply forwards all arguments to selectDefaultAddAssignKernel().
6364 template<
typename MT3
6368 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6369 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6371 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix addition-assignment kernel overload that is enabled (see the
// EnableIf_t condition below) when the target MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// The visible code accumulates products of A.load(i,k) and B.load(k,j) along k in SIMD
// registers, reduces them with sum(), scales by 'scalar' and adds the result to C.
// C is processed in register blocks of 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 elements.
// NOTE(review): several lines (loop headers, guards, braces, some accumulation
// statements) are not visible in this excerpt.
6390 template<
typename MT3
6394 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6395 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// If either operand is not padded, the SIMD loops stop at kpos (kend with the low bits
// cleared by '& size_t(-SIMDSIZE)') and the scalar 'remainder' loops finish up to kend.
6397 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6399 const size_t M( A.rows() );
6400 const size_t N( B.columns() );
6401 const size_t K( A.columns() );
// Main loop: two rows of C per iteration.
6407 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the two rows: ends at i+2 when LOW is set, starts at i when UPP is set.
6409 const size_t jend( LOW ? i+2UL : N );
6410 size_t j( UPP ? i : 0UL );
// 2x4 block of C; this loop is skipped entirely when both LOW and UPP are set.
6412 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
// Start of the k range: i for an upper MT4, j for a lower MT5 (the maximum if both
// apply), 0 otherwise; the non-zero values are masked with size_t(-SIMDSIZE).
6414 const size_t kbegin( ( IsUpper_v<MT4> )
6415 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6416 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// End of the k range: i+2 for a lower MT4, j+4 for an upper MT5 (the minimum if both
// apply), K otherwise.
6417 const size_t kend( ( IsLower_v<MT4> )
6418 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
6419 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
6421 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight SIMD accumulators; each one is reduced into exactly one element of the 2x4 block below.
6424 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6428 const SIMDType a1( A.load(i ,k) );
6429 const SIMDType a2( A.load(i+1UL,k) );
6430 const SIMDType b1( B.load(k,j ) );
6431 const SIMDType b2( B.load(k,j+1UL) );
6432 const SIMDType b3( B.load(k,j+2UL) );
6433 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of the accumulators, scaled and added to the target block.
6444 C(i ,j ) +=
sum( xmm1 ) * scalar;
6445 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6446 C(i ,j+2UL) +=
sum( xmm3 ) * scalar;
6447 C(i ,j+3UL) +=
sum( xmm4 ) * scalar;
6448 C(i+1UL,j ) +=
sum( xmm5 ) * scalar;
6449 C(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
6450 C(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
6451 C(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// Scalar tail for the k values in [kpos,kend); only active when 'remainder' is true.
6453 for( ; remainder && k<kend; ++k ) {
6454 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6455 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6456 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6457 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6458 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6459 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6460 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6461 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 block of C for the columns left over by the 2x4 loop.
6465 for( ; (j+2UL) <= jend; j+=2UL )
6467 const size_t kbegin( ( IsUpper_v<MT4> )
6468 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6469 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6470 const size_t kend( ( IsLower_v<MT4> )
6471 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6472 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6474 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6477 SIMDType xmm1, xmm2, xmm3, xmm4;
6481 const SIMDType a1( A.load(i ,k) );
6482 const SIMDType a2( A.load(i+1UL,k) );
6483 const SIMDType b1( B.load(k,j ) );
6484 const SIMDType b2( B.load(k,j+1UL) );
6491 C(i ,j ) +=
sum( xmm1 ) * scalar;
6492 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6493 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6494 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6496 for( ; remainder && k<kend; ++k ) {
6497 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6498 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6499 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6500 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 block of C: a single remaining column j for the two rows i and i+1.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6506 const size_t kbegin( ( IsUpper_v<MT4> )
6507 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6508 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6509 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6511 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6514 SIMDType xmm1, xmm2;
6518 const SIMDType b1( B.load(k,j) );
6519 xmm1 += A.load(i ,k) * b1;
6520 xmm2 += A.load(i+1UL,k) * b1;
6523 C(i ,j) +=
sum( xmm1 ) * scalar;
6524 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
6526 for( ; remainder && k<kend; ++k ) {
6527 C(i ,j) += A(i ,k) * B(k,j) * scalar;
6528 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Single-row section: only row i of C is updated from here on (jend is i+1 when LOW is set).
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6535 const size_t jend( LOW ? i+1UL : N );
6536 size_t j( UPP ? i : 0UL );
// 1x4 block of C; skipped entirely when both LOW and UPP are set.
6538 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
6540 const size_t kbegin( ( IsUpper_v<MT4> )
6541 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6542 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6543 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
6545 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6548 SIMDType xmm1, xmm2, xmm3, xmm4;
6552 const SIMDType a1( A.load(i,k) );
6553 xmm1 += a1 * B.load(k,j );
6554 xmm2 += a1 * B.load(k,j+1UL);
6555 xmm3 += a1 * B.load(k,j+2UL);
6556 xmm4 += a1 * B.load(k,j+3UL);
6559 C(i,j ) +=
sum( xmm1 ) * scalar;
6560 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
6561 C(i,j+2UL) +=
sum( xmm3 ) * scalar;
6562 C(i,j+3UL) +=
sum( xmm4 ) * scalar;
6564 for( ; remainder && k<kend; ++k ) {
6565 C(i,j ) += A(i,k) * B(k,j ) * scalar;
6566 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6567 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6568 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 block of C for the columns left over by the 1x4 loop.
6572 for( ; (j+2UL) <= jend; j+=2UL )
6574 const size_t kbegin( ( IsUpper_v<MT4> )
6575 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6576 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6577 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6579 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6582 SIMDType xmm1, xmm2;
6586 const SIMDType a1( A.load(i,k) );
6587 xmm1 += a1 * B.load(k,j );
6588 xmm2 += a1 * B.load(k,j+1UL);
6591 C(i,j ) +=
sum( xmm1 ) * scalar;
6592 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
6594 for( ; remainder && k<kend; ++k ) {
6595 C(i,j ) += A(i,k) * B(k,j ) * scalar;
6596 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 block of C: the last single element; here the k range always ends at K.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6602 const size_t kbegin( ( IsUpper_v<MT4> )
6603 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6604 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6606 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
6613 xmm1 += A.load(i,k) * B.load(k,j);
6616 C(i,j) +=
sum( xmm1 ) * scalar;
6618 for( ; remainder && k<K; ++k ) {
6619 C(i,j) += A(i,k) * B(k,j) * scalar;
// Vectorized small-matrix addition-assignment kernel overload that is enabled (see the
// EnableIf_t condition below) when the target MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// The visible code accumulates products of A.load(i,k) and B.load(k,j) along k in SIMD
// registers, reduces them with sum(), scales by 'scalar' and adds the result to C.
// C is processed in register blocks of 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements.
// NOTE(review): several lines (loop headers, guards, braces, some accumulation
// statements) are not visible in this excerpt.
6641 template<
typename MT3
6645 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6646 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// If either operand is not padded, the SIMD loops stop at kpos (kend with the low bits
// cleared by '& size_t(-SIMDSIZE)') and the scalar 'remainder' loops finish up to kend.
6648 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6650 const size_t M( A.rows() );
6651 const size_t N( B.columns() );
6652 const size_t K( A.columns() );
// Four rows of C per iteration; only used when neither LOW nor UPP is set.
6658 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 block of C.
6662 for( ; (j+2UL) <= N; j+=2UL )
// Start of the k range: i for an upper MT4, j for a lower MT5 (the maximum if both
// apply), 0 otherwise; the non-zero values are masked with size_t(-SIMDSIZE).
6664 const size_t kbegin( ( IsUpper_v<MT4> )
6665 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6666 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// End of the k range: i+4 for a lower MT4, j+2 for an upper MT5 (the minimum if both
// apply), K otherwise.
6667 const size_t kend( ( IsLower_v<MT4> )
6668 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
6669 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6671 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight SIMD accumulators; each one is reduced into exactly one element of the 4x2 block below.
6674 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6678 const SIMDType a1( A.load(i ,k) );
6679 const SIMDType a2( A.load(i+1UL,k) );
6680 const SIMDType a3( A.load(i+2UL,k) );
6681 const SIMDType a4( A.load(i+3UL,k) );
6682 const SIMDType b1( B.load(k,j ) );
6683 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of the accumulators, scaled and added to the target block.
6694 C(i ,j ) +=
sum( xmm1 ) * scalar;
6695 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6696 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6697 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6698 C(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6699 C(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6700 C(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6701 C(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// Scalar tail for the k values in [kpos,kend); only active when 'remainder' is true.
6703 for( ; remainder && k<kend; ++k ) {
6704 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6705 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6706 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6707 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6708 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6709 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6710 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6711 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 block of C: a single remaining column j for the four rows i..i+3.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6717 const size_t kbegin( ( IsUpper_v<MT4> )
6718 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6719 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6720 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
6722 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6725 SIMDType xmm1, xmm2, xmm3, xmm4;
6729 const SIMDType b1( B.load(k,j) );
6730 xmm1 += A.load(i ,k) * b1;
6731 xmm2 += A.load(i+1UL,k) * b1;
6732 xmm3 += A.load(i+2UL,k) * b1;
6733 xmm4 += A.load(i+3UL,k) * b1;
6736 C(i ,j) +=
sum( xmm1 ) * scalar;
6737 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
6738 C(i+2UL,j) +=
sum( xmm3 ) * scalar;
6739 C(i+3UL,j) +=
sum( xmm4 ) * scalar;
6741 for( ; remainder && k<kend; ++k ) {
6742 C(i ,j) += A(i ,k) * B(k,j) * scalar;
6743 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6744 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6745 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// Two rows of C per iteration (all rows when LOW or UPP is set, since the 4-row loop is skipped then).
6750 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the two rows: ends at i+2 when LOW is set, starts at i when UPP is set.
6752 const size_t jend( LOW ? i+2UL : N );
6753 size_t j( UPP ? i : 0UL );
// 2x2 block of C.
6755 for( ; (j+2UL) <= jend; j+=2UL )
6757 const size_t kbegin( ( IsUpper_v<MT4> )
6758 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6759 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6760 const size_t kend( ( IsLower_v<MT4> )
6761 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6762 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6764 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6767 SIMDType xmm1, xmm2, xmm3, xmm4;
6771 const SIMDType a1( A.load(i ,k) );
6772 const SIMDType a2( A.load(i+1UL,k) );
6773 const SIMDType b1( B.load(k,j ) );
6774 const SIMDType b2( B.load(k,j+1UL) );
6781 C(i ,j ) +=
sum( xmm1 ) * scalar;
6782 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6783 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6784 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6786 for( ; remainder && k<kend; ++k ) {
6787 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6788 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6789 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6790 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 block of C: a single remaining column j for the two rows i and i+1.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6796 const size_t kbegin( ( IsUpper_v<MT4> )
6797 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6798 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6799 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6801 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6804 SIMDType xmm1, xmm2;
6808 const SIMDType b1( B.load(k,j) );
6809 xmm1 += A.load(i ,k) * b1;
6810 xmm2 += A.load(i+1UL,k) * b1;
6813 C(i ,j) +=
sum( xmm1 ) * scalar;
6814 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
6816 for( ; remainder && k<kend; ++k ) {
6817 C(i ,j) += A(i ,k) * B(k,j) * scalar;
6818 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Single-row section: only row i of C is updated from here on (jend is i+1 when LOW is set).
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6825 const size_t jend( LOW ? i+1UL : N );
6826 size_t j( UPP ? i : 0UL );
// 1x2 block of C.
6828 for( ; (j+2UL) <= jend; j+=2UL )
6830 const size_t kbegin( ( IsUpper_v<MT4> )
6831 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6832 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6833 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6835 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6838 SIMDType xmm1, xmm2;
6842 const SIMDType a1( A.load(i,k) );
6843 xmm1 += a1 * B.load(k,j );
6844 xmm2 += a1 * B.load(k,j+1UL);
6847 C(i,j ) +=
sum( xmm1 ) * scalar;
6848 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
6850 for( ; remainder && k<kend; ++k ) {
6851 C(i,j ) += A(i,k) * B(k,j ) * scalar;
6852 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// 1x1 block of C: the last single element; here the k range always ends at K.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
6858 const size_t kbegin( ( IsUpper_v<MT4> )
6859 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6860 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6862 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
6869 xmm1 += A.load(i,k) * B.load(k,j);
6872 C(i,j) +=
sum( xmm1 ) * scalar;
6874 for( ; remainder && k<K; ++k ) {
6875 C(i,j) += A(i,k) * B(k,j) * scalar;
// Large-matrix addition-assignment kernel overload that is enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It simply forwards all arguments to selectDefaultAddAssignKernel().
6896 template<
typename MT3
6900 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6901 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6903 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// It delegates to one of lmmm(), ummm() or mmm(), passing 'scalar' as the fourth and
// ST2(1) as the fifth argument.
// NOTE(review): the conditions that select between the three calls are not visible in
// this excerpt; presumably they depend on LOW/UPP - confirm against the full source.
6922 template<
typename MT3
6926 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6927 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6930 lmmm( C, A, B, scalar, ST2(1) );
6932 ummm( C, A, B, scalar, ST2(1) );
6934 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition-assignment kernel overload that is enabled (via DisableIf_t) when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false.
// It simply forwards all arguments to selectLargeAddAssignKernel().
6952 template<
typename MT3
6956 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6957 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6959 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS-based addition-assignment kernel overload; it is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled and is
// selected (see the EnableIf_t condition below) when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
// NOTE(review): parts of the template header and body are not visible in this excerpt.
6964 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6978 template<
typename MT3
6982 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6983 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// The scalar is converted to the element type of the target before it is passed on.
6985 using ET = ElementType_t<MT3>;
// Triangular left operand: copy B into a temporary, apply trmm() with A from the left
// (presumably tmp = scalar * A * tmp - confirm against the trmm wrapper), then add to C.
6987 if( IsTriangular_v<MT4> ) {
6988 ResultType_t<MT3> tmp(
serial( B ) );
6989 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6990 addAssign( C, tmp );
// Triangular right operand: copy A into a temporary, apply trmm() with B from the right
// (presumably tmp = scalar * tmp * B - confirm against the trmm wrapper), then add to C.
6992 else if( IsTriangular_v<MT5> ) {
6993 ResultType_t<MT3> tmp(
serial( A ) );
6994 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6995 addAssign( C, tmp );
// General case: a single gemm() call with factors ET(scalar) and ET(1)
// (presumably C = scalar*A*B + 1*C - confirm against the gemm wrapper).
6998 gemm( C, A, B,
ET(scalar),
ET(1) );
// Subtraction assignment of a scaled matrix multiplication expression (rhs) to the dense
// matrix lhs. The visible code fetches both operands of the wrapped multiplication,
// checks for empty dimensions and dispatches to selectSubAssignKernel() with rhs.scalar_.
// NOTE(review): the body of the empty-dimension branch and the definitions of A and B
// are not visible in this excerpt.
7020 template<
typename MT
7022 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7029 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7030 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or columns, or the inner dimension is zero.
7032 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
7046 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the subtraction assignment C -= A * B * scalar.
// The small kernel is used when either operand is diagonal or when the number of
// elements of C (rows * columns) is below DMATTDMATMULT_THRESHOLD; the remaining call
// goes to the BLAS kernel.
// NOTE(review): the 'else' between the two calls is not visible in this excerpt.
7061 template<
typename MT3
7065 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7067 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
7068 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
7069 selectSmallSubAssignKernel( C, A, B, scalar );
7071 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when neither operand is diagonal.
// The only visible statement subtracts a temporary 'tmp' from C via subAssign().
// NOTE(review): the definition of 'tmp' is not visible in this excerpt.
7089 template<
typename MT3
7093 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7094 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7097 subAssign( C, tmp );
// Default subtraction-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when the target MT3 is row-major, the left operand MT4 is not
// diagonal and the right operand MT5 is diagonal.
// Every visited element is updated as C(i,j) -= A(i,j) * B(j,j) * scalar, with the
// column loop manually unrolled by two.
// NOTE(review): parts of the template header and body are not visible in this excerpt.
7115 template<
typename MT3
7119 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7120 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7122 const size_t M( A.rows() );
7123 const size_t N( B.columns() );
7125 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed for an upper MT4 (start at the diagonal, or one past
// it when strictly upper) and for a lower MT4 (end at the diagonal, excluding it when
// strictly lower). The branches for the general case are not visible in this excerpt.
7127 const size_t jbegin( ( IsUpper_v<MT4> )
7128 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7130 const size_t jend( ( IsLower_v<MT4> )
7131 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part of the range that can be processed two columns at a time:
// 'jnum & size_t(-2)' clears the lowest bit, i.e. rounds the column count down to an even number.
7135 const size_t jnum( jend - jbegin );
7136 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7138 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7139 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7140 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of column jpos, presumably the single leftover column when jnum is odd.
// NOTE(review): the guarding condition is not visible in this excerpt.
7143 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when the target MT3 is column-major, the left operand MT4 is not
// diagonal and the right operand MT5 is diagonal.
// Every visited element is updated as C(i,j) -= A(i,j) * B(j,j) * scalar.
// NOTE(review): parts of the template header and body are not visible in this excerpt.
7163 template<
typename MT3
7167 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7168 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Edge length of the tiles used for the blocked jj/ii traversal below.
7170 constexpr
size_t block( BLOCK_SIZE );
7172 const size_t M( A.rows() );
7173 const size_t N( B.columns() );
// Walk the result in tiles of at most block x block elements:
// column tiles (jj) outermost, row tiles (ii) inside, then the columns of the tile.
7175 for(
size_t jj=0UL; jj<N; jj+=block ) {
7176 const size_t jend(
min( N, jj+block ) );
7177 for(
size_t ii=0UL; ii<M; ii+=block ) {
7178 const size_t iend(
min( M, ii+block ) );
7179 for(
size_t j=jj; j<jend; ++j )
// First row to update in column j: for a lower MT4 the start is clamped to the
// diagonal (one past the diagonal when MT4 is strictly lower), but never before ii.
7181 const size_t ibegin( ( IsLower_v<MT4> )
7182 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
// One-past-last row to update in column j: for an upper MT4 the end is clamped to the
// diagonal (excluding the diagonal when MT4 is strictly upper), but never beyond iend.
7184 const size_t ipos( ( IsUpper_v<MT4> )
7185 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// Only the diagonal element B(j,j) of the right operand contributes to column j.
7188 for(
size_t i=ibegin; i<ipos; ++i ) {
7189 C(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when the target MT3 is row-major, the left operand MT4 is diagonal
// and the right operand MT5 is not diagonal.
// Every visited element is updated as C(i,j) -= A(i,i) * B(i,j) * scalar.
// NOTE(review): parts of the template header and body are not visible in this excerpt.
7212 template<
typename MT3
7216 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7217 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Edge length of the tiles used for the blocked ii/jj traversal below.
7219 constexpr
size_t block( BLOCK_SIZE );
7221 const size_t M( A.rows() );
7222 const size_t N( B.columns() );
// Walk the result in tiles of at most block x block elements:
// row tiles (ii) outermost, column tiles (jj) inside, then the rows of the tile.
7224 for(
size_t ii=0UL; ii<M; ii+=block ) {
7225 const size_t iend(
min( M, ii+block ) );
7226 for(
size_t jj=0UL; jj<N; jj+=block ) {
7227 const size_t jend(
min( N, jj+block ) );
7228 for(
size_t i=ii; i<iend; ++i )
// First column to update in row i: for an upper MT5 the start is clamped to the
// diagonal (one past the diagonal when MT5 is strictly upper), but never before jj.
7230 const size_t jbegin( ( IsUpper_v<MT5> )
7231 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
// One-past-last column to update in row i: for a lower MT5 the end is clamped to the
// diagonal (excluding the diagonal when MT5 is strictly lower), but never beyond jend.
7233 const size_t jpos( ( IsLower_v<MT5> )
7234 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// Only the diagonal element A(i,i) of the left operand contributes to row i.
7237 for(
size_t j=jbegin; j<jpos; ++j ) {
7238 C(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when the target MT3 is column-major, the left operand MT4 is
// diagonal and the right operand MT5 is not diagonal.
// Every visited element is updated as C(i,j) -= A(i,i) * B(i,j) * scalar, with the
// row loop manually unrolled by two.
// NOTE(review): parts of the template header and body are not visible in this excerpt.
7261 template<
typename MT3
7265 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7266 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7268 const size_t M( A.rows() );
7269 const size_t N( B.columns() );
7271 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed for a lower MT5 (start at the diagonal, or one past
// it when strictly lower) and for an upper MT5 (end at the diagonal, excluding it when
// strictly upper). The branches for the general case are not visible in this excerpt.
7273 const size_t ibegin( ( IsLower_v<MT5> )
7274 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7276 const size_t iend( ( IsUpper_v<MT5> )
7277 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part of the range that can be processed two rows at a time:
// 'inum & size_t(-2)' clears the lowest bit, i.e. rounds the row count down to an even number.
7281 const size_t inum( iend - ibegin );
7282 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7284 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7285 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7286 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of row ipos, presumably the single leftover row when inum is odd.
// NOTE(review): the guarding condition is not visible in this excerpt.
7289 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction-assignment kernel overload that is enabled (see the EnableIf_t
// condition below) when both operands MT4 and MT5 are diagonal.
// Only the diagonal of the target is touched: C(i,i) -= A(i,i) * B(i,i) * scalar.
7309 template<
typename MT3
7313 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7314 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// A.rows() is used as the number of diagonal elements to process.
7316 for(
size_t i=0UL; i<A.rows(); ++i ) {
7317 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel overload that is enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It simply forwards all arguments to selectDefaultSubAssignKernel().
7336 template<
typename MT3
7340 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7341 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7343 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix subtraction-assignment kernel overload that is enabled (see
// the EnableIf_t condition below) when the target MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// The visible code accumulates products of A.load(i,k) and B.load(k,j) along k in SIMD
// registers, reduces them with sum(), scales by 'scalar' and subtracts the result from C.
// C is processed in register blocks of 2x4, 2x2, 2x1, 1x4, 1x2 and 1x1 elements.
// NOTE(review): several lines (loop headers, guards, braces, some accumulation
// statements) are not visible in this excerpt.
7362 template<
typename MT3
7366 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7367 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// If either operand is not padded, the SIMD loops stop at kpos (kend with the low bits
// cleared by '& size_t(-SIMDSIZE)') and the scalar 'remainder' loops finish up to kend.
7369 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7371 const size_t M( A.rows() );
7372 const size_t N( B.columns() );
7373 const size_t K( A.columns() );
// Main loop: two rows of C per iteration.
7379 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the two rows: ends at i+2 when LOW is set, starts at i when UPP is set.
7381 const size_t jend( LOW ? i+2UL : N );
7382 size_t j( UPP ? i : 0UL );
// 2x4 block of C; this loop is skipped entirely when both LOW and UPP are set.
7384 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
// Start of the k range: i for an upper MT4, j for a lower MT5 (the maximum if both
// apply), 0 otherwise; the non-zero values are masked with size_t(-SIMDSIZE).
7386 const size_t kbegin( ( IsUpper_v<MT4> )
7387 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7388 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// End of the k range: i+2 for a lower MT4, j+4 for an upper MT5 (the minimum if both
// apply), K otherwise.
7389 const size_t kend( ( IsLower_v<MT4> )
7390 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
7391 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
7393 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight SIMD accumulators; each one is reduced into exactly one element of the 2x4 block below.
7396 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7400 const SIMDType a1( A.load(i ,k) );
7401 const SIMDType a2( A.load(i+1UL,k) );
7402 const SIMDType b1( B.load(k,j ) );
7403 const SIMDType b2( B.load(k,j+1UL) );
7404 const SIMDType b3( B.load(k,j+2UL) );
7405 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of the accumulators, scaled and subtracted from the target block.
7416 C(i ,j ) -=
sum( xmm1 ) * scalar;
7417 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7418 C(i ,j+2UL) -=
sum( xmm3 ) * scalar;
7419 C(i ,j+3UL) -=
sum( xmm4 ) * scalar;
7420 C(i+1UL,j ) -=
sum( xmm5 ) * scalar;
7421 C(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
7422 C(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
7423 C(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// Scalar tail for the k values in [kpos,kend); only active when 'remainder' is true.
7425 for( ; remainder && k<kend; ++k ) {
7426 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7427 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7428 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7429 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7430 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7431 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7432 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7433 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 block of C for the columns left over by the 2x4 loop.
7437 for( ; (j+2UL) <= jend; j+=2UL )
7439 const size_t kbegin( ( IsUpper_v<MT4> )
7440 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7441 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7442 const size_t kend( ( IsLower_v<MT4> )
7443 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7444 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7446 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7449 SIMDType xmm1, xmm2, xmm3, xmm4;
7453 const SIMDType a1( A.load(i ,k) );
7454 const SIMDType a2( A.load(i+1UL,k) );
7455 const SIMDType b1( B.load(k,j ) );
7456 const SIMDType b2( B.load(k,j+1UL) );
7463 C(i ,j ) -=
sum( xmm1 ) * scalar;
7464 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7465 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7466 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7468 for( ; remainder && k<kend; ++k ) {
7469 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7470 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7471 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7472 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 block of C: a single remaining column j for the two rows i and i+1.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7478 const size_t kbegin( ( IsUpper_v<MT4> )
7479 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7480 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7481 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7483 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7486 SIMDType xmm1, xmm2;
7490 const SIMDType b1( B.load(k,j) );
7491 xmm1 += A.load(i ,k) * b1;
7492 xmm2 += A.load(i+1UL,k) * b1;
7495 C(i ,j) -=
sum( xmm1 ) * scalar;
7496 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
7498 for( ; remainder && k<kend; ++k ) {
7499 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7500 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// Single-row section: only row i of C is updated from here on (jend is i+1 when LOW is set).
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7507 const size_t jend( LOW ? i+1UL : N );
7508 size_t j( UPP ? i : 0UL );
// 1x4 block of C; skipped entirely when both LOW and UPP are set.
7510 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
7512 const size_t kbegin( ( IsUpper_v<MT4> )
7513 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7514 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7515 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
7517 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7520 SIMDType xmm1, xmm2, xmm3, xmm4;
7524 const SIMDType a1( A.load(i,k) );
7525 xmm1 += a1 * B.load(k,j );
7526 xmm2 += a1 * B.load(k,j+1UL);
7527 xmm3 += a1 * B.load(k,j+2UL);
7528 xmm4 += a1 * B.load(k,j+3UL);
7531 C(i,j ) -=
sum( xmm1 ) * scalar;
7532 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
7533 C(i,j+2UL) -=
sum( xmm3 ) * scalar;
7534 C(i,j+3UL) -=
sum( xmm4 ) * scalar;
7536 for( ; remainder && k<kend; ++k ) {
7537 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7538 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7539 C(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7540 C(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
// 1x2 block of C for the columns left over by the 1x4 loop.
7544 for( ; (j+2UL) <= jend; j+=2UL )
7546 const size_t kbegin( ( IsUpper_v<MT4> )
7547 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7548 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7549 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7551 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7554 SIMDType xmm1, xmm2;
7558 const SIMDType a1( A.load(i,k) );
7559 xmm1 += a1 * B.load(k,j );
7560 xmm2 += a1 * B.load(k,j+1UL);
7563 C(i,j ) -=
sum( xmm1 ) * scalar;
7564 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
7566 for( ; remainder && k<kend; ++k ) {
7567 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7568 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// 1x1 block of C: the last single element; here the k range always ends at K.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7574 const size_t kbegin( ( IsUpper_v<MT4> )
7575 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7576 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7578 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
7585 xmm1 += A.load(i,k) * B.load(k,j);
7588 C(i,j) -=
sum( xmm1 ) * scalar;
7590 for( ; remainder && k<K; ++k ) {
7591 C(i,j) -= A(i,k) * B(k,j) * scalar;
// Vectorized small-matrix subtraction-assignment kernel overload that is enabled (see
// the EnableIf_t condition below) when the target MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// The visible code accumulates products of A.load(i,k) and B.load(k,j) along k in SIMD
// registers, reduces them with sum(), scales by 'scalar' and subtracts the result from C.
// C is processed in register blocks of 4x2, 4x1, 2x2, 2x1, 1x2 and 1x1 elements.
// NOTE(review): several lines (loop headers, guards, braces, some accumulation
// statements) are not visible in this excerpt.
7613 template<
typename MT3
7617 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7618 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// If either operand is not padded, the SIMD loops stop at kpos (kend with the low bits
// cleared by '& size_t(-SIMDSIZE)') and the scalar 'remainder' loops finish up to kend.
7620 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7622 const size_t M( A.rows() );
7623 const size_t N( B.columns() );
7624 const size_t K( A.columns() );
// Four rows of C per iteration; only used when neither LOW nor UPP is set.
7630 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 block of C.
7634 for( ; (j+2UL) <= N; j+=2UL )
// Start of the k range: i for an upper MT4, j for a lower MT5 (the maximum if both
// apply), 0 otherwise; the non-zero values are masked with size_t(-SIMDSIZE).
7636 const size_t kbegin( ( IsUpper_v<MT4> )
7637 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7638 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// End of the k range: i+4 for a lower MT4, j+2 for an upper MT5 (the minimum if both
// apply), K otherwise.
7639 const size_t kend( ( IsLower_v<MT4> )
7640 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
7641 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7643 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// Eight SIMD accumulators; each one is reduced into exactly one element of the 4x2 block below.
7646 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7651 const SIMDType a1( A.load(i ,k) );
7652 const SIMDType a2( A.load(i+1UL,k) );
7653 const SIMDType a3( A.load(i+2UL,k) );
7654 const SIMDType a4( A.load(i+3UL,k) );
7655 const SIMDType b1( B.load(k,j ) );
7656 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of the accumulators, scaled and subtracted from the target block.
7667 C(i ,j ) -=
sum( xmm1 ) * scalar;
7668 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7669 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7670 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7671 C(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7672 C(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7673 C(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7674 C(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// Scalar tail for the k values in [kpos,kend); only active when 'remainder' is true.
7676 for( ; remainder && k<kend; ++k ) {
7677 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7678 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7679 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7680 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7681 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7682 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7683 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7684 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 block of C: a single remaining column j for the four rows i..i+3.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7690 const size_t kbegin( ( IsUpper_v<MT4> )
7691 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7692 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7693 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
7695 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7698 SIMDType xmm1, xmm2, xmm3, xmm4;
7702 const SIMDType b1( B.load(k,j) );
7703 xmm1 += A.load(i ,k) * b1;
7704 xmm2 += A.load(i+1UL,k) * b1;
7705 xmm3 += A.load(i+2UL,k) * b1;
7706 xmm4 += A.load(i+3UL,k) * b1;
7709 C(i ,j) -=
sum( xmm1 ) * scalar;
7710 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
7711 C(i+2UL,j) -=
sum( xmm3 ) * scalar;
7712 C(i+3UL,j) -=
sum( xmm4 ) * scalar;
7714 for( ; remainder && k<kend; ++k ) {
7715 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7716 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7717 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7718 C(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
// Two rows of C per iteration (all rows when LOW or UPP is set, since the 4-row loop is skipped then).
7723 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the two rows: ends at i+2 when LOW is set, starts at i when UPP is set.
7725 const size_t jend( LOW ? i+2UL : N );
7726 size_t j( UPP ? i : 0UL );
// 2x2 block of C.
7728 for( ; (j+2UL) <= jend; j+=2UL )
7730 const size_t kbegin( ( IsUpper_v<MT4> )
7731 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7732 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7733 const size_t kend( ( IsLower_v<MT4> )
7734 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7735 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7737 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7740 SIMDType xmm1, xmm2, xmm3, xmm4;
7744 const SIMDType a1( A.load(i ,k) );
7745 const SIMDType a2( A.load(i+1UL,k) );
7746 const SIMDType b1( B.load(k,j ) );
7747 const SIMDType b2( B.load(k,j+1UL) );
7754 C(i ,j ) -=
sum( xmm1 ) * scalar;
7755 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7756 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7757 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7759 for( ; remainder && k<kend; ++k ) {
7760 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7761 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7762 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7763 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 block of C: a single remaining column j for the two rows i and i+1.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7769 const size_t kbegin( ( IsUpper_v<MT4> )
7770 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7771 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7772 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7774 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7777 SIMDType xmm1, xmm2;
7781 const SIMDType b1( B.load(k,j) );
7782 xmm1 += A.load(i ,k) * b1;
7783 xmm2 += A.load(i+1UL,k) * b1;
7786 C(i ,j) -=
sum( xmm1 ) * scalar;
7787 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
7789 for( ; remainder && k<kend; ++k ) {
7790 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7791 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// Single-row section: only row i of C is updated from here on (jend is i+1 when LOW is set).
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7798 const size_t jend( LOW ? i+1UL : N );
7799 size_t j( UPP ? i : 0UL );
// 1x2 block of C.
7801 for( ; (j+2UL) <= jend; j+=2UL )
7803 const size_t kbegin( ( IsUpper_v<MT4> )
7804 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7805 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7806 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7808 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7811 SIMDType xmm1, xmm2;
7815 const SIMDType a1( A.load(i,k) );
7816 xmm1 += a1 * B.load(k,j );
7817 xmm2 += a1 * B.load(k,j+1UL);
7820 C(i,j ) -=
sum( xmm1 ) * scalar;
7821 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
7823 for( ; remainder && k<kend; ++k ) {
7824 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7825 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// 1x1 block of C: the last single element; here the k range always ends at K.
// NOTE(review): the guard that selects this case is not visible in this excerpt.
7831 const size_t kbegin( ( IsUpper_v<MT4> )
7832 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7833 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7835 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
7842 xmm1 += A.load(i,k) * B.load(k,j);
7845 C(i,j) -=
sum( xmm1 ) * scalar;
7847 for( ; remainder && k<K; ++k ) {
7848 C(i,j) -= A(i,k) * B(k,j) * scalar;
// Large-matrix subtraction-assignment kernel overload that is enabled (via DisableIf_t)
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false.
// It simply forwards all arguments to selectDefaultSubAssignKernel().
7869 template<
typename MT3
7873 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7874 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7876 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel overload that is enabled (see the
// EnableIf_t condition below) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// It delegates to one of lmmm(), ummm() or mmm(); the subtraction is expressed by
// passing the negated scalar (-scalar) as the fourth and ST2(1) as the fifth argument.
// NOTE(review): the conditions that select between the three calls are not visible in
// this excerpt; presumably they depend on LOW/UPP - confirm against the full source.
7895 template<
typename MT3
7899 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7900 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7903 lmmm( C, A, B, -scalar, ST2(1) );
7905 ummm( C, A, B, -scalar, ST2(1) );
7907 mmm( C, A, B, -scalar, ST2(1) );
// BLAS subtraction-assignment kernel overload that is enabled (via DisableIf_t) when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false.
// It simply forwards all arguments to selectLargeSubAssignKernel().
7925 template<
typename MT3
7929 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7930 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7932 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-backed overload of selectBlasSubAssignKernel.
// It is compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION evaluate to true, and is enabled (EnableIf_t)
// when UseBlasKernel_v<MT3,MT4,MT5,ST2> holds.
// NOTE(review): parts of the template header, the closing braces and the keyword introducing
// the final branch are not visible in this excerpt.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7951 template<
typename MT3
7955 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7956 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// Element type of the target matrix; the scalar is converted to it for every BLAS call.
7958 using ET = ElementType_t<MT3>;
// Triangular left operand: copy the serially evaluated B into a temporary, call trmm with
// CblasLeft, A's lower/upper flag and the factor ET(scalar), then subtract the temporary
// from C.
7960 if( IsTriangular_v<MT4> ) {
7961 ResultType_t<MT3> tmp(
serial( B ) );
7962 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7963 subAssign( C, tmp );
// Triangular right operand: same scheme with A copied into the temporary and trmm called
// with CblasRight and B's lower/upper flag.
7965 else if( IsTriangular_v<MT5> ) {
7966 ResultType_t<MT3> tmp(
serial( A ) );
7967 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7968 subAssign( C, tmp );
// Remaining case: a single gemm call with the factors ET(-scalar) and ET(1)
// (presumably C = -scalar*A*B + C under the usual gemm alpha/beta convention -- confirm).
7971 gemm( C, A, B,
ET(-scalar),
ET(1) );
// Schur product assignment of a DMatScalarMultExpr to the dense matrix lhs.
// The visible statement forwards to schurAssign on the unwrapped target (~lhs) with a
// temporary named tmp.
// NOTE(review): the construction of tmp is elided in this excerpt; presumably it holds the
// evaluated right-hand side expression -- confirm.
7993 template<
typename MT
7995 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8007 schurAssign( ~lhs, tmp );
// NOTE(review): the function name and parameter list are elided in this excerpt; presumably
// this is the SMP assignment (smpAssign) of the scaled product to a dense matrix -- confirm.
// The overload participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
8038 template<
typename MT
8041 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped multiplication expression (rhs.matrix_).
8048 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8049 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Two separate early checks: an empty target (zero rows or zero columns) and a zero inner
// dimension (left.columns() == 0). The bodies of both branches are not visible here.
8051 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8054 else if( left.columns() == 0UL ) {
// NOTE(review): the function name and parameter list are elided in this excerpt; presumably
// this is an SMP assignment overload that goes through a temporary -- confirm.
// The overload participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
8088 template<
typename MT
8091 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Temporary type: OppositeType when SO is true, ResultType otherwise.
8095 using TmpType = If_t< SO, OppositeType, ResultType >;
8107 const ForwardFunctor fwd;
// Evaluate the complete right-hand side expression into the temporary.
8109 const TmpType tmp( rhs );
// NOTE(review): the function name and parameter list are elided in this excerpt; presumably
// this is the SMP addition assignment (smpAddAssign) of the scaled product -- confirm.
// The overload participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
8129 template<
typename MT
8132 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped multiplication expression (rhs.matrix_).
8139 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8140 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Single combined check: empty target (zero rows or columns) or zero inner dimension.
// The branch body is not visible in this excerpt.
8142 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): the function name and parameter list are elided in this excerpt; presumably
// this is the SMP subtraction assignment (smpSubAssign) of the scaled product -- confirm.
// The overload participates only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
8179 template<
typename MT
8182 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped multiplication expression (rhs.matrix_).
8189 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8190 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Single combined check: empty target (zero rows or columns) or zero inner dimension.
// The branch body is not visible in this excerpt.
8192 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8226 template<
typename MT
// NOTE(review): the function name and parameter list are elided in this excerpt; presumably
// this is the global multiplication operator that creates the expression object -- confirm.
// The visible statement builds a ReturnType from the unwrapped operands (~lhs, ~rhs) and
// returns it; decltype(auto) keeps the exact type of that return expression.
8309 template<
typename MT1
8311 inline decltype(
auto)
8321 return ReturnType( ~lhs, ~rhs );
// declsym overload for DMatTDMatMultExpr.
// Returns a new expression over the same left and right operands whose first flag (SF) is
// forced to true while HF, LF and UF are carried over unchanged.
// NOTE(review): any statements between the signature and the alias are elided in this excerpt.
8359 template<
typename MT1
8365 inline decltype(
auto)
declsym( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8373 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8374 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declherm overload for DMatTDMatMultExpr.
// Returns a new expression over the same left and right operands whose second flag (HF) is
// forced to true while SF, LF and UF are carried over unchanged.
// NOTE(review): any statements between the signature and the alias are elided in this excerpt.
8405 template<
typename MT1
8411 inline decltype(
auto)
declherm( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8419 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
8420 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decllow overload for DMatTDMatMultExpr.
// Returns a new expression over the same left and right operands whose third flag (LF) is
// forced to true while SF, HF and UF are carried over unchanged.
// NOTE(review): any statements between the signature and the alias are elided in this excerpt.
8451 template<
typename MT1
8457 inline decltype(
auto)
decllow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8465 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
8466 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declupp overload for DMatTDMatMultExpr.
// Returns a new expression over the same left and right operands whose fourth flag (UF) is
// forced to true while SF, HF and LF are carried over unchanged.
// NOTE(review): any statements between the signature and the alias are elided in this excerpt.
8497 template<
typename MT1
8503 inline decltype(
auto)
declupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8511 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
8512 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decldiag overload for DMatTDMatMultExpr.
// Returns a new expression over the same left and right operands with both the lower (LF)
// and the upper (UF) flag forced to true; SF and HF are carried over unchanged.
// NOTE(review): any statements between the signature and the alias are elided in this excerpt.
8543 template<
typename MT1
8549 inline decltype(
auto)
decldiag( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
8557 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
8558 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for dimension 0 of a DMatTDMatMultExpr: inherits the value of
// Size<MT1,0UL>, i.e. the dimension-0 size of the left-hand side operand type.
// NOTE(review): dimension 0 presumably denotes the number of rows -- confirm.
8574 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8575 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
8576 :
public Size<MT1,0UL>
// Size specialization for dimension 1 of a DMatTDMatMultExpr: inherits the value of
// Size<MT2,1UL>, i.e. the dimension-1 size of the right-hand side operand type.
// NOTE(review): dimension 1 presumably denotes the number of columns -- confirm.
8579 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8580 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
8581 :
public Size<MT2,1UL>
// IsAligned specialization for DMatTDMatMultExpr: the expression counts as aligned only
// when both operand types MT1 and MT2 are aligned (logical AND of the two traits).
8597 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8598 struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8599 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatTDMatMultExpr.h:287
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:171
Header file for basic type definitions.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:169
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:270
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:373
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:265
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:273
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:475
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:389
Header file for the IsComplexDouble type trait.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:309
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:443
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2147
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:168
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatTDMatMultExpr.h:300
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:279
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:431
Header file for the IsAligned type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:269
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:267
Header file for the exception macros of the math module.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:476
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:453
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
Header file for the MatScalarMultExpr base class.
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:263
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:164
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:419
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:266
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatTDMatMultExpr.h:294
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:409
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:282
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:268
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:324
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:170
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
Expression object for dense matrix-transpose dense matrix multiplications. The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:143
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:399
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
Header file for the DeclSym functor.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:276
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:463
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:159
Header file for the IsExpression type trait class.
Header file for the function trace functionality.