35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_ 138 template<
typename MT1
145 :
public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Compile-time switches of the multiplication expression.
// evaluateLeft / evaluateRight are true when the respective operand type is a
// computation (IsComputation_v) or requires evaluation (RequiresEvaluation_v).
160 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
165 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the SF/HF/LF/UF template arguments.
// SYM: SF is set and none of HF, LF, UF is set.
169 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set.
170 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is combined with SF or HF.
171 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is combined with SF or HF.
// Consequently LOW and UPP are both true when a triangular flag is combined
// with SF or HF; the kernels test ( LOW && UPP ) for exactly that case.
172 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
181 template<
typename T1,
typename T2,
typename T3 >
// Compile-time predicate over three matrix types (T1 is used as the target in
// selectBlasAssignKernel, T2/T3 as the operands). The visible conjuncts require:
//  - contiguous storage for all three types,
//  - mutable data access on T1 and const data access on T2 and T3,
//  - neither T2 nor T3 is diagonal,
//  - SIMD is enabled for all three types,
//  - all three element types are BLAS compatible.
// NOTE(review): the expression continues past the last visible line, so further
// conditions may apply.
191 template<
typename T1,
typename T2,
typename T3 >
192 static constexpr
bool UseBlasKernel_v =
195 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
196 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
197 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
198 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
199 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
200 IsBLASCompatible_v< ElementType_t<T1> > &&
201 IsBLASCompatible_v< ElementType_t<T2> > &&
202 IsBLASCompatible_v< ElementType_t<T3> > &&
// Compile-time predicate that selects the vectorized default kernels (it guards
// the selectSmall*/selectLarge* overloads through EnableIf_t/DisableIf_t).
// The visible conjuncts require useOptimizedKernels, non-diagonal T2 and T3,
// and SIMD being enabled for all three types.
// NOTE(review): the IsSIMDCombinable_v argument list and any further conditions
// are cut off in this excerpt.
213 template<
typename T1,
typename T2,
typename T3 >
214 static constexpr
bool UseVectorizedDefaultKernel_v =
215 ( useOptimizedKernels &&
216 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
217 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
218 IsSIMDCombinable_v< ElementType_t<T1>
289 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
290 MT1::simdEnabled && MT2::simdEnabled &&
291 HasSIMDAdd_v<ET1,ET2> &&
292 HasSIMDMult_v<ET1,ET2> );
329 if( IsDiagonal_v<MT1> ) {
332 else if( IsDiagonal_v<MT2> ) {
335 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
336 const size_t begin( ( IsUpper_v<MT1> )
337 ?( ( IsLower_v<MT2> )
338 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
339 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
340 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
341 :( ( IsLower_v<MT2> )
342 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
344 const size_t end( ( IsLower_v<MT1> )
345 ?( ( IsUpper_v<MT2> )
346 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
347 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
348 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
349 :( ( IsUpper_v<MT2> )
350 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
351 :(
lhs_.columns() ) ) );
375 if( i >=
lhs_.rows() ) {
378 if( j >=
rhs_.columns() ) {
390 inline size_t rows() const noexcept {
401 return rhs_.columns();
// Reports whether the expression can alias with the given address.
//   alias  address to check; forwarded unchanged to both operands
// Returns true if lhs_.isAliased( alias ) or rhs_.isAliased( alias ) is true.
// Note that the operands are queried via isAliased(), not canAlias().
431 template<
typename T >
432 inline bool canAlias(
const T* alias )
const noexcept {
433 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address.
//   alias  address to check; forwarded unchanged to both operands
// Returns true if either operand reports being aliased with that address.
443 template<
typename T >
444 inline bool isAliased(
const T* alias )
const noexcept {
445 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
455 return lhs_.isAligned() &&
rhs_.isAligned();
466 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
468 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
469 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
470 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
493 template<
typename MT
502 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
505 else if( rhs.
lhs_.columns() == 0UL ) {
520 DMatTDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for the assignment C = A * B.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
// The small kernel is chosen when either operand type is diagonal or when the
// number of target elements ( C.rows() * C.columns() ) is below
// DMATTDMATMULT_THRESHOLD.
// NOTE(review): the `else` in front of the selectBlasAssignKernel call is not
// visible in this excerpt; presumably that call is the alternative branch.
536 template<
typename MT3
539 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
541 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
542 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
543 selectSmallAssignKernel( C, A, B );
545 selectBlasAssignKernel( C, A, B );
// Default (scalar) assignment kernel C = A * B.
// This overload participates only when MT3 is row-major and neither MT4 nor MT5
// is diagonal (see the EnableIf_t condition on the return type).
//   C  target matrix, written through C(i,j)
//   A  left-hand operand, read through A(i,k)
//   B  right-hand operand, read through B(k,j)
// NOTE(review): only part of the body is visible in this excerpt; the bodies of
// several loops below are not shown and are therefore not described.
564 template<
typename MT3
567 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
568 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A (index range of k).
570 const size_t M( A.rows() );
571 const size_t N( B.columns() );
572 const size_t K( A.columns() );
// [ibegin,iend) is the row range processed by the main loop. For strictly
// lower A the first one or two rows are excluded, for strictly upper A the
// last one or two rows (two when the other operand is strict as well).
576 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
577 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
579 const size_t iend( ( IsStrictlyUpper_v<MT4> )
580 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
// Rows in front of the main range (loop body not visible).
584 for(
size_t i=0UL; i<ibegin; ++i ) {
585 for(
size_t j=0UL; j<N; ++j ) {
// Main row loop.
589 for(
size_t i=ibegin; i<iend; ++i )
// [jbegin,jend) is the column range of row i for which products are formed.
// With two upper operands the range starts at i, i+1 or i+2 depending on
// how many of them are strictly upper.
591 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
592 ?( ( IsStrictlyUpper_v<MT4> )
593 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
594 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
595 :( ( IsStrictlyUpper_v<MT5> )
// With two lower operands the range ends at i-1, i or i+1; otherwise it ends
// at N (N-1 for strictly lower B), clipped to i+1 when LOW is set.
598 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
599 ?( ( IsStrictlyLower_v<MT4> )
600 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
601 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
602 :( ( IsStrictlyLower_v<MT5> )
603 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
604 :(
LOW ? i+1UL : N ) ) );
607 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of jbegin. The start expression parses as
// ( SYM || HERM ) ? i : 0UL, so it starts at column i for SYM/HERM.
615 for(
size_t j=(
SYM ||
HERM ? i : 0UL ); j<jbegin; ++j ) {
618 for(
size_t j=jbegin; j<jend; ++j )
// [kbegin,kend) is the k range for element (i,j): bounded below by i for
// upper A and by j for lower B (+1 when strict) ...
620 const size_t kbegin( ( IsUpper_v<MT4> )
621 ?( ( IsLower_v<MT5> )
622 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
623 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
624 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
625 :( ( IsLower_v<MT5> )
626 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// ... and bounded above by i+1 for lower A and by j+1 for upper B (without
// the +1 when strict).
628 const size_t kend( ( IsLower_v<MT4> )
629 ?( ( IsUpper_v<MT5> )
630 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
631 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
632 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
633 :( ( IsUpper_v<MT5> )
634 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initializes C(i,j); the remaining products accumulate.
638 C(i,j) = A(i,kbegin) * B(kbegin,j);
639 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
640 C(i,j) += A(i,k) * B(k,j);
// Columns behind jend (loop body not visible).
643 for(
size_t j=jend; j<N; ++j ) {
// Rows behind the main range (loop body not visible).
647 for(
size_t i=iend; i<M; ++i ) {
648 for(
size_t j=0UL; j<N; ++j ) {
// Fills the elements below the diagonal (j < i) from their mirrored
// counterparts C(j,i); the value is conjugated when HERM is set.
654 for(
size_t i=1UL; i<M; ++i ) {
655 for(
size_t j=0UL; j<i; ++j ) {
656 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default (scalar) assignment kernel C = A * B.
// This overload participates only when MT3 is column-major and neither MT4 nor
// MT5 is diagonal (see the EnableIf_t condition on the return type). It iterates
// column by column (outer j, inner i).
//   C  target matrix, written through C(i,j)
//   A  left-hand operand, read through A(i,k)
//   B  right-hand operand, read through B(k,j)
// NOTE(review): only part of the body is visible in this excerpt; the bodies of
// several loops below are not shown and are therefore not described.
678 template<
typename MT3
681 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
682 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A (index range of k).
684 const size_t M( A.rows() );
685 const size_t N( B.columns() );
686 const size_t K( A.columns() );
// [jbegin,jend) is the column range processed by the main loop. For strictly
// upper B the first one or two columns are excluded, for strictly lower B the
// last one or two columns (two when A is strict as well).
690 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
691 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
693 const size_t jend( ( IsStrictlyLower_v<MT5> )
694 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
// Columns in front of the main range (loop body not visible).
698 for(
size_t j=0UL; j<jbegin; ++j ) {
699 for(
size_t i=0UL; i<M; ++i ) {
// Main column loop.
703 for(
size_t j=jbegin; j<jend; ++j )
// [ibegin,iend) is the row range of column j for which products are formed.
// With two lower operands the range starts at j, j+1 or j+2 depending on
// how many of them are strictly lower.
705 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
706 ?( ( IsStrictlyLower_v<MT4> )
707 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
708 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
709 :( ( IsStrictlyLower_v<MT4> )
// With two upper operands the range ends at j-1, j or j+1; otherwise it ends
// at M (M-1 for strictly upper A), clipped to j+1 when UPP is set.
712 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
713 ?( ( IsStrictlyUpper_v<MT4> )
714 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
715 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
716 :( ( IsStrictlyUpper_v<MT4> )
717 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
718 :(
UPP ? j+1UL : M ) ) );
721 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of ibegin. The start expression parses as
// ( SYM || HERM ) ? j : 0UL, so it starts at row j for SYM/HERM.
729 for(
size_t i=(
SYM ||
HERM ? j : 0UL ); i<ibegin; ++i ) {
732 for(
size_t i=ibegin; i<iend; ++i )
// [kbegin,kend) is the k range for element (i,j): bounded below by i for
// upper A and by j for lower B (+1 when strict) ...
734 const size_t kbegin( ( IsUpper_v<MT4> )
735 ?( ( IsLower_v<MT5> )
736 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
737 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
738 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
739 :( ( IsLower_v<MT5> )
740 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// ... and bounded above by i+1 for lower A and by j+1 for upper B (without
// the +1 when strict).
742 const size_t kend( ( IsLower_v<MT4> )
743 ?( ( IsUpper_v<MT5> )
744 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
745 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
746 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
747 :( ( IsUpper_v<MT5> )
748 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initializes C(i,j); the remaining products accumulate.
752 C(i,j) = A(i,kbegin) * B(kbegin,j);
753 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
754 C(i,j) += A(i,k) * B(k,j);
// Rows behind iend (loop body not visible).
757 for(
size_t i=iend; i<M; ++i ) {
// Columns behind the main range (loop body not visible).
761 for(
size_t j=jend; j<N; ++j ) {
762 for(
size_t i=0UL; i<M; ++i ) {
// Fills the elements above the diagonal (i < j) from their mirrored
// counterparts C(j,i); the value is conjugated when HERM is set.
768 for(
size_t j=1UL; j<N; ++j ) {
769 for(
size_t i=0UL; i<j; ++i ) {
770 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel C = A * B for a diagonal right-hand operand.
// This overload participates only when MT3 is row-major, MT4 is not diagonal
// and MT5 is diagonal. Only B(j,j) is read from B, so every target element is
// a single product.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand (diagonal) operand
792 template<
typename MT3
795 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
796 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
798 const size_t M( A.rows() );
799 const size_t N( B.columns() );
801 for(
size_t i=0UL; i<M; ++i )
// [jbegin,jend) is the column range of row i that is computed; for upper A
// it starts at i (i+1 when strict), for lower A it ends at i+1 (i when
// strict). The alternatives for non-triangular A are not visible here.
803 const size_t jbegin( ( IsUpper_v<MT4> )
804 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
806 const size_t jend( ( IsLower_v<MT4> )
807 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Columns in front of jbegin, handled only for upper A (body not visible).
811 if( IsUpper_v<MT4> ) {
812 for(
size_t j=0UL; j<jbegin; ++j ) {
816 for(
size_t j=jbegin; j<jend; ++j ) {
817 C(i,j) = A(i,j) * B(j,j);
// Columns behind jend, handled only for lower A (body not visible).
819 if( IsLower_v<MT4> ) {
820 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel C = A * B for a diagonal right-hand operand.
// This overload participates only when MT3 is column-major, MT4 is not diagonal
// and MT5 is diagonal. The iteration space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand (diagonal) operand
843 template<
typename MT3
846 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
847 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
849 constexpr
size_t block( BLOCK_SIZE );
851 const size_t M( A.rows() );
852 const size_t N( B.columns() );
// jj/ii are the tile origins; jend/iend clip the tile to the matrix bounds.
854 for(
size_t jj=0UL; jj<N; jj+=block ) {
855 const size_t jend(
min( N, jj+block ) );
856 for(
size_t ii=0UL; ii<M; ii+=block ) {
857 const size_t iend(
min( M, ii+block ) );
858 for(
size_t j=jj; j<jend; ++j )
// [ibegin,ipos) is the part of the tile column that is computed: for lower
// A it starts at j (j+1 when strict) but not before ii, for upper A it ends
// at j+1 (j when strict) but not behind iend.
860 const size_t ibegin( ( IsLower_v<MT4> )
861 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
863 const size_t ipos( ( IsUpper_v<MT4> )
864 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// Rows of the tile in front of ibegin, lower A only (body not visible).
867 if( IsLower_v<MT4> ) {
868 for(
size_t i=ii; i<ibegin; ++i ) {
872 for(
size_t i=ibegin; i<ipos; ++i ) {
873 C(i,j) = A(i,j) * B(j,j);
// Rows of the tile behind ipos, upper A only (body not visible).
875 if( IsUpper_v<MT4> ) {
876 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel C = A * B for a diagonal left-hand operand.
// This overload participates only when MT3 is row-major, MT4 is diagonal and
// MT5 is not diagonal. Only A(i,i) is read from A. The iteration space is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
//   C  target matrix
//   A  left-hand (diagonal) operand
//   B  right-hand operand
901 template<
typename MT3
904 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
905 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
907 constexpr
size_t block( BLOCK_SIZE );
909 const size_t M( A.rows() );
910 const size_t N( B.columns() );
// ii/jj are the tile origins; iend/jend clip the tile to the matrix bounds.
912 for(
size_t ii=0UL; ii<M; ii+=block ) {
913 const size_t iend(
min( M, ii+block ) );
914 for(
size_t jj=0UL; jj<N; jj+=block ) {
915 const size_t jend(
min( N, jj+block ) );
916 for(
size_t i=ii; i<iend; ++i )
// [jbegin,jpos) is the part of the tile row that is computed: for upper B
// it starts at i (i+1 when strict) but not before jj, for lower B it ends
// at i+1 (i when strict) but not behind jend.
918 const size_t jbegin( ( IsUpper_v<MT5> )
919 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
921 const size_t jpos( ( IsLower_v<MT5> )
922 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// Columns of the tile in front of jbegin, upper B only (body not visible).
925 if( IsUpper_v<MT5> ) {
926 for(
size_t j=jj; j<jbegin; ++j ) {
930 for(
size_t j=jbegin; j<jpos; ++j ) {
931 C(i,j) = A(i,i) * B(i,j);
// Columns of the tile behind jpos, lower B only (body not visible).
933 if( IsLower_v<MT5> ) {
934 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel C = A * B for a diagonal left-hand operand.
// This overload participates only when MT3 is column-major, MT4 is diagonal and
// MT5 is not diagonal. Only A(i,i) is read from A, so every target element is
// a single product.
//   C  target matrix
//   A  left-hand (diagonal) operand
//   B  right-hand operand
959 template<
typename MT3
962 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
963 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
965 const size_t M( A.rows() );
966 const size_t N( B.columns() );
968 for(
size_t j=0UL; j<N; ++j )
// [ibegin,iend) is the row range of column j that is computed; for lower B
// it starts at j (j+1 when strict), for upper B it ends at j+1 (j when
// strict). The alternatives for non-triangular B are not visible here.
970 const size_t ibegin( ( IsLower_v<MT5> )
971 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
973 const size_t iend( ( IsUpper_v<MT5> )
974 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Rows in front of ibegin, handled only for lower B (body not visible).
978 if( IsLower_v<MT5> ) {
979 for(
size_t i=0UL; i<ibegin; ++i ) {
983 for(
size_t i=ibegin; i<iend; ++i ) {
984 C(i,j) = A(i,i) * B(i,j);
// Rows behind iend, handled only for upper B (body not visible).
986 if( IsUpper_v<MT5> ) {
987 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel C = A * B for two diagonal operands.
// This overload participates only when both MT4 and MT5 are diagonal; it does
// not depend on the storage order of MT3. The visible loop writes only the
// diagonal elements C(i,i).
// NOTE(review): any statement in front of the loop (original lines 1015-1017)
// is not visible in this excerpt.
//   C  target matrix
//   A  left-hand (diagonal) operand
//   B  right-hand (diagonal) operand
1010 template<
typename MT3
1013 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1014 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1018 for(
size_t i=0UL; i<A.rows(); ++i ) {
1019 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel, non-vectorized fallback.
// This overload participates when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is
// false (DisableIf_t) and simply forwards to selectDefaultAssignKernel.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
1039 template<
typename MT3
1042 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1043 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1045 selectDefaultAssignKernel( C, A, B );
// Small-matrix assignment kernel C = A * B, vectorized, row-major target.
// This overload participates when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
//   C  target matrix
//   A  left-hand operand, read with A.load(i,k) and A(i,k)
//   B  right-hand operand, read with B.load(k,j) and B(k,j)
// Each target element is a dot product over k. The visible code processes
// tiles of 2x4, 2x2 and 2x1 elements for pairs of rows, then tiles of 1x4, 1x2
// and 1x1 elements for a single row. For every tile the SIMD accumulators are
// reduced with sum() and stored, after which a scalar loop adds the products
// for the remaining k values.
// NOTE(review): many lines (accumulator loops, branch headers, declarations of
// i, j and k) are not visible in this excerpt.
1065 template<
typename MT3
1068 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1069 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// remainder is true unless both operand types are padded; it enables the
// scalar tail loops below.
1071 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
// M = rows of A, N = columns of B, K = columns of A (index range of k).
1073 const size_t M( A.rows() );
1074 const size_t N( B.columns() );
1075 const size_t K( A.columns() );
// Two rows at a time; skipped entirely when both LOW and UPP are set.
1086 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// For LOW the column range ends behind the current pair of rows.
1088 const size_t jend(
LOW ? i+2UL : N );
// --- 2x4 tile: rows i, i+1 and columns j .. j+3 ---
1091 for( ; (j+4UL) <= jend; j+=4UL )
// kbegin is the start of the k range. The `& size_t(-SIMDSIZE)` mask clears
// the low bits, i.e. rounds down to a multiple of SIMDSIZE (assuming SIMDSIZE
// is a power of two).
1093 const size_t kbegin( ( IsUpper_v<MT4> )
1094 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1095 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// kend is the end of the k range: limited by the last row of the tile for
// lower A and by the last column of the tile for upper B, otherwise K.
1096 const size_t kend( ( IsLower_v<MT4> )
1097 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
1098 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos equals kend rounded down by the same mask when a scalar remainder
// loop is needed, otherwise kend itself.
1100 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// One accumulator per element of the 2x4 tile.
1103 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1108 const SIMDType a2( A.load(i+1UL,k) );
1110 const SIMDType b2( B.load(k,j+1UL) );
1111 const SIMDType b3( B.load(k,j+2UL) );
1112 const SIMDType b4( B.load(k,j+3UL) );
// Horizontal reduction of each accumulator into its target element.
1123 C(i ,j ) =
sum( xmm1 );
1124 C(i ,j+1UL) =
sum( xmm2 );
1125 C(i ,j+2UL) =
sum( xmm3 );
1126 C(i ,j+3UL) =
sum( xmm4 );
1127 C(i+1UL,j ) =
sum( xmm5 );
1128 C(i+1UL,j+1UL) =
sum( xmm6 );
1129 C(i+1UL,j+2UL) =
sum( xmm7 );
1130 C(i+1UL,j+3UL) =
sum( xmm8 );
// Scalar tail: adds the products for the k values up to kend.
1132 for( ; remainder && k<kend; ++k ) {
1133 C(i ,j ) += A(i ,k) * B(k,j );
1134 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1135 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1136 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1137 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1138 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1139 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1140 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// --- 2x2 tile: rows i, i+1 and columns j, j+1 ---
1144 for( ; (j+2UL) <= jend; j+=2UL )
1146 const size_t kbegin( ( IsUpper_v<MT4> )
1147 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1148 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1149 const size_t kend( ( IsLower_v<MT4> )
1150 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1151 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1153 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1161 const SIMDType a2( A.load(i+1UL,k) );
1163 const SIMDType b2( B.load(k,j+1UL) );
1170 C(i ,j ) =
sum( xmm1 );
1171 C(i ,j+1UL) =
sum( xmm2 );
1172 C(i+1UL,j ) =
sum( xmm3 );
1173 C(i+1UL,j+1UL) =
sum( xmm4 );
1175 for( ; remainder && k<kend; ++k ) {
1176 C(i ,j ) += A(i ,k) * B(k,j );
1177 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1178 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1179 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// --- 2x1 tile: rows i, i+1 and the single column j ---
// (the statement that opens this section is not visible in this excerpt)
1185 const size_t kbegin( ( IsUpper_v<MT4> )
1186 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1187 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1188 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1190 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1198 xmm1 += A.load(i ,k) * b1;
1199 xmm2 += A.load(i+1UL,k) * b1;
1202 C(i ,j) =
sum( xmm1 );
1203 C(i+1UL,j) =
sum( xmm2 );
1205 for( ; remainder && k<kend; ++k ) {
1206 C(i ,j) += A(i ,k) * B(k,j);
1207 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single-row section: for LOW the column range ends behind row i.
// (the statement that opens this section is not visible in this excerpt)
1214 const size_t jend(
LOW ? i+1UL : N );
// --- 1x4 tile: row i and columns j .. j+3; skipped for LOW && UPP ---
1217 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
1219 const size_t kbegin( ( IsUpper_v<MT4> )
1220 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1221 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1222 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
1224 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1232 xmm1 += a1 * B.load(k,j );
1233 xmm2 += a1 * B.load(k,j+1UL);
1234 xmm3 += a1 * B.load(k,j+2UL);
1235 xmm4 += a1 * B.load(k,j+3UL);
1238 C(i,j ) =
sum( xmm1 );
1239 C(i,j+1UL) =
sum( xmm2 );
1240 C(i,j+2UL) =
sum( xmm3 );
1241 C(i,j+3UL) =
sum( xmm4 );
1243 for( ; remainder && k<kend; ++k ) {
1244 C(i,j ) += A(i,k) * B(k,j );
1245 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1246 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
1247 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
// --- 1x2 tile: row i and columns j, j+1; skipped for LOW && UPP ---
1251 for( ; !(
LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1253 const size_t kbegin( ( IsUpper_v<MT4> )
1254 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1255 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1256 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1258 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1266 xmm1 += a1 * B.load(k,j );
1267 xmm2 += a1 * B.load(k,j+1UL);
1270 C(i,j ) =
sum( xmm1 );
1271 C(i,j+1UL) =
sum( xmm2 );
1273 for( ; remainder && k<kend; ++k ) {
1274 C(i,j ) += A(i,k) * B(k,j );
1275 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// --- 1x1: single element (i,j); here the k range always ends at K ---
1279 for( ; j<jend; ++j )
1281 const size_t kbegin( ( IsUpper_v<MT4> )
1282 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1283 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1285 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
1292 xmm1 += A.load(i,k) * B.load(k,j);
1295 C(i,j) =
sum( xmm1 );
1297 for( ; remainder && k<K; ++k ) {
1298 C(i,j) += A(i,k) * B(k,j);
// Post-processing. For every row i >= 2 the elements in columns
// [0, 2*(i/2)) are taken from their mirrored counterparts C(j,i), conjugated
// when HERM is set.
// NOTE(review): the condition guarding this loop is not visible here.
1305 for(
size_t i=2UL; i<M; ++i ) {
1306 const size_t jend( 2UL * ( i/2UL ) );
1307 for(
size_t j=0UL; j<jend; ++j ) {
1308 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Two further passes over the block-aligned triangular regions; neither the
// guarding conditions nor the loop bodies are visible in this excerpt.
1313 for(
size_t j=2UL; j<N; ++j ) {
1314 const size_t iend( 2UL * ( j/2UL ) );
1315 for(
size_t i=0UL; i<iend; ++i ) {
1321 for(
size_t i=2UL; i<M; ++i ) {
1322 const size_t jend( 2UL * ( i/2UL ) );
1323 for(
size_t j=0UL; j<jend; ++j ) {
// Small-matrix assignment kernel C = A * B, vectorized, column-major target.
// This overload participates when MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
//   C  target matrix
//   A  left-hand operand, read with A.load(i,k) and A(i,k)
//   B  right-hand operand, read with B.load(k,j) and B(k,j)
// Each target element is a dot product over k. The visible code processes
// tiles of 4x2 and 4x1 elements for groups of four rows, tiles of 2x2 and 2x1
// elements for pairs of rows, and tiles of 1x2 and 1x1 elements for a single
// row. For every tile the SIMD accumulators are reduced with sum() and stored,
// after which a scalar loop adds the products for the remaining k values.
// NOTE(review): many lines (accumulator loops, branch headers, declarations of
// i, j and k) are not visible in this excerpt.
1347 template<
typename MT3
1350 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1351 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// remainder is true unless both operand types are padded; it enables the
// scalar tail loops below.
1353 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
// M = rows of A, N = columns of B, K = columns of A (index range of k).
1355 const size_t M( A.rows() );
1356 const size_t N( B.columns() );
1357 const size_t K( A.columns() );
// Four rows at a time; skipped entirely when both LOW and UPP are set.
1368 for( ; !(
LOW &&
UPP ) && (i+4UL) <= M; i+=4UL )
// The condition parses as ( SYM || HERM || LOW ) ? i+4UL : N, i.e. the column
// range ends behind the current group of rows for SYM, HERM and LOW.
1370 const size_t jend(
SYM ||
HERM ||
LOW ? i+4UL : N );
// For UPP the column range starts at the current row index.
1371 size_t j(
UPP ? i : 0UL );
// --- 4x2 tile: rows i .. i+3 and columns j, j+1 ---
1373 for( ; (j+2UL) <= jend; j+=2UL )
// kbegin is the start of the k range. The `& size_t(-SIMDSIZE)` mask clears
// the low bits, i.e. rounds down to a multiple of SIMDSIZE (assuming SIMDSIZE
// is a power of two).
1375 const size_t kbegin( ( IsUpper_v<MT4> )
1376 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1377 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// kend is the end of the k range: limited by the last row of the tile for
// lower A and by the last column of the tile for upper B, otherwise K.
1378 const size_t kend( ( IsLower_v<MT4> )
1379 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
1380 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos equals kend rounded down by the same mask when a scalar remainder
// loop is needed, otherwise kend itself.
1382 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// One accumulator per element of the 4x2 tile.
1385 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1390 const SIMDType a2( A.load(i+1UL,k) );
1391 const SIMDType a3( A.load(i+2UL,k) );
1392 const SIMDType a4( A.load(i+3UL,k) );
1394 const SIMDType b2( B.load(k,j+1UL) );
// Horizontal reduction of each accumulator into its target element.
1405 C(i ,j ) =
sum( xmm1 );
1406 C(i ,j+1UL) =
sum( xmm2 );
1407 C(i+1UL,j ) =
sum( xmm3 );
1408 C(i+1UL,j+1UL) =
sum( xmm4 );
1409 C(i+2UL,j ) =
sum( xmm5 );
1410 C(i+2UL,j+1UL) =
sum( xmm6 );
1411 C(i+3UL,j ) =
sum( xmm7 );
1412 C(i+3UL,j+1UL) =
sum( xmm8 );
// Scalar tail: adds the products for the k values up to kend.
1414 for( ; remainder && k<kend; ++k ) {
1415 C(i ,j ) += A(i ,k) * B(k,j );
1416 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1417 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1418 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1419 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1420 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1421 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1422 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// --- 4x1 tile: rows i .. i+3 and the single column j ---
// (the statement that opens this section is not visible in this excerpt)
1428 const size_t kbegin( ( IsUpper_v<MT4> )
1429 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1430 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1431 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
1433 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1441 xmm1 += A.load(i ,k) * b1;
1442 xmm2 += A.load(i+1UL,k) * b1;
1443 xmm3 += A.load(i+2UL,k) * b1;
1444 xmm4 += A.load(i+3UL,k) * b1;
1447 C(i ,j) =
sum( xmm1 );
1448 C(i+1UL,j) =
sum( xmm2 );
1449 C(i+2UL,j) =
sum( xmm3 );
1450 C(i+3UL,j) =
sum( xmm4 );
1452 for( ; remainder && k<kend; ++k ) {
1453 C(i ,j) += A(i ,k) * B(k,j);
1454 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1455 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
1456 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
// Two rows at a time; skipped entirely when both LOW and UPP are set.
1461 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// --- 2x2 tile: rows i, i+1 and columns j, j+1; here the column loop runs
// up to N ---
1465 for( ; (j+2UL) <= N; j+=2UL )
1467 const size_t kbegin( ( IsUpper_v<MT4> )
1468 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1469 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1470 const size_t kend( ( IsLower_v<MT4> )
1471 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1472 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1474 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1482 const SIMDType a2( A.load(i+1UL,k) );
1484 const SIMDType b2( B.load(k,j+1UL) );
1491 C(i ,j ) =
sum( xmm1 );
1492 C(i ,j+1UL) =
sum( xmm2 );
1493 C(i+1UL,j ) =
sum( xmm3 );
1494 C(i+1UL,j+1UL) =
sum( xmm4 );
1496 for( ; remainder && k<kend; ++k ) {
1497 C(i ,j ) += A(i ,k) * B(k,j );
1498 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1499 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1500 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// --- 2x1 tile: rows i, i+1 and the single column j ---
// (the statement that opens this section is not visible in this excerpt)
1506 const size_t kbegin( ( IsUpper_v<MT4> )
1507 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1508 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1509 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1511 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1519 xmm1 += A.load(i ,k) * b1;
1520 xmm2 += A.load(i+1UL,k) * b1;
1523 C(i ,j) =
sum( xmm1 );
1524 C(i+1UL,j) =
sum( xmm2 );
1526 for( ; remainder && k<kend; ++k ) {
1527 C(i ,j) += A(i ,k) * B(k,j);
1528 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Single-row section: when both LOW and UPP are set the column range shrinks
// to the single column i, otherwise it is [0,N).
// (the statement that opens this section is not visible in this excerpt)
1535 const size_t jend(
LOW &&
UPP ? i+1UL : N );
1536 size_t j(
LOW &&
UPP ? i : 0UL );
// --- 1x2 tile: row i and columns j, j+1; skipped for LOW && UPP ---
1538 for( ; !(
LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1540 const size_t kbegin( ( IsUpper_v<MT4> )
1541 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1542 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1543 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1545 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
1553 xmm1 += a1 * B.load(k,j );
1554 xmm2 += a1 * B.load(k,j+1UL);
1557 C(i,j ) =
sum( xmm1 );
1558 C(i,j+1UL) =
sum( xmm2 );
1560 for( ; remainder && k<kend; ++k ) {
1561 C(i,j ) += A(i,k) * B(k,j );
1562 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// --- 1x1: single element (i,j); here the k range always ends at K ---
1566 for( ; j<jend; ++j )
1568 const size_t kbegin( ( IsUpper_v<MT4> )
1569 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
1570 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
1572 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
1579 xmm1 += A.load(i,k) * B.load(k,j);
1582 C(i,j) =
sum( xmm1 );
1584 for( ; remainder && k<K; ++k ) {
1585 C(i,j) += A(i,k) * B(k,j);
// Post-processing for SYM/HERM with more than four columns: for every column
// j >= 4 the elements in rows [0, 4*(j/4)) are taken from their mirrored
// counterparts C(j,i), conjugated when HERM is set.
1591 if( (
SYM ||
HERM ) && ( N > 4UL ) ) {
1592 for(
size_t j=4UL; j<N; ++j ) {
1593 const size_t iend( 4UL * ( j/4UL ) );
1594 for(
size_t i=0UL; i<iend; ++i ) {
1595 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Two further passes over the block-aligned triangular regions; neither the
// guarding conditions nor the loop bodies are visible in this excerpt.
1600 for(
size_t j=4UL; j<N; ++j ) {
1601 const size_t iend( 4UL * ( j/4UL ) );
1602 for(
size_t i=0UL; i<iend; ++i ) {
// NOTE(review): this row loop is bounded by N rather than M — confirm that
// the guarded case implies a square target.
1608 for(
size_t i=4UL; i<N; ++i ) {
1609 const size_t jend( 4UL * ( i/4UL ) );
1610 for(
size_t j=0UL; j<jend; ++j ) {
// Large-matrix assignment kernel, non-vectorized fallback.
// This overload participates when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is
// false (DisableIf_t) and simply forwards to selectDefaultAssignKernel.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
1633 template<
typename MT3
1636 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1637 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1639 selectDefaultAssignKernel( C, A, B );
// Large-matrix assignment kernel, vectorized variant.
// This overload participates when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is
// true (EnableIf_t).
// NOTE(review): the body of this overload is not visible in this excerpt.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
1659 template<
typename MT3
1662 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1663 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback when BLAS cannot be used.
// This overload participates when UseBlasKernel_v<MT3,MT4,MT5> is false
// (DisableIf_t) and forwards to selectLargeAssignKernel.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
1693 template<
typename MT3
1696 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1697 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1699 selectLargeAssignKernel( C, A, B );
// BLAS-based assignment kernel; compiled only when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both enabled, and selected
// when UseBlasKernel_v<MT3,MT4,MT5> is true.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
// Dispatch on the operand structure:
//  - MT4 triangular: trmm() with A applied from the left (CblasLeft), using
//    CblasLower or CblasUpper according to IsLower_v<MT4>, scaling factor ET(1);
//  - MT5 triangular: trmm() with B applied from the right (CblasRight);
//  - the gemm() call is invoked with the factors ET(1) and ET(0).
// NOTE(review): the statements between the branch headers and the trmm() calls
// and the `else` in front of gemm() are not visible in this excerpt.
1705 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1719 template<
typename MT3
1722 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1723 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1725 using ET = ElementType_t<MT3>;
1727 if( IsTriangular_v<MT4> ) {
1729 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
1731 else if( IsTriangular_v<MT5> ) {
1733 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
1736 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix.
//   lhs  target sparse matrix
//   rhs  multiplication expression to be assigned
// The expression is first evaluated serially into a dense temporary and the
// temporary is then assigned to the target through the forward functor.
1756 template<
typename MT
1758 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// The temporary is of type OppositeType when SO is true, otherwise ResultType.
1762 using TmpType = If_t< SO, OppositeType, ResultType >;
1774 const ForwardFunctor fwd;
1776 const TmpType tmp(
serial( rhs ) );
1777 assign( ~lhs, fwd( tmp ) );
// Addition assignment of the multiplication expression to a dense matrix.
//   lhs  target dense matrix
//   rhs  multiplication expression to be added
// The first check covers the degenerate cases: a target without rows or
// columns, or an expression whose left operand has no columns (empty k range).
// NOTE(review): the body of that check and the definitions of A and B are not
// visible in this excerpt; presumably the function returns early there.
1795 template<
typename MT
1797 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
1804 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hands over to the kernel dispatch for the addition assignment.
1818 DMatTDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for the addition assignment C += A * B.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand operand
// The small kernel is chosen when either operand type is diagonal or when the
// number of target elements ( C.rows() * C.columns() ) is below
// DMATTDMATMULT_THRESHOLD.
// NOTE(review): the `else` in front of the selectBlasAddAssignKernel call is
// not visible in this excerpt; presumably that call is the alternative branch.
1834 template<
typename MT3
1837 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1839 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
1840 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
1841 selectSmallAddAssignKernel( C, A, B );
1843 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition assignment kernel C += A * B.
// This overload participates only when MT3 is row-major and neither MT4 nor MT5
// is diagonal (see the EnableIf_t condition on the return type).
//   C  target matrix, updated through C(i,j) +=
//   A  left-hand operand, read through A(i,k)
//   B  right-hand operand, read through B(k,j)
// Elements outside the computed index ranges are left untouched.
// NOTE(review): some lines of the body are not visible in this excerpt.
1862 template<
typename MT3
1865 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1866 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A (index range of k).
1868 const size_t M( A.rows() );
1869 const size_t N( B.columns() );
1870 const size_t K( A.columns() );
// [ibegin,iend) is the row range that receives contributions. For strictly
// lower A the first one or two rows are excluded, for strictly upper A the
// last one or two rows (two when the other operand is strict as well).
1874 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
1875 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
1877 const size_t iend( ( IsStrictlyUpper_v<MT4> )
1878 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
1882 for(
size_t i=ibegin; i<iend; ++i )
// [jbegin,jend) is the column range of row i. With two upper operands it
// starts at i, i+1 or i+2; otherwise it starts at 0 (1 for strictly upper B),
// raised to i when UPP is set.
1884 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
1885 ?( ( IsStrictlyUpper_v<MT4> )
1886 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
1887 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
1888 :( ( IsStrictlyUpper_v<MT5> )
1889 ?(
UPP ?
max( i, 1UL ) : 1UL )
1890 :(
UPP ? i : 0UL ) ) );
// With two lower operands the range ends at i-1, i or i+1; otherwise it ends
// at N (N-1 for strictly lower B), clipped to i+1 when LOW is set.
1891 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
1892 ?( ( IsStrictlyLower_v<MT4> )
1893 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
1894 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
1895 :( ( IsStrictlyLower_v<MT5> )
1896 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
1897 :(
LOW ? i+1UL : N ) ) );
// With LOW or UPP the clipped range can be inverted; such rows are skipped.
1899 if( (
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
1902 for(
size_t j=jbegin; j<jend; ++j )
// [kbegin,kend) is the k range for element (i,j): bounded below by i for
// upper A and by j for lower B (+1 when strict) ...
1904 const size_t kbegin( ( IsUpper_v<MT4> )
1905 ?( ( IsLower_v<MT5> )
1906 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1907 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1908 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1909 :( ( IsLower_v<MT5> )
1910 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// ... and bounded above by i+1 for lower A and by j+1 for upper B (without
// the +1 when strict).
1912 const size_t kend( ( IsLower_v<MT4> )
1913 ?( ( IsUpper_v<MT5> )
1914 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
1915 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
1916 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
1917 :( ( IsUpper_v<MT5> )
1918 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The k loop is unrolled by two: kpos is kbegin plus the number of terms
// rounded down to an even count ( knum & size_t(-2) clears the lowest bit ).
1922 const size_t knum( kend - kbegin );
1923 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
1925 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
1926 C(i,j) += A(i,k ) * B(k ,j);
1927 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover term at index kpos.
// NOTE(review): the guard for this statement is not visible here; presumably
// it is executed only when kpos < kend.
1930 C(i,j) += A(i,kpos) * B(kpos,j);
// Default (scalar) addition assignment kernel C += A * B.
// This overload participates only when MT3 is column-major and neither MT4 nor
// MT5 is diagonal (see the EnableIf_t condition on the return type). It iterates
// column by column (outer j, inner i).
//   C  target matrix, updated through C(i,j) +=
//   A  left-hand operand, read through A(i,k)
//   B  right-hand operand, read through B(k,j)
// Elements outside the computed index ranges are left untouched.
// NOTE(review): some lines of the body are not visible in this excerpt.
1952 template<
typename MT3
1955 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1956 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A (index range of k).
1958 const size_t M( A.rows() );
1959 const size_t N( B.columns() );
1960 const size_t K( A.columns() );
// [jbegin,jend) is the column range that receives contributions. For strictly
// upper B the first one or two columns are excluded, for strictly lower B the
// last one or two columns (two when A is strict as well).
1964 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
1965 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
1967 const size_t jend( ( IsStrictlyLower_v<MT5> )
1968 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
1972 for(
size_t j=jbegin; j<jend; ++j )
// [ibegin,iend) is the row range of column j. With two lower operands it
// starts at j, j+1 or j+2; otherwise it starts at 0 (1 for strictly lower A),
// raised to j when LOW is set.
1974 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
1975 ?( ( IsStrictlyLower_v<MT4> )
1976 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
1977 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1978 :( ( IsStrictlyLower_v<MT4> )
1979 ?(
LOW ?
max( j, 1UL ) : 1UL )
1980 :(
LOW ? j : 0UL ) ) );
// With two upper operands the range ends at j-1, j or j+1; otherwise it ends
// at M (M-1 for strictly upper A), clipped to j+1 when UPP is set.
1981 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
1982 ?( ( IsStrictlyUpper_v<MT4> )
1983 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
1984 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
1985 :( ( IsStrictlyUpper_v<MT4> )
1986 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
1987 :(
UPP ? j+1UL : M ) ) );
// With LOW or UPP the clipped range can be inverted; such columns are skipped.
1989 if( (
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
1992 for(
size_t i=ibegin; i<iend; ++i )
// [kbegin,kend) is the k range for element (i,j): bounded below by i for
// upper A and by j for lower B (+1 when strict) ...
1994 const size_t kbegin( ( IsUpper_v<MT4> )
1995 ?( ( IsLower_v<MT5> )
1996 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1997 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1998 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1999 :( ( IsLower_v<MT5> )
2000 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// ... and bounded above by i+1 for lower A and by j+1 for upper B (without
// the +1 when strict).
2002 const size_t kend( ( IsLower_v<MT4> )
2003 ?( ( IsUpper_v<MT5> )
2004 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
2005 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2006 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2007 :( ( IsUpper_v<MT5> )
2008 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The k loop is unrolled by two: kpos is kbegin plus the number of terms
// rounded down to an even count ( knum & size_t(-2) clears the lowest bit ).
2012 const size_t knum( kend - kbegin );
2013 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
2015 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2016 C(i,j) += A(i,k ) * B(k ,j);
2017 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Leftover term at index kpos.
// NOTE(review): the guard for this statement is not visible here; presumably
// it is executed only when kpos < kend.
2020 C(i,j) += A(i,kpos) * B(kpos,j);
// Default addition assignment kernel C += A * B for a diagonal right-hand
// operand. This overload participates only when MT3 is row-major, MT4 is not
// diagonal and MT5 is diagonal. Only B(j,j) is read from B, so every target
// element receives a single product.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand (diagonal) operand
2042 template<
typename MT3
2045 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2046 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2048 const size_t M( A.rows() );
2049 const size_t N( B.columns() );
2051 for(
size_t i=0UL; i<M; ++i )
// [jbegin,jend) is the column range of row i; for upper A it starts at i
// (i+1 when strict), for lower A it ends at i+1 (i when strict). The
// alternatives for non-triangular A are not visible in this excerpt.
2053 const size_t jbegin( ( IsUpper_v<MT4> )
2054 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2056 const size_t jend( ( IsLower_v<MT4> )
2057 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// The column loop is unrolled by two: jpos is jbegin plus the number of
// columns rounded down to an even count.
2061 const size_t jnum( jend - jbegin );
2062 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2064 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2065 C(i,j ) += A(i,j ) * B(j ,j );
2066 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at index jpos.
// NOTE(review): the guard for this statement is not visible here; presumably
// it is executed only when jpos < jend.
2069 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default addition assignment kernel C += A * B for a diagonal right-hand
// operand. This overload participates only when MT3 is column-major, MT4 is not
// diagonal and MT5 is diagonal. The iteration space is traversed in tiles of
// BLOCK_SIZE x BLOCK_SIZE elements.
//   C  target matrix
//   A  left-hand operand
//   B  right-hand (diagonal) operand
2090 template<
typename MT3
2093 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2094 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2096 constexpr
size_t block( BLOCK_SIZE );
2098 const size_t M( A.rows() );
2099 const size_t N( B.columns() );
// jj/ii are the tile origins; jend/iend clip the tile to the matrix bounds.
2101 for(
size_t jj=0UL; jj<N; jj+=block ) {
2102 const size_t jend(
min( N, jj+block ) );
2103 for(
size_t ii=0UL; ii<M; ii+=block ) {
2104 const size_t iend(
min( M, ii+block ) );
2105 for(
size_t j=jj; j<jend; ++j )
// [ibegin,ipos) is the part of the tile column that is updated: for lower A
// it starts at j (j+1 when strict) but not before ii, for upper A it ends
// at j+1 (j when strict) but not behind iend.
2107 const size_t ibegin( ( IsLower_v<MT4> )
2108 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
2110 const size_t ipos( ( IsUpper_v<MT4> )
2111 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
2114 for(
size_t i=ibegin; i<ipos; ++i ) {
2115 C(i,j) += A(i,j) * B(j,j);
// Default addition assignment kernel C += A * B for a diagonal left-hand
// operand. This overload participates only when MT3 is row-major, MT4 is
// diagonal and MT5 is not diagonal. Only A(i,i) is read from A. The iteration
// space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE elements.
//   C  target matrix
//   A  left-hand (diagonal) operand
//   B  right-hand operand
2138 template<
typename MT3
2141 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2142 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2144 constexpr
size_t block( BLOCK_SIZE );
2146 const size_t M( A.rows() );
2147 const size_t N( B.columns() );
// ii/jj are the tile origins; iend/jend clip the tile to the matrix bounds.
2149 for(
size_t ii=0UL; ii<M; ii+=block ) {
2150 const size_t iend(
min( M, ii+block ) );
2151 for(
size_t jj=0UL; jj<N; jj+=block ) {
2152 const size_t jend(
min( N, jj+block ) );
2153 for(
size_t i=ii; i<iend; ++i )
// [jbegin,jpos) is the part of the tile row that is updated: for upper B it
// starts at i (i+1 when strict) but not before jj, for lower B it ends at
// i+1 (i when strict) but not behind jend.
2155 const size_t jbegin( ( IsUpper_v<MT5> )
2156 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
2158 const size_t jpos( ( IsLower_v<MT5> )
2159 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
2162 for(
size_t j=jbegin; j<jpos; ++j ) {
2163 C(i,j) += A(i,i) * B(i,j);
// Default addition assignment kernel C += A * B for a diagonal left-hand
// operand. This overload participates only when MT3 is column-major, MT4 is
// diagonal and MT5 is not diagonal. Only A(i,i) is read from A, so every target
// element receives a single product.
//   C  target matrix
//   A  left-hand (diagonal) operand
//   B  right-hand operand
2186 template<
typename MT3
2189 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2190 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2192 const size_t M( A.rows() );
2193 const size_t N( B.columns() );
2195 for(
size_t j=0UL; j<N; ++j )
// [ibegin,iend) is the row range of column j; for lower B it starts at j
// (j+1 when strict), for upper B it ends at j+1 (j when strict). The
// alternatives for non-triangular B are not visible in this excerpt.
2197 const size_t ibegin( ( IsLower_v<MT5> )
2198 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2200 const size_t iend( ( IsUpper_v<MT5> )
2201 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The row loop is unrolled by two: ipos is ibegin plus the number of rows
// rounded down to an even count.
2205 const size_t inum( iend - ibegin );
2206 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
2208 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2209 C(i ,j) += A(i ,i ) * B(i ,j);
2210 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at index ipos.
// NOTE(review): the guard for this statement is not visible here; presumably
// it is executed only when ipos < iend.
2213 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition assignment kernel, enabled when both operands are diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) for every row of A.
2234 template<
typename MT3
2237 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2238 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2240 for(
size_t i=0UL; i<A.rows(); ++i ) {
2241 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel for type combinations that cannot use
// the vectorized default kernel (UseVectorizedDefaultKernel_v is false).
// Simply forwards to the default addition assignment kernel.
2261 template<
typename MT3
2264 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2265 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2267 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition assignment kernel for a row-major target C.
// C is processed in register tiles (2x4, 2x2, 2x1, then 1x4, 1x2, 1x1). For each
// tile the products A(i,k) * B(k,j) are accumulated over k in SIMD registers,
// sum() of every accumulator is added to the matching element of C, and a scalar
// loop finishes the k values not covered by whole SIMD vectors.
2287 template<
typename MT3
2290 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2291 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed as soon as one of the operands is not padded.
2293 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2295 const size_t M( A.rows() );
2296 const size_t N( B.columns() );
2297 const size_t K( A.columns() );
// ---- Two rows of C per iteration ----
2303 for( ; (i+2UL) <= M; i+=2UL )
// LOW limits the columns to those up to the end of the current row pair,
// UPP skips the columns before row index i.
2305 const size_t jend(
LOW ? i+2UL : N );
2306 size_t j(
UPP ? i : 0UL );
// 2x4 tile; skipped entirely when both LOW and UPP are set.
2308 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// kbegin: first k that can contribute (k >= i for an upper A, k >= j for a lower
// B), rounded down by masking with size_t(-SIMDSIZE)
// (assumes SIMDSIZE is a power of two - TODO confirm).
2310 const size_t kbegin( ( IsUpper_v<MT4> )
2311 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2312 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// kend: one past the last k that can contribute (bounded by the tile's last row
// for a lower A and by the tile's last column for an upper B), K otherwise.
2313 const size_t kend( ( IsLower_v<MT4> )
2314 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
2315 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos: end of the SIMD part; with a remainder it is kend masked down in the
// same way as kbegin.
2317 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// One SIMD accumulator per element of the 2x4 tile.
2320 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2325 const SIMDType a2( A.load(i+1UL,k) );
2327 const SIMDType b2( B.load(k,j+1UL) );
2328 const SIMDType b3( B.load(k,j+2UL) );
2329 const SIMDType b4( B.load(k,j+3UL) );
2340 C(i ,j ) +=
sum( xmm1 );
2341 C(i ,j+1UL) +=
sum( xmm2 );
2342 C(i ,j+2UL) +=
sum( xmm3 );
2343 C(i ,j+3UL) +=
sum( xmm4 );
2344 C(i+1UL,j ) +=
sum( xmm5 );
2345 C(i+1UL,j+1UL) +=
sum( xmm6 );
2346 C(i+1UL,j+2UL) +=
sum( xmm7 );
2347 C(i+1UL,j+3UL) +=
sum( xmm8 );
// Scalar tail for the k values in [kpos,kend).
2349 for( ; remainder && k<kend; ++k ) {
2350 C(i ,j ) += A(i ,k) * B(k,j );
2351 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2352 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2353 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
2354 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2355 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2356 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2357 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile.
2361 for( ; (j+2UL) <= jend; j+=2UL )
2363 const size_t kbegin( ( IsUpper_v<MT4> )
2364 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2365 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2366 const size_t kend( ( IsLower_v<MT4> )
2367 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2368 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2370 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2378 const SIMDType a2( A.load(i+1UL,k) );
2380 const SIMDType b2( B.load(k,j+1UL) );
2387 C(i ,j ) +=
sum( xmm1 );
2388 C(i ,j+1UL) +=
sum( xmm2 );
2389 C(i+1UL,j ) +=
sum( xmm3 );
2390 C(i+1UL,j+1UL) +=
sum( xmm4 );
2392 for( ; remainder && k<kend; ++k ) {
2393 C(i ,j ) += A(i ,k) * B(k,j );
2394 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2395 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2396 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: two rows, single column j.
2402 const size_t kbegin( ( IsUpper_v<MT4> )
2403 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2404 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2405 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2407 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2415 xmm1 += A.load(i ,k) * b1;
2416 xmm2 += A.load(i+1UL,k) * b1;
2419 C(i ,j) +=
sum( xmm1 );
2420 C(i+1UL,j) +=
sum( xmm2 );
2422 for( ; remainder && k<kend; ++k ) {
2423 C(i ,j) += A(i ,k) * B(k,j);
2424 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single row of C ----
2431 const size_t jend(
LOW ? i+1UL : N );
2432 size_t j(
UPP ? i : 0UL );
// 1x4 tile; skipped entirely when both LOW and UPP are set.
2434 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
2436 const size_t kbegin( ( IsUpper_v<MT4> )
2437 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2438 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2439 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
2441 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2449 xmm1 += a1 * B.load(k,j );
2450 xmm2 += a1 * B.load(k,j+1UL);
2451 xmm3 += a1 * B.load(k,j+2UL);
2452 xmm4 += a1 * B.load(k,j+3UL);
2455 C(i,j ) +=
sum( xmm1 );
2456 C(i,j+1UL) +=
sum( xmm2 );
2457 C(i,j+2UL) +=
sum( xmm3 );
2458 C(i,j+3UL) +=
sum( xmm4 );
2460 for( ; remainder && k<kend; ++k ) {
2461 C(i,j ) += A(i,k) * B(k,j );
2462 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2463 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
2464 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tile.
2468 for( ; (j+2UL) <= jend; j+=2UL )
2470 const size_t kbegin( ( IsUpper_v<MT4> )
2471 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2472 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2473 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2475 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2483 xmm1 += a1 * B.load(k,j );
2484 xmm2 += a1 * B.load(k,j+1UL);
2487 C(i,j ) +=
sum( xmm1 );
2488 C(i,j+1UL) +=
sum( xmm2 );
2490 for( ; remainder && k<kend; ++k ) {
2491 C(i,j ) += A(i,k) * B(k,j );
2492 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range here always ends at K.
2498 const size_t kbegin( ( IsUpper_v<MT4> )
2499 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2500 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2502 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
2509 xmm1 += A.load(i,k) * B.load(k,j);
2512 C(i,j) +=
sum( xmm1 );
2514 for( ; remainder && k<K; ++k ) {
2515 C(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix addition assignment kernel for a column-major target C.
// C is processed in register tiles (4x2 and 4x1 when neither LOW nor UPP is set,
// then 2x2, 2x1, 1x2, 1x1). For each tile the products A(i,k) * B(k,j) are
// accumulated over k in SIMD registers, sum() of every accumulator is added to the
// matching element of C, and a scalar loop finishes the k values not covered by
// whole SIMD vectors.
2538 template<
typename MT3
2541 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2542 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed as soon as one of the operands is not padded.
2544 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
2546 const size_t M( A.rows() );
2547 const size_t N( B.columns() );
2548 const size_t K( A.columns() );
// ---- Four rows of C per iteration; only used when neither LOW nor UPP is set ----
2554 for( ; !
LOW && !
UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile.
2558 for( ; (j+2UL) <= N; j+=2UL )
// kbegin: first k that can contribute (k >= i for an upper A, k >= j for a lower
// B), rounded down by masking with size_t(-SIMDSIZE)
// (assumes SIMDSIZE is a power of two - TODO confirm).
2560 const size_t kbegin( ( IsUpper_v<MT4> )
2561 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2562 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// kend: one past the last k that can contribute (bounded by the tile's last row
// for a lower A and by the tile's last column for an upper B), K otherwise.
2563 const size_t kend( ( IsLower_v<MT4> )
2564 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
2565 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos: end of the SIMD part; with a remainder it is kend masked down in the
// same way as kbegin.
2567 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// One SIMD accumulator per element of the 4x2 tile.
2570 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2575 const SIMDType a2( A.load(i+1UL,k) );
2576 const SIMDType a3( A.load(i+2UL,k) );
2577 const SIMDType a4( A.load(i+3UL,k) );
2579 const SIMDType b2( B.load(k,j+1UL) );
2590 C(i ,j ) +=
sum( xmm1 );
2591 C(i ,j+1UL) +=
sum( xmm2 );
2592 C(i+1UL,j ) +=
sum( xmm3 );
2593 C(i+1UL,j+1UL) +=
sum( xmm4 );
2594 C(i+2UL,j ) +=
sum( xmm5 );
2595 C(i+2UL,j+1UL) +=
sum( xmm6 );
2596 C(i+3UL,j ) +=
sum( xmm7 );
2597 C(i+3UL,j+1UL) +=
sum( xmm8 );
// Scalar tail for the k values in [kpos,kend).
2599 for( ; remainder && k<kend; ++k ) {
2600 C(i ,j ) += A(i ,k) * B(k,j );
2601 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2602 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2603 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2604 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2605 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2606 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
2607 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: four rows, single column j.
2613 const size_t kbegin( ( IsUpper_v<MT4> )
2614 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2615 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2616 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
2618 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2626 xmm1 += A.load(i ,k) * b1;
2627 xmm2 += A.load(i+1UL,k) * b1;
2628 xmm3 += A.load(i+2UL,k) * b1;
2629 xmm4 += A.load(i+3UL,k) * b1;
2632 C(i ,j) +=
sum( xmm1 );
2633 C(i+1UL,j) +=
sum( xmm2 );
2634 C(i+2UL,j) +=
sum( xmm3 );
2635 C(i+3UL,j) +=
sum( xmm4 );
2637 for( ; remainder && k<kend; ++k ) {
2638 C(i ,j) += A(i ,k) * B(k,j);
2639 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2640 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2641 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
// ---- Two rows of C per iteration ----
2646 for( ; (i+2UL) <= M; i+=2UL )
// LOW limits the columns to those up to the end of the current row pair,
// UPP skips the columns before row index i.
2648 const size_t jend(
LOW ? i+2UL : N );
2649 size_t j(
UPP ? i : 0UL );
// 2x2 tile.
2651 for( ; (j+2UL) <= jend; j+=2UL )
2653 const size_t kbegin( ( IsUpper_v<MT4> )
2654 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2655 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2656 const size_t kend( ( IsLower_v<MT4> )
2657 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2658 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2660 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2668 const SIMDType a2( A.load(i+1UL,k) );
2670 const SIMDType b2( B.load(k,j+1UL) );
2677 C(i ,j ) +=
sum( xmm1 );
2678 C(i ,j+1UL) +=
sum( xmm2 );
2679 C(i+1UL,j ) +=
sum( xmm3 );
2680 C(i+1UL,j+1UL) +=
sum( xmm4 );
2682 for( ; remainder && k<kend; ++k ) {
2683 C(i ,j ) += A(i ,k) * B(k,j );
2684 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2685 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2686 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: two rows, single column j.
2692 const size_t kbegin( ( IsUpper_v<MT4> )
2693 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2694 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2695 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2697 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2705 xmm1 += A.load(i ,k) * b1;
2706 xmm2 += A.load(i+1UL,k) * b1;
2709 C(i ,j) +=
sum( xmm1 );
2710 C(i+1UL,j) +=
sum( xmm2 );
2712 for( ; remainder && k<kend; ++k ) {
2713 C(i ,j) += A(i ,k) * B(k,j);
2714 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single row of C ----
2721 const size_t jend(
LOW ? i+1UL : N );
2722 size_t j(
UPP ? i : 0UL );
// 1x2 tile.
2724 for( ; (j+2UL) <= jend; j+=2UL )
2726 const size_t kbegin( ( IsUpper_v<MT4> )
2727 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2728 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2729 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2731 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
2739 xmm1 += a1 * B.load(k,j );
2740 xmm2 += a1 * B.load(k,j+1UL);
2743 C(i,j ) +=
sum( xmm1 );
2744 C(i,j+1UL) +=
sum( xmm2 );
2746 for( ; remainder && k<kend; ++k ) {
2747 C(i,j ) += A(i,k) * B(k,j );
2748 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range here always ends at K.
2754 const size_t kbegin( ( IsUpper_v<MT4> )
2755 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
2756 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
2758 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
2765 xmm1 += A.load(i,k) * B.load(k,j);
2768 C(i,j) +=
sum( xmm1 );
2770 for( ; remainder && k<K; ++k ) {
2771 C(i,j) += A(i,k) * B(k,j);
// Large-matrix addition assignment kernel for type combinations that cannot use
// the vectorized default kernel (UseVectorizedDefaultKernel_v is false).
// Simply forwards to the default addition assignment kernel.
2793 template<
typename MT3
2796 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2797 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2799 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition assignment kernel for type combinations that can use the
// vectorized default kernel (UseVectorizedDefaultKernel_v is true).
// C is the target matrix, A and B are the left and right multiplication operands.
2819 template<
typename MT3
2822 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2823 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS-level addition assignment kernel for type combinations that cannot use a
// BLAS kernel (UseBlasKernel_v is false).
// Falls back to the large-matrix addition assignment kernel.
2849 template<
typename MT3
2852 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2853 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2855 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition assignment kernel, only compiled when BLAS mode and the
// BLAS matrix/matrix multiplication are both enabled, and only selected for type
// combinations for which UseBlasKernel_v is true.
2861 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 2875 template<
typename MT3
2878 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2879 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2881 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, apply trmm with A as the left-hand
// triangular factor (lower or upper according to A's type) and scalar ET(1),
// then add the temporary to C.
2883 if( IsTriangular_v<MT4> ) {
2884 ResultType_t<MT3> tmp(
serial( B ) );
2885 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2886 addAssign( C, tmp );
// Triangular B: same scheme with A copied into the temporary and B applied as
// the right-hand triangular factor.
2888 else if( IsTriangular_v<MT5> ) {
2889 ResultType_t<MT3> tmp(
serial( A ) );
2890 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2891 addAssign( C, tmp );
// General case: a single gemm call with both scalar arguments set to ET(1)
// (presumably the factor of the product and the factor of the existing C - confirm
// against the gemm wrapper).
2894 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of this multiplication expression (rhs) to the dense
// matrix lhs. The work is delegated to selectSubAssignKernel.
2918 template<
typename MT
2920 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Special case: the target has no rows or no columns, or the inner dimension of
// the product is zero (presumably an early return - confirm).
2927 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch on the operands A and B to the matching kernel.
2941 DMatTDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for the subtraction assignment C -= A * B.
// The small kernel is used when either operand is diagonal or when the number of
// elements of C is below DMATTDMATMULT_THRESHOLD; the BLAS kernel path is the
// alternative.
2957 template<
typename MT3
2960 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2962 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
2963 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2964 selectSmallSubAssignKernel( C, A, B );
2966 selectBlasSubAssignKernel( C, A, B );
// Default subtraction assignment kernel, enabled when the target C is row-major
// and neither operand is diagonal. Computes C(i,j) -= A(i,k) * B(k,j) with the
// i, j and k ranges narrowed according to the triangular structure of A and B
// and the LOW/UPP flags of the expression.
2985 template<
typename MT3
2988 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2989 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2991 const size_t M( A.rows() );
2992 const size_t N( B.columns() );
2993 const size_t K( A.columns() );
// Row range: a strictly lower A skips the first row (the first two when B is
// strictly lower as well and M > 1); a strictly upper A skips the last row (the
// last two when B is strictly upper as well and M > 1).
2997 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
2998 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
3000 const size_t iend( ( IsStrictlyUpper_v<MT4> )
3001 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
3005 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i. If both operands are upper, the first column is i
// shifted by one for each strictly upper operand; otherwise UPP and a strictly
// upper B determine the start.
3007 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3008 ?( ( IsStrictlyUpper_v<MT4> )
3009 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
3010 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
3011 :( ( IsStrictlyUpper_v<MT5> )
3012 ?(
UPP ?
max( i, 1UL ) : 1UL )
3013 :(
UPP ? i : 0UL ) ) );
// If both operands are lower, the end column is i+1 reduced by one for each
// strictly lower operand; otherwise LOW and a strictly lower B determine the end.
3014 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
3015 ?( ( IsStrictlyLower_v<MT4> )
3016 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
3017 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
3018 :( ( IsStrictlyLower_v<MT5> )
3019 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
3020 :(
LOW ? i+1UL : N ) ) );
// With LOW or UPP set the column range may be inverted; such rows are skipped.
3022 if( (
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
3025 for(
size_t j=jbegin; j<jend; ++j )
// k range of element (i,j): an upper A needs k >= i (k > i when strictly upper),
// a lower B needs k >= j (k > j when strictly lower).
3027 const size_t kbegin( ( IsUpper_v<MT4> )
3028 ?( ( IsLower_v<MT5> )
3029 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3030 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3031 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3032 :( ( IsLower_v<MT5> )
3033 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// A lower A needs k <= i (k < i when strictly lower), an upper B needs k <= j
// (k < j when strictly upper); kend is one past the last such k.
3035 const size_t kend( ( IsLower_v<MT4> )
3036 ?( ( IsUpper_v<MT5> )
3037 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3038 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3039 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3040 :( ( IsUpper_v<MT5> )
3041 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// kpos is kbegin plus the k count rounded down to an even number
// (masking with size_t(-2) clears the lowest bit).
3045 const size_t knum( kend - kbegin );
3046 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two terms of the inner product per iteration.
3048 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3049 C(i,j) -= A(i,k ) * B(k ,j);
3050 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at index kpos.
// NOTE(review): expected to run only when the k count is odd - confirm the guard.
3053 C(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction assignment kernel, enabled when the target C is
// column-major and neither operand is diagonal. Computes
// C(i,j) -= A(i,k) * B(k,j) column by column with the j, i and k ranges narrowed
// according to the triangular structure of A and B and the LOW/UPP flags of the
// expression.
3075 template<
typename MT3
3078 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3079 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3081 const size_t M( A.rows() );
3082 const size_t N( B.columns() );
3083 const size_t K( A.columns() );
// Column range: a strictly upper B skips the first column (the first two when A
// is strictly upper as well and N > 1); a strictly lower B skips the last column
// (the last two when A is strictly lower as well and N > 1).
3087 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
3088 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
3090 const size_t jend( ( IsStrictlyLower_v<MT5> )
3091 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
3095 for(
size_t j=jbegin; j<jend; ++j )
// Row range of column j. If both operands are lower, the first row is j shifted
// by one for each strictly lower operand; otherwise LOW and a strictly lower A
// determine the start.
3097 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
3098 ?( ( IsStrictlyLower_v<MT4> )
3099 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
3100 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3101 :( ( IsStrictlyLower_v<MT4> )
3102 ?(
LOW ?
max( j, 1UL ) : 1UL )
3103 :(
LOW ? j : 0UL ) ) );
// If both operands are upper, the end row is j+1 reduced by one for each
// strictly upper operand; otherwise UPP and a strictly upper A determine the end.
3104 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3105 ?( ( IsStrictlyUpper_v<MT4> )
3106 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
3107 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
3108 :( ( IsStrictlyUpper_v<MT4> )
3109 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
3110 :(
UPP ? j+1UL : M ) ) );
// With LOW or UPP set the row range may be inverted; such columns are skipped.
3112 if( (
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
3115 for(
size_t i=ibegin; i<iend; ++i )
// k range of element (i,j): an upper A needs k >= i (k > i when strictly upper),
// a lower B needs k >= j (k > j when strictly lower).
3117 const size_t kbegin( ( IsUpper_v<MT4> )
3118 ?( ( IsLower_v<MT5> )
3119 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3120 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3121 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3122 :( ( IsLower_v<MT5> )
3123 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// A lower A needs k <= i (k < i when strictly lower), an upper B needs k <= j
// (k < j when strictly upper); kend is one past the last such k.
3125 const size_t kend( ( IsLower_v<MT4> )
3126 ?( ( IsUpper_v<MT5> )
3127 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3128 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3129 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3130 :( ( IsUpper_v<MT5> )
3131 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// kpos is kbegin plus the k count rounded down to an even number
// (masking with size_t(-2) clears the lowest bit).
3135 const size_t knum( kend - kbegin );
3136 const size_t kpos( kbegin + ( knum &
size_t(-2) ) );
// Two terms of the inner product per iteration.
3138 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3139 C(i,j) -= A(i,k ) * B(k ,j);
3140 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at index kpos.
// NOTE(review): expected to run only when the k count is odd - confirm the guard.
3143 C(i,j) -= A(i,kpos) * B(kpos,j);
// Default subtraction assignment kernel, enabled when the target C is row-major,
// the left operand A is not diagonal and the right operand B is diagonal.
// Walks C row by row and updates C(i,j) -= A(i,j) * B(j,j), two columns per
// loop iteration.
3165 template<
typename MT3
3168 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3169 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3171 const size_t M( A.rows() );
3172 const size_t N( B.columns() );
3174 for(
size_t i=0UL; i<M; ++i )
// Column range of row i restricted by the triangular structure of A: an upper A
// starts at the diagonal (one past it when strictly upper), a lower A ends at the
// diagonal (exclusive of it when strictly lower).
3176 const size_t jbegin( ( IsUpper_v<MT4> )
3177 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3179 const size_t jend( ( IsLower_v<MT4> )
3180 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos is jbegin plus the column count rounded down to an even number
// (masking with size_t(-2) clears the lowest bit).
3184 const size_t jnum( jend - jbegin );
3185 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3187 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3188 C(i,j ) -= A(i,j ) * B(j ,j );
3189 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Update for the single leftover column at jpos.
// NOTE(review): expected to run only when the column count is odd - confirm the guard.
3192 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction assignment kernel, enabled when the target C is
// column-major, the left operand A is not diagonal and the right operand B is
// diagonal. Only the diagonal element B(j,j) is read, so each update is
// C(i,j) -= A(i,j) * B(j,j).
3213 template<
typename MT3
3216 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3217 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Edge length of the square tiles used for the blocked traversal below.
3219 constexpr
size_t block( BLOCK_SIZE );
3221 const size_t M( A.rows() );
3222 const size_t N( B.columns() );
// Outer loops walk the result in block x block tiles (columns first, then rows).
3224 for(
size_t jj=0UL; jj<N; jj+=block ) {
3225 const size_t jend(
min( N, jj+block ) );
3226 for(
size_t ii=0UL; ii<M; ii+=block ) {
3227 const size_t iend(
min( M, ii+block ) );
3228 for(
size_t j=jj; j<jend; ++j )
// For a lower A the first row is the diagonal (or the row below it when A is
// strictly lower), but never before the start of the current tile.
3230 const size_t ibegin( ( IsLower_v<MT4> )
3231 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
// For an upper A the row range ends at the diagonal (exclusive of it when A is
// strictly upper), but never beyond the end of the current tile.
3233 const size_t ipos( ( IsUpper_v<MT4> )
3234 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
3237 for(
size_t i=ibegin; i<ipos; ++i ) {
3238 C(i,j) -= A(i,j) * B(j,j);
// Default subtraction assignment kernel, enabled when the target C is row-major,
// the left operand A is diagonal and the right operand B is not diagonal.
// Only the diagonal element A(i,i) is read, so each update is
// C(i,j) -= A(i,i) * B(i,j).
3261 template<
typename MT3
3264 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3265 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Edge length of the square tiles used for the blocked traversal below.
3267 constexpr
size_t block( BLOCK_SIZE );
3269 const size_t M( A.rows() );
3270 const size_t N( B.columns() );
// Outer loops walk the result in block x block tiles (rows first, then columns).
3272 for(
size_t ii=0UL; ii<M; ii+=block ) {
3273 const size_t iend(
min( M, ii+block ) );
3274 for(
size_t jj=0UL; jj<N; jj+=block ) {
3275 const size_t jend(
min( N, jj+block ) );
3276 for(
size_t i=ii; i<iend; ++i )
// For an upper B the first column is the diagonal (or the one after it when B is
// strictly upper), but never before the start of the current tile.
3278 const size_t jbegin( ( IsUpper_v<MT5> )
3279 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
// For a lower B the column range ends at the diagonal (exclusive of it when B is
// strictly lower), but never beyond the end of the current tile.
3281 const size_t jpos( ( IsLower_v<MT5> )
3282 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
3285 for(
size_t j=jbegin; j<jpos; ++j ) {
3286 C(i,j) -= A(i,i) * B(i,j);
// Default subtraction assignment kernel, enabled when the target C is
// column-major, the left operand A is diagonal and the right operand B is not
// diagonal. Walks C column by column and updates C(i,j) -= A(i,i) * B(i,j), two
// rows per loop iteration.
3309 template<
typename MT3
3312 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3313 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3315 const size_t M( A.rows() );
3316 const size_t N( B.columns() );
3318 for(
size_t j=0UL; j<N; ++j )
// Row range of column j restricted by the triangular structure of B: a lower B
// starts at the diagonal (one below when strictly lower), an upper B ends at the
// diagonal (exclusive of it when strictly upper).
3320 const size_t ibegin( ( IsLower_v<MT5> )
3321 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3323 const size_t iend( ( IsUpper_v<MT5> )
3324 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos is ibegin plus the row count rounded down to an even number
// (masking with size_t(-2) clears the lowest bit).
3328 const size_t inum( iend - ibegin );
3329 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
3331 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3332 C(i ,j) -= A(i ,i ) * B(i ,j);
3333 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Update for the single leftover row at ipos.
// NOTE(review): expected to run only when the row count is odd - confirm the guard.
3336 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction assignment kernel, enabled when both operands are diagonal.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) for every row of A.
3357 template<
typename MT3
3360 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3361 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3363 for(
size_t i=0UL; i<A.rows(); ++i ) {
3364 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction assignment kernel for type combinations that cannot
// use the vectorized default kernel (UseVectorizedDefaultKernel_v is false).
// Simply forwards to the default subtraction assignment kernel.
3384 template<
typename MT3
3387 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3388 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3390 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction assignment kernel for a row-major target C.
// C is processed in register tiles (2x4, 2x2, 2x1, then 1x4, 1x2, 1x1). For each
// tile the products A(i,k) * B(k,j) are accumulated over k in SIMD registers,
// sum() of every accumulator is subtracted from the matching element of C, and a
// scalar loop finishes the k values not covered by whole SIMD vectors.
3410 template<
typename MT3
3413 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3414 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed as soon as one of the operands is not padded.
3416 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3418 const size_t M( A.rows() );
3419 const size_t N( B.columns() );
3420 const size_t K( A.columns() );
// ---- Two rows of C per iteration ----
3426 for( ; (i+2UL) <= M; i+=2UL )
// LOW limits the columns to those up to the end of the current row pair,
// UPP skips the columns before row index i.
3428 const size_t jend(
LOW ? i+2UL : N );
3429 size_t j(
UPP ? i : 0UL );
// 2x4 tile; skipped entirely when both LOW and UPP are set.
3431 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
// kbegin: first k that can contribute (k >= i for an upper A, k >= j for a lower
// B), rounded down by masking with size_t(-SIMDSIZE)
// (assumes SIMDSIZE is a power of two - TODO confirm).
3433 const size_t kbegin( ( IsUpper_v<MT4> )
3434 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3435 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// kend: one past the last k that can contribute (bounded by the tile's last row
// for a lower A and by the tile's last column for an upper B), K otherwise.
3436 const size_t kend( ( IsLower_v<MT4> )
3437 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
3438 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos: end of the SIMD part; with a remainder it is kend masked down in the
// same way as kbegin.
3440 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// One SIMD accumulator per element of the 2x4 tile.
3443 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3448 const SIMDType a2( A.load(i+1UL,k) );
3450 const SIMDType b2( B.load(k,j+1UL) );
3451 const SIMDType b3( B.load(k,j+2UL) );
3452 const SIMDType b4( B.load(k,j+3UL) );
3463 C(i ,j ) -=
sum( xmm1 );
3464 C(i ,j+1UL) -=
sum( xmm2 );
3465 C(i ,j+2UL) -=
sum( xmm3 );
3466 C(i ,j+3UL) -=
sum( xmm4 );
3467 C(i+1UL,j ) -=
sum( xmm5 );
3468 C(i+1UL,j+1UL) -=
sum( xmm6 );
3469 C(i+1UL,j+2UL) -=
sum( xmm7 );
3470 C(i+1UL,j+3UL) -=
sum( xmm8 );
// Scalar tail for the k values in [kpos,kend).
3472 for( ; remainder && k<kend; ++k ) {
3473 C(i ,j ) -= A(i ,k) * B(k,j );
3474 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3475 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
3476 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
3477 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3478 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3479 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
3480 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2x2 tile.
3484 for( ; (j+2UL) <= jend; j+=2UL )
3486 const size_t kbegin( ( IsUpper_v<MT4> )
3487 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3488 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3489 const size_t kend( ( IsLower_v<MT4> )
3490 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3491 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3493 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3501 const SIMDType a2( A.load(i+1UL,k) );
3503 const SIMDType b2( B.load(k,j+1UL) );
3510 C(i ,j ) -=
sum( xmm1 );
3511 C(i ,j+1UL) -=
sum( xmm2 );
3512 C(i+1UL,j ) -=
sum( xmm3 );
3513 C(i+1UL,j+1UL) -=
sum( xmm4 );
3515 for( ; remainder && k<kend; ++k ) {
3516 C(i ,j ) -= A(i ,k) * B(k,j );
3517 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3518 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3519 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: two rows, single column j.
3525 const size_t kbegin( ( IsUpper_v<MT4> )
3526 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3527 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3528 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3530 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// The accumulators collect the products with a positive sign; the subtraction
// happens when they are folded into C below.
3538 xmm1 += A.load(i ,k) * b1;
3539 xmm2 += A.load(i+1UL,k) * b1;
3542 C(i ,j) -=
sum( xmm1 );
3543 C(i+1UL,j) -=
sum( xmm2 );
3545 for( ; remainder && k<kend; ++k ) {
3546 C(i ,j) -= A(i ,k) * B(k,j);
3547 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Single row of C ----
3554 const size_t jend(
LOW ? i+1UL : N );
3555 size_t j(
UPP ? i : 0UL );
// 1x4 tile; skipped entirely when both LOW and UPP are set.
3557 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
3559 const size_t kbegin( ( IsUpper_v<MT4> )
3560 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3561 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3562 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
3564 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3572 xmm1 += a1 * B.load(k,j );
3573 xmm2 += a1 * B.load(k,j+1UL);
3574 xmm3 += a1 * B.load(k,j+2UL);
3575 xmm4 += a1 * B.load(k,j+3UL);
3578 C(i,j ) -=
sum( xmm1 );
3579 C(i,j+1UL) -=
sum( xmm2 );
3580 C(i,j+2UL) -=
sum( xmm3 );
3581 C(i,j+3UL) -=
sum( xmm4 );
3583 for( ; remainder && k<kend; ++k ) {
3584 C(i,j ) -= A(i,k) * B(k,j );
3585 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
3586 C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
3587 C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1x2 tile.
3591 for( ; (j+2UL) <= jend; j+=2UL )
3593 const size_t kbegin( ( IsUpper_v<MT4> )
3594 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3595 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3596 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3598 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3606 xmm1 += a1 * B.load(k,j );
3607 xmm2 += a1 * B.load(k,j+1UL);
3610 C(i,j ) -=
sum( xmm1 );
3611 C(i,j+1UL) -=
sum( xmm2 );
3613 for( ; remainder && k<kend; ++k ) {
3614 C(i,j ) -= A(i,k) * B(k,j );
3615 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range here always ends at K.
3621 const size_t kbegin( ( IsUpper_v<MT4> )
3622 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3623 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3625 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
3632 xmm1 += A.load(i,k) * B.load(k,j);
3635 C(i,j) -=
sum( xmm1 );
3637 for( ; remainder && k<K; ++k ) {
3638 C(i,j) -= A(i,k) * B(k,j);
// Vectorized small-matrix subtraction assignment kernel for a column-major
// target C. C is processed in register tiles (4x2 and 4x1 when neither LOW nor
// UPP is set, then 2x2, 2x1, 1x2, 1x1). For each tile the products
// A(i,k) * B(k,j) are accumulated over k in SIMD registers, sum() of every
// accumulator is subtracted from the matching element of C, and a scalar loop
// finishes the k values not covered by whole SIMD vectors.
3661 template<
typename MT3
3664 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3665 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder loop is needed as soon as one of the operands is not padded.
3667 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
3669 const size_t M( A.rows() );
3670 const size_t N( B.columns() );
3671 const size_t K( A.columns() );
// ---- Four rows of C per iteration; only used when neither LOW nor UPP is set ----
3677 for( ; !
LOW && !
UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile.
3681 for( ; (j+2UL) <= N; j+=2UL )
// kbegin: first k that can contribute (k >= i for an upper A, k >= j for a lower
// B), rounded down by masking with size_t(-SIMDSIZE)
// (assumes SIMDSIZE is a power of two - TODO confirm).
3683 const size_t kbegin( ( IsUpper_v<MT4> )
3684 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3685 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
// kend: one past the last k that can contribute (bounded by the tile's last row
// for a lower A and by the tile's last column for an upper B), K otherwise.
3686 const size_t kend( ( IsLower_v<MT4> )
3687 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
3688 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos: end of the SIMD part; with a remainder it is kend masked down in the
// same way as kbegin.
3690 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// One SIMD accumulator per element of the 4x2 tile.
3693 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3698 const SIMDType a2( A.load(i+1UL,k) );
3699 const SIMDType a3( A.load(i+2UL,k) );
3700 const SIMDType a4( A.load(i+3UL,k) );
3702 const SIMDType b2( B.load(k,j+1UL) );
3713 C(i ,j ) -=
sum( xmm1 );
3714 C(i ,j+1UL) -=
sum( xmm2 );
3715 C(i+1UL,j ) -=
sum( xmm3 );
3716 C(i+1UL,j+1UL) -=
sum( xmm4 );
3717 C(i+2UL,j ) -=
sum( xmm5 );
3718 C(i+2UL,j+1UL) -=
sum( xmm6 );
3719 C(i+3UL,j ) -=
sum( xmm7 );
3720 C(i+3UL,j+1UL) -=
sum( xmm8 );
// Scalar tail for the k values in [kpos,kend).
3722 for( ; remainder && k<kend; ++k ) {
3723 C(i ,j ) -= A(i ,k) * B(k,j );
3724 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3725 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3726 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
3727 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3728 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
3729 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
3730 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1 tile: four rows, single column j.
3736 const size_t kbegin( ( IsUpper_v<MT4> )
3737 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3738 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3739 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
3741 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
// The accumulators collect the products with a positive sign; the subtraction
// happens when they are folded into C below.
3749 xmm1 += A.load(i ,k) * b1;
3750 xmm2 += A.load(i+1UL,k) * b1;
3751 xmm3 += A.load(i+2UL,k) * b1;
3752 xmm4 += A.load(i+3UL,k) * b1;
3755 C(i ,j) -=
sum( xmm1 );
3756 C(i+1UL,j) -=
sum( xmm2 );
3757 C(i+2UL,j) -=
sum( xmm3 );
3758 C(i+3UL,j) -=
sum( xmm4 );
3760 for( ; remainder && k<kend; ++k ) {
3761 C(i ,j ) -= A(i ,k) * B(k,j );
3762 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3763 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
3764 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
// ---- Two rows of C per iteration ----
3769 for( ; (i+2UL) <= M; i+=2UL )
// LOW limits the columns to those up to the end of the current row pair,
// UPP skips the columns before row index i.
3771 const size_t jend(
LOW ? i+2UL : N );
3772 size_t j(
UPP ? i : 0UL );
// 2x2 tile.
3774 for( ; (j+2UL) <= jend; j+=2UL )
3776 const size_t kbegin( ( IsUpper_v<MT4> )
3777 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3778 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3779 const size_t kend( ( IsLower_v<MT4> )
3780 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3781 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3783 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3791 const SIMDType a2( A.load(i+1UL,k) );
3793 const SIMDType b2( B.load(k,j+1UL) );
3800 C(i ,j ) -=
sum( xmm1 );
3801 C(i ,j+1UL) -=
sum( xmm2 );
3802 C(i+1UL,j ) -=
sum( xmm3 );
3803 C(i+1UL,j+1UL) -=
sum( xmm4 );
3805 for( ; remainder && k<kend; ++k ) {
3806 C(i ,j ) -= A(i ,k) * B(k,j );
3807 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
3808 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
3809 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1 tile: two rows, single column j.
3815 const size_t kbegin( ( IsUpper_v<MT4> )
3816 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3817 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3818 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3820 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3828 xmm1 += A.load(i ,k) * b1;
3829 xmm2 += A.load(i+1UL,k) * b1;
3832 C(i ,j) -=
sum( xmm1 );
3833 C(i+1UL,j) -=
sum( xmm2 );
3835 for( ; remainder && k<kend; ++k ) {
3836 C(i ,j) -= A(i ,k) * B(k,j);
3837 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// ---- Single row of C ----
3844 const size_t jend(
LOW ? i+1UL : N );
3845 size_t j(
UPP ? i : 0UL );
// 1x2 tile.
3847 for( ; (j+2UL) <= jend; j+=2UL )
3849 const size_t kbegin( ( IsUpper_v<MT4> )
3850 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3851 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3852 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3854 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
3862 xmm1 += a1 * B.load(k,j );
3863 xmm2 += a1 * B.load(k,j+1UL);
3866 C(i,j ) -=
sum( xmm1 );
3867 C(i,j+1UL) -=
sum( xmm2 );
3869 for( ; remainder && k<kend; ++k ) {
3870 C(i,j ) -= A(i,k) * B(k,j );
3871 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1 tile: single element C(i,j); the k range here always ends at K.
3877 const size_t kbegin( ( IsUpper_v<MT4> )
3878 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
3879 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
3881 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
3888 xmm1 += A.load(i,k) * B.load(k,j);
3891 C(i,j) -=
sum( xmm1 );
3893 for( ; remainder && k<K; ++k ) {
3894 C(i,j) -= A(i,k) * B(k,j);
// Large-matrix subtraction assignment kernel for type combinations that cannot
// use the vectorized default kernel (UseVectorizedDefaultKernel_v is false).
// Simply forwards to the default subtraction assignment kernel.
3916 template<
typename MT3
3919 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3920 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3922 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction assignment kernel for type combinations that can use
// the vectorized default kernel (UseVectorizedDefaultKernel_v is true).
// C is the target matrix, A and B are the left and right multiplication operands.
3942 template<
typename MT3
3945 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3946 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS-level subtraction assignment kernel for type combinations that cannot use
// a BLAS kernel (UseBlasKernel_v is false).
// Falls back to the large-matrix subtraction assignment kernel.
3972 template<
typename MT3
3975 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3976 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3978 selectLargeSubAssignKernel( C, A, B );
// BLAS-backed overload of selectBlasSubAssignKernel, enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3998 template<
typename MT3
4001 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4002 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4004 using ET = ElementType_t<MT3>;
// Triangular left operand: build a temporary from serial( B ), call trmm on it
// with A on the left side (lower/upper chosen from IsLower_v<MT4>) and a
// factor of ET(1), then subtract the temporary from C via subAssign.
4006 if( IsTriangular_v<MT4> ) {
4007 ResultType_t<MT3> tmp(
serial( B ) );
4008 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4009 subAssign( C, tmp );
// Triangular right operand: same scheme with the temporary built from
// serial( A ) and B passed to trmm on the right side.
4011 else if( IsTriangular_v<MT5> ) {
4012 ResultType_t<MT3> tmp(
serial( A ) );
4013 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4014 subAssign( C, tmp );
// General case: gemm is called with the factors ET(-1) and ET(1)
// (presumably alpha and beta, i.e. C = C - A*B -- confirm against the gemm wrapper).
4017 gemm( C, A, B, ET(-1), ET(1) );
// Schur-product assignment of a DMatTDMatMultExpr to a dense matrix.
// The visible code forwards to schurAssign( ~lhs, tmp ).
// NOTE(review): the construction of 'tmp' is not visible in this excerpt.
4041 template<
typename MT
4043 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
4055 schurAssign( ~lhs, tmp );
// Assignment-style function that is enabled only when
// IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the function name/signature and the branch bodies are not
// visible in this excerpt.
4087 template<
typename MT
4090 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// First special case: the target has no rows or no columns.
4097 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
// Second special case: the left operand has zero columns (empty inner dimension).
4100 else if( rhs.lhs_.columns() == 0UL ) {
// Assignment-style function that is enabled only when
// IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the function name/signature is not visible in this excerpt.
4135 template<
typename MT
4138 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Temporary type: OppositeType when SO is true, ResultType otherwise.
4142 using TmpType = If_t< SO, OppositeType, ResultType >;
4154 const ForwardFunctor fwd;
// The expression is evaluated into a temporary of type TmpType.
4156 const TmpType tmp( rhs );
// Assignment-style function that is enabled only when
// IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the function name/signature and the branch body are not
// visible in this excerpt.
4178 template<
typename MT
4181 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Special case: the target is empty or the inner dimension (columns of the
// left operand) is zero.
4188 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Assignment-style function that is enabled only when
// IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the function name/signature and the branch body are not
// visible in this excerpt.
4227 template<
typename MT
4230 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Special case: the target is empty or the inner dimension (columns of the
// left operand) is zero.
4237 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4273 template<
typename MT
// Partial specialization of DMatScalarMultExpr for the case that the scaled
// matrix is a DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, with scalar type ST and
// the third template argument fixed to false. It derives publicly from
// MatScalarMultExpr< DenseMatrix<...,false> > and privately from Computation.
// NOTE(review): the remaining template parameters are not visible in this excerpt.
4333 template<
typename MT1
4340 class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4341 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4342 ,
private Computation
// Shorthand for the wrapped matrix/matrix multiplication expression type.
4347 using MMM = DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Result type of the multiplication expression.
4349 using RES = ResultType_t<MMM>;
// Result types of the left (MT1) and right (MT2) operand.
4350 using RT1 = ResultType_t<MT1>;
4351 using RT2 = ResultType_t<MT2>;
// Element types of the two operand result types.
4352 using ET1 = ElementType_t<RT1>;
4353 using ET2 = ElementType_t<RT2>;
// Composite types of the two operands.
4354 using CT1 = CompositeType_t<MT1>;
4355 using CT2 = CompositeType_t<MT2>;
// True when the left operand MT1 is a computation or requires evaluation.
4360 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// True when the right operand MT2 is a computation or requires evaluation.
4365 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structural flags derived from the template flags SF, HF, LF and UF:
//  - SYM  is set only when SF is set and none of HF, LF, UF is set.
//  - HERM is set only when HF is set and neither LF nor UF is set.
//  - LOW  is set when LF is set, or when SF/HF is combined with UF.
//  - UPP  is set when UF is set, or when SF/HF is combined with LF.
// Consequently, combining SF or HF with LF or UF sets both LOW and UPP.
4369 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
4370 static constexpr
bool HERM = ( HF && !( LF || UF ) );
4371 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4372 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Helper variable template: true when at least one of the two operands has to
// be evaluated (evaluateLeft || evaluateRight). The template parameters
// T1, T2 and T3 do not appear in the visible initializer.
4380 template<
typename T1,
typename T2,
typename T3 >
4381 static constexpr
bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
// Helper variable template deciding whether the BLAS kernel is used for the
// type combination T1 (target), T2, T3 (operands) and T4 (scalar).
// NOTE(review): the opening of the initializer is not visible in this excerpt;
// the comments below cover only the visible conjuncts.
4388 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4389 static constexpr
bool UseBlasKernel_v =
// None of the structural flags may be set.
4391 !SYM && !HERM && !LOW && !UPP &&
// The target must be contiguous with mutable data access; both operands must
// be contiguous with const data access.
4392 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4393 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4394 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
// Neither operand may be diagonal and all three types must be SIMD enabled.
4395 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4396 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All element types must be BLAS compatible and identical.
4397 IsBLASCompatible_v< ElementType_t<T1> > &&
4398 IsBLASCompatible_v< ElementType_t<T2> > &&
4399 IsBLASCompatible_v< ElementType_t<T3> > &&
4400 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4401 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
// A builtin element type combined with a complex scalar type T4 is excluded.
4402 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Helper variable template deciding whether the vectorized default kernel is
// used for the type combination T1 (target), T2, T3 (operands), T4 (scalar).
// NOTE(review): part of the IsSIMDCombinable_v condition is not visible in
// this excerpt.
4409 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4410 static constexpr
bool UseVectorizedDefaultKernel_v =
// Requires useOptimizedKernels, non-diagonal operands and SIMD-enabled types.
4411 ( useOptimizedKernels &&
4412 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4413 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4414 IsSIMDCombinable_v< ElementType_t<T1>
// SIMD addition and multiplication must be available for the operand element types.
4418 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4419 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Functor type selected via If_t, starting with a test on HERM.
// NOTE(review): the remaining alternatives are not visible in this excerpt.
4426 using ForwardFunctor =
If_t< HERM
// Type of this DMatScalarMultExpr instance.
4442 using This = DMatScalarMultExpr<MMM,ST,false>;
// Base type of this instance.
4445 using BaseType = DenseMatrix<This,false>;
// Fragments of a type selection among the Decl*Trait / MultTrait variants of
// MultTrait_t<RES,ST>.
// NOTE(review): the alias name and the selecting conditions are not visible
// in this excerpt.
4449 , DeclHermTrait< MultTrait_t<RES,ST> >
4451 , DeclSymTrait< MultTrait_t<RES,ST> >
4454 , DeclDiagTrait< MultTrait_t<RES,ST> >
4455 , DeclLowTrait< MultTrait_t<RES,ST> > >
4457 , DeclUppTrait< MultTrait_t<RES,ST> >
4458 , MultTrait<RES,ST> > > > >::Type;
// SIMD type corresponding to ElementType.
4463 using SIMDType = SIMDTrait_t<ElementType>;
// Type of the left-hand side operand: the wrapped multiplication expression.
4468 using LeftOperand =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Operand types used inside the kernels: the (const) result type when the
// operand must be evaluated, its composite type otherwise.
4474 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4477 using RT = If_t< evaluateRight, const RT2, CT2 >;
// Compile-time condition: both operands are non-diagonal and SIMD enabled,
// ET1, ET2 and ST are SIMD combinable, and SIMD addition and multiplication
// exist for ET1/ET2.
// NOTE(review): the name of the constant being initialized is not visible in
// this excerpt.
4483 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
4484 MT1::simdEnabled && MT2::simdEnabled &&
4485 IsSIMDCombinable_v<ET1,ET2,ST> &&
4486 HasSIMDAdd_v<ET1,ET2> &&
4487 HasSIMDMult_v<ET1,ET2> );
// Fragment of an element access function: the column index j is compared
// against matrix_.columns() before the element is returned via (*this)(i,j).
// NOTE(review): the function signature and the action taken for an
// out-of-range index are not visible in this excerpt.
4537 if( j >=
matrix_.columns() ) {
4540 return (*
this)(i,j);
// Accessor returning a size_t row count.
// NOTE(review): the body is not visible in this excerpt.
4549 inline size_t rows()
const {
// Accessor returning a size_t column count.
// NOTE(review): the body is not visible in this excerpt.
4559 inline size_t columns()
const {
// Forwards the aliasing query for the given address to the wrapped
// expression: returns matrix_.canAlias( alias ).
4590 template<
typename T >
4591 inline bool canAlias(
const T* alias )
const {
4592 return matrix_.canAlias( alias );
// Forwards the aliasing query for the given address to the wrapped
// expression: returns matrix_.isAliased( alias ).
4602 template<
typename T >
4603 inline bool isAliased(
const T* alias )
const {
4604 return matrix_.isAliased( alias );
// Fragment of a boolean expression combining the BLAS configuration with two
// size tests: rows()*columns() below DMATTDMATMULT_THRESHOLD, and
// rows()*columns() at least SMP_DMATTDMATMULT_THRESHOLD.
// NOTE(review): the enclosing function and the leading part of the expression
// are not visible in this excerpt.
4625 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
4627 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
4628 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
// Assignment of the scaled multiplication expression to a target matrix.
// NOTE(review): the function signature, the branch bodies and the creation of
// A and B are not visible in this excerpt.
4650 template<
typename MT
// Fetch the two operands of the wrapped multiplication expression.
4659 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
4660 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special cases: empty target, or empty inner dimension.
4662 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4665 else if( left.columns() == 0UL ) {
// Dispatch to the kernel selection with the scalar of the expression.
4680 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the assignment C = A * B * scalar.
// The small kernel is chosen when either operand is diagonal or when
// C.rows() * C.columns() is below DMATTDMATMULT_THRESHOLD; the BLAS kernel
// selection is the second alternative (presumably the else branch, whose
// 'else' line is not visible in this excerpt).
4695 template<
typename MT3
4699 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4701 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
4702 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4703 selectSmallAssignKernel( C, A, B, scalar );
4705 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) assignment kernel, enabled for a row-major target
// C when neither A nor B is diagonal.
// NOTE(review): several lines of this definition (loop bodies, closing
// branches, the use of 'scalar') are not visible in this excerpt; the comments
// describe only the visible statements.
4723 template<
typename MT3
4727 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4728 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Dimensions: C is M x N, K is the inner dimension.
4730 const size_t M( A.rows() );
4731 const size_t N( B.columns() );
4732 const size_t K( A.columns() );
// Row range [ibegin,iend) of the main loop. For a strictly lower A the first
// one or two rows are excluded; for a strictly upper A the last one or two.
4736 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
4737 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
4739 const size_t iend( ( IsStrictlyUpper_v<MT4> )
4740 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
// Rows in front of the main range are visited separately (body not visible).
4744 for(
size_t i=0UL; i<ibegin; ++i ) {
4745 for(
size_t j=0UL; j<N; ++j ) {
4749 for(
size_t i=ibegin; i<iend; ++i )
// Column range [jbegin,jend) for row i, narrowed by the triangular structure
// of A and B and by the SYM/HERM/UPP (begin) and LOW (end) flags.
4751 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4752 ?( ( IsStrictlyUpper_v<MT4> )
4753 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
4754 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
4755 :( ( IsStrictlyUpper_v<MT5> )
4756 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
4757 :( SYM || HERM || UPP ? i : 0UL ) ) );
4758 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
4759 ?( ( IsStrictlyLower_v<MT4> )
4760 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
4761 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
4762 :( ( IsStrictlyLower_v<MT5> )
4763 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
4764 :( LOW ? i+1UL : N ) ) );
// With a structural flag set, an inverted column range (jbegin > jend) leads
// to a loop over the complete row (body not visible).
4766 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
4767 for(
size_t j=0UL; j<N; ++j ) {
// Columns in front of jbegin; for SYM/HERM this starts at the diagonal.
4775 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
4778 for(
size_t j=jbegin; j<jend; ++j )
// Inner-dimension range [kbegin,kend), restricted by the upper/lower
// structure of A (row i) and B (column j).
4780 const size_t kbegin( ( IsUpper_v<MT4> )
4781 ?( ( IsLower_v<MT5> )
4782 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4783 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4784 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4785 :( ( IsLower_v<MT5> )
4786 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4788 const size_t kend( ( IsLower_v<MT4> )
4789 ?( ( IsUpper_v<MT5> )
4790 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4791 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4792 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4793 :( ( IsUpper_v<MT5> )
4794 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Initialise C(i,j) with the first product, then accumulate the rest.
4798 C(i,j) = A(i,kbegin) * B(kbegin,j);
4799 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4800 C(i,j) += A(i,k) * B(k,j);
// Columns behind the computed range (body not visible).
4804 for(
size_t j=jend; j<N; ++j ) {
// Rows behind the main range (body not visible).
4808 for(
size_t i=iend; i<M; ++i ) {
4809 for(
size_t j=0UL; j<N; ++j ) {
// Fill the strictly lower part of C from the upper part, conjugating when
// HERM is set (the guarding condition is not visible in this excerpt).
4815 for(
size_t i=1UL; i<M; ++i ) {
4816 for(
size_t j=0UL; j<i; ++j ) {
4817 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default (non-vectorized) assignment kernel, enabled for a column-major
// target C when neither A nor B is diagonal. The traversal is column by column.
// NOTE(review): several lines of this definition (loop bodies, closing
// branches, the use of 'scalar') are not visible in this excerpt; the comments
// describe only the visible statements.
4838 template<
typename MT3
4842 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4843 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Dimensions: C is M x N, K is the inner dimension.
4845 const size_t M( A.rows() );
4846 const size_t N( B.columns() );
4847 const size_t K( A.columns() );
// Column range [jbegin,jend) of the main loop. For a strictly upper B the
// first one or two columns are excluded; for a strictly lower B the last one
// or two.
4851 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
4852 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
4854 const size_t jend( ( IsStrictlyLower_v<MT5> )
4855 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
// Columns in front of the main range are visited separately (body not visible).
4859 for(
size_t j=0UL; j<jbegin; ++j ) {
4860 for(
size_t i=0UL; i<M; ++i ) {
4864 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin,iend) for column j, narrowed by the triangular structure
// of A and B and by the SYM/HERM/LOW (begin) and UPP (end) flags.
4866 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
4867 ?( ( IsStrictlyLower_v<MT4> )
4868 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
4869 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4870 :( ( IsStrictlyLower_v<MT4> )
4871 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
4872 :( SYM || HERM || LOW ? j : 0UL ) ) );
4873 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4874 ?( ( IsStrictlyUpper_v<MT4> )
4875 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
4876 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
4877 :( ( IsStrictlyUpper_v<MT4> )
4878 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
4879 :( UPP ? j+1UL : M ) ) );
// With a structural flag set, an inverted row range (ibegin > iend) leads to
// a loop over the complete column (body not visible).
4881 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
4882 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of ibegin; for SYM/HERM this starts at the diagonal.
4890 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
4893 for(
size_t i=ibegin; i<iend; ++i )
// Inner-dimension range [kbegin,kend), restricted by the upper/lower
// structure of A (row i) and B (column j).
4895 const size_t kbegin( ( IsUpper_v<MT4> )
4896 ?( ( IsLower_v<MT5> )
4897 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4898 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4899 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4900 :( ( IsLower_v<MT5> )
4901 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4903 const size_t kend( ( IsLower_v<MT4> )
4904 ?( ( IsUpper_v<MT5> )
4905 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4906 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4907 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4908 :( ( IsUpper_v<MT5> )
4909 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Initialise C(i,j) with the first product, then accumulate the rest.
4913 C(i,j) = A(i,kbegin) * B(kbegin,j);
4914 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
4915 C(i,j) += A(i,k) * B(k,j);
// Rows behind the computed range (body not visible).
4919 for(
size_t i=iend; i<M; ++i ) {
// Columns behind the main range (body not visible).
4923 for(
size_t j=jend; j<N; ++j ) {
4924 for(
size_t i=0UL; i<M; ++i ) {
// Fill the strictly upper part of C from the lower part, conjugating when
// HERM is set (the guarding condition is not visible in this excerpt).
4930 for(
size_t j=1UL; j<N; ++j ) {
4931 for(
size_t i=0UL; i<j; ++i ) {
4932 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel, enabled for a row-major target C with a
// non-diagonal A and a diagonal B. Each computed element is
// C(i,j) = A(i,j) * B(j,j) * scalar.
// NOTE(review): the bodies of the loops outside [jbegin,jend) are not visible
// in this excerpt.
4953 template<
typename MT3
4957 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
4958 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4960 const size_t M( A.rows() );
4961 const size_t N( B.columns() );
4963 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, derived from the upper/lower structure of A.
4965 const size_t jbegin( ( IsUpper_v<MT4> )
4966 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4968 const size_t jend( ( IsLower_v<MT4> )
4969 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// For an upper A the columns in front of jbegin are visited separately.
4973 if( IsUpper_v<MT4> ) {
4974 for(
size_t j=0UL; j<jbegin; ++j ) {
4978 for(
size_t j=jbegin; j<jend; ++j ) {
4979 C(i,j) = A(i,j) * B(j,j) * scalar;
// For a lower A the columns from jend onwards are visited separately.
4981 if( IsLower_v<MT4> ) {
4982 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel, enabled for a column-major target C with a
// non-diagonal A and a diagonal B. The iteration space is tiled into
// blocks of BLOCK_SIZE x BLOCK_SIZE; each computed element is
// C(i,j) = A(i,j) * B(j,j) * scalar.
// NOTE(review): the bodies of the loops outside [ibegin,ipos) are not visible
// in this excerpt.
5004 template<
typename MT3
5008 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5009 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5011 constexpr
size_t block( BLOCK_SIZE );
5013 const size_t M( A.rows() );
5014 const size_t N( B.columns() );
// Outer loops over column blocks [jj,jend) and row blocks [ii,iend).
5016 for(
size_t jj=0UL; jj<N; jj+=block ) {
5017 const size_t jend(
min( N, jj+block ) );
5018 for(
size_t ii=0UL; ii<M; ii+=block ) {
5019 const size_t iend(
min( M, ii+block ) );
5020 for(
size_t j=jj; j<jend; ++j )
// Row range within the current block, clipped by the structure of A.
5022 const size_t ibegin( ( IsLower_v<MT4> )
5023 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
5025 const size_t ipos( ( IsUpper_v<MT4> )
5026 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// For a lower A the rows of the block in front of ibegin are visited separately.
5029 if( IsLower_v<MT4> ) {
5030 for(
size_t i=ii; i<ibegin; ++i ) {
5034 for(
size_t i=ibegin; i<ipos; ++i ) {
5035 C(i,j) = A(i,j) * B(j,j) * scalar;
// For an upper A the rows of the block from ipos onwards are visited separately.
5037 if( IsUpper_v<MT4> ) {
5038 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel, enabled for a row-major target C with a
// diagonal A and a non-diagonal B. The iteration space is tiled into
// blocks of BLOCK_SIZE x BLOCK_SIZE; each computed element is
// C(i,j) = A(i,i) * B(i,j) * scalar.
// NOTE(review): the bodies of the loops outside [jbegin,jpos) are not visible
// in this excerpt.
5062 template<
typename MT3
5066 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5067 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5069 constexpr
size_t block( BLOCK_SIZE );
5071 const size_t M( A.rows() );
5072 const size_t N( B.columns() );
// Outer loops over row blocks [ii,iend) and column blocks [jj,jend).
5074 for(
size_t ii=0UL; ii<M; ii+=block ) {
5075 const size_t iend(
min( M, ii+block ) );
5076 for(
size_t jj=0UL; jj<N; jj+=block ) {
5077 const size_t jend(
min( N, jj+block ) );
5078 for(
size_t i=ii; i<iend; ++i )
// Column range within the current block, clipped by the structure of B.
5080 const size_t jbegin( ( IsUpper_v<MT5> )
5081 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
5083 const size_t jpos( ( IsLower_v<MT5> )
5084 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// For an upper B the columns of the block in front of jbegin are visited separately.
5087 if( IsUpper_v<MT5> ) {
5088 for(
size_t j=jj; j<jbegin; ++j ) {
5092 for(
size_t j=jbegin; j<jpos; ++j ) {
5093 C(i,j) = A(i,i) * B(i,j) * scalar;
// For a lower B the columns of the block from jpos onwards are visited separately.
5095 if( IsLower_v<MT5> ) {
5096 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel, enabled for a column-major target C with a
// diagonal A and a non-diagonal B. Each computed element is
// C(i,j) = A(i,i) * B(i,j) * scalar.
// NOTE(review): the bodies of the loops outside [ibegin,iend) are not visible
// in this excerpt.
5120 template<
typename MT3
5124 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5125 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5127 const size_t M( A.rows() );
5128 const size_t N( B.columns() );
5130 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, derived from the upper/lower structure of B.
5132 const size_t ibegin( ( IsLower_v<MT5> )
5133 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5135 const size_t iend( ( IsUpper_v<MT5> )
5136 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// For a lower B the rows in front of ibegin are visited separately.
5140 if( IsLower_v<MT5> ) {
5141 for(
size_t i=0UL; i<ibegin; ++i ) {
5145 for(
size_t i=ibegin; i<iend; ++i ) {
5146 C(i,j) = A(i,i) * B(i,j) * scalar;
// For an upper B the rows from iend onwards are visited separately.
5148 if( IsUpper_v<MT5> ) {
5149 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel, enabled when both A and B are diagonal.
// Only the diagonal elements are computed: C(i,i) = A(i,i) * B(i,i) * scalar.
// NOTE(review): any statement preceding the loop is not visible in this excerpt.
5171 template<
typename MT3
5175 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5176 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5180 for(
size_t i=0UL; i<A.rows(); ++i ) {
5181 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of selectSmallAssignKernel: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and
// forwards all arguments to selectDefaultAssignKernel.
5200 template<
typename MT3
5204 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5205 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5207 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix assignment kernel, enabled for a row-major target C
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// C is processed in register blocks (2x4, 2x2, 2x1, then 1x4, 1x2, 1x1);
// for every block the products along k are accumulated in SIMD registers,
// reduced with sum() and multiplied by 'scalar'.
// NOTE(review): several lines (declaration of i and k, the SIMD k-loops, some
// loop headers and guards) are not visible in this excerpt; the comments
// describe only the visible statements.
5226 template<
typename MT3
5230 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5231 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both operands are padded.
5233 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5235 const size_t M( A.rows() );
5236 const size_t N( B.columns() );
5237 const size_t K( A.columns() );
// Two rows of C at a time; this loop is skipped when both LOW and UPP are set.
5248 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
// Column range for the row pair, restricted by LOW (end) and SYM/HERM/UPP (begin).
5250 const size_t jend( LOW ? i+2UL : N );
5251 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x4 block of C.
5253 for( ; (j+4UL) <= jend; j+=4UL )
// kbegin is masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of
// SIMDSIZE (assuming SIMDSIZE is a power of two).
5255 const size_t kbegin( ( IsUpper_v<MT4> )
5256 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5257 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5258 const size_t kend( ( IsLower_v<MT4> )
5259 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
5260 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// End of the SIMD part: kend rounded down when a remainder loop is required,
// kend itself otherwise.
5262 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5265 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5269 const SIMDType a1( A.load(i ,k) );
5270 const SIMDType a2( A.load(i+1UL,k) );
5271 const SIMDType b1( B.load(k,j ) );
5272 const SIMDType b2( B.load(k,j+1UL) );
5273 const SIMDType b3( B.load(k,j+2UL) );
5274 const SIMDType b4( B.load(k,j+3UL) );
// Reduce each accumulator and scale the result.
5285 C(i ,j ) =
sum( xmm1 ) * scalar;
5286 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5287 C(i ,j+2UL) =
sum( xmm3 ) * scalar;
5288 C(i ,j+3UL) =
sum( xmm4 ) * scalar;
5289 C(i+1UL,j ) =
sum( xmm5 ) * scalar;
5290 C(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
5291 C(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
5292 C(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar remainder: add the scaled products for the remaining k.
5294 for( ; remainder && k<kend; ++k ) {
5295 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5296 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5297 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
5298 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
5299 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5300 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5301 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
5302 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// 2x2 block of C.
5306 for( ; (j+2UL) <= jend; j+=2UL )
5308 const size_t kbegin( ( IsUpper_v<MT4> )
5309 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5310 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5311 const size_t kend( ( IsLower_v<MT4> )
5312 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5313 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5315 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5318 SIMDType xmm1, xmm2, xmm3, xmm4;
5322 const SIMDType a1( A.load(i ,k) );
5323 const SIMDType a2( A.load(i+1UL,k) );
5324 const SIMDType b1( B.load(k,j ) );
5325 const SIMDType b2( B.load(k,j+1UL) );
5332 C(i ,j ) =
sum( xmm1 ) * scalar;
5333 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5334 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
5335 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5337 for( ; remainder && k<kend; ++k ) {
5338 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5339 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5340 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5341 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 block of C (the guarding statement is not visible in this excerpt).
5347 const size_t kbegin( ( IsUpper_v<MT4> )
5348 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5349 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5350 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5352 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5355 SIMDType xmm1, xmm2;
5359 const SIMDType b1( B.load(k,j) );
5360 xmm1 += A.load(i ,k) * b1;
5361 xmm2 += A.load(i+1UL,k) * b1;
5364 C(i ,j) =
sum( xmm1 ) * scalar;
5365 C(i+1UL,j) =
sum( xmm2 ) * scalar;
5367 for( ; remainder && k<kend; ++k ) {
5368 C(i ,j) += A(i ,k) * B(k,j) * scalar;
5369 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Single-row part (the enclosing loop/guard is not visible in this excerpt).
5376 const size_t jend( LOW ? i+1UL : N );
5377 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x4 block of C; skipped when both LOW and UPP are set.
5379 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
5381 const size_t kbegin( ( IsUpper_v<MT4> )
5382 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5383 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5384 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
5386 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5389 SIMDType xmm1, xmm2, xmm3, xmm4;
5393 const SIMDType a1( A.load(i,k) );
5394 xmm1 += a1 * B.load(k,j );
5395 xmm2 += a1 * B.load(k,j+1UL);
5396 xmm3 += a1 * B.load(k,j+2UL);
5397 xmm4 += a1 * B.load(k,j+3UL);
5400 C(i,j ) =
sum( xmm1 ) * scalar;
5401 C(i,j+1UL) =
sum( xmm2 ) * scalar;
5402 C(i,j+2UL) =
sum( xmm3 ) * scalar;
5403 C(i,j+3UL) =
sum( xmm4 ) * scalar;
5405 for( ; remainder && k<kend; ++k ) {
5406 C(i,j ) += A(i,k) * B(k,j ) * scalar;
5407 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
5408 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
5409 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// 1x2 block of C; skipped when both LOW and UPP are set.
5413 for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
5415 const size_t kbegin( ( IsUpper_v<MT4> )
5416 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5417 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5418 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5420 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5423 SIMDType xmm1, xmm2;
5427 const SIMDType a1( A.load(i,k) );
5428 xmm1 += a1 * B.load(k,j );
5429 xmm2 += a1 * B.load(k,j+1UL);
5432 C(i,j ) =
sum( xmm1 ) * scalar;
5433 C(i,j+1UL) =
sum( xmm2 ) * scalar;
5435 for( ; remainder && k<kend; ++k ) {
5436 C(i,j ) += A(i,k) * B(k,j ) * scalar;
5437 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Single elements of C; here the k range ends at K.
5441 for( ; j<jend; ++j )
5443 const size_t kbegin( ( IsUpper_v<MT4> )
5444 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5445 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5447 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
5454 xmm1 += A.load(i,k) * B.load(k,j);
5457 C(i,j) =
sum( xmm1 ) * scalar;
5459 for( ; remainder && k<K; ++k ) {
5460 C(i,j) += A(i,k) * B(k,j) * scalar;
// Post-processing of the part of C that was not computed above. The first
// branch copies C(j,i) (conjugated for HERM) into C(i,j) for j below
// 2*(i/2), i.e. the columns left of the 2-row block containing row i.
// NOTE(review): the condition of the first branch and the bodies of the
// following two branches are not visible in this excerpt.
5467 for(
size_t i=2UL; i<M; ++i ) {
5468 const size_t jend( 2UL * ( i/2UL ) );
5469 for(
size_t j=0UL; j<jend; ++j ) {
5470 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
5474 else if( LOW && !UPP ) {
5475 for(
size_t j=2UL; j<N; ++j ) {
5476 const size_t iend( 2UL * ( j/2UL ) );
5477 for(
size_t i=0UL; i<iend; ++i ) {
5482 else if( !LOW && UPP ) {
5483 for(
size_t i=2UL; i<M; ++i ) {
5484 const size_t jend( 2UL * ( i/2UL ) );
5485 for(
size_t j=0UL; j<jend; ++j ) {
// Vectorized small-matrix assignment kernel, enabled for a column-major
// target C when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// C is processed in register blocks (4x2, 4x1, then 2x2, 2x1, then 1x2, 1x1);
// for every block the products along k are accumulated in SIMD registers,
// reduced with sum() and multiplied by 'scalar'.
// NOTE(review): several lines (declaration of i, j and k, the SIMD k-loops,
// some loop headers and guards) are not visible in this excerpt; the comments
// describe only the visible statements.
5508 template<
typename MT3
5512 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5513 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both operands are padded.
5515 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5517 const size_t M( A.rows() );
5518 const size_t N( B.columns() );
5519 const size_t K( A.columns() );
// Four rows of C at a time; only used when none of the structural flags is set.
5530 for( ; !SYM && !HERM && !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 block of C.
5534 for( ; (j+2UL) <= N; j+=2UL )
// kbegin is masked with size_t(-SIMDSIZE), i.e. rounded down to a multiple of
// SIMDSIZE (assuming SIMDSIZE is a power of two).
5536 const size_t kbegin( ( IsUpper_v<MT4> )
5537 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5538 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5539 const size_t kend( ( IsLower_v<MT4> )
5540 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
5541 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// End of the SIMD part: kend rounded down when a remainder loop is required,
// kend itself otherwise.
5543 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5546 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5550 const SIMDType a1( A.load(i ,k) );
5551 const SIMDType a2( A.load(i+1UL,k) );
5552 const SIMDType a3( A.load(i+2UL,k) );
5553 const SIMDType a4( A.load(i+3UL,k) );
5554 const SIMDType b1( B.load(k,j ) );
5555 const SIMDType b2( B.load(k,j+1UL) );
// Reduce each accumulator and scale the result.
5566 C(i ,j ) =
sum( xmm1 ) * scalar;
5567 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5568 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
5569 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5570 C(i+2UL,j ) =
sum( xmm5 ) * scalar;
5571 C(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
5572 C(i+3UL,j ) =
sum( xmm7 ) * scalar;
5573 C(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar remainder: add the scaled products for the remaining k.
5575 for( ; remainder && k<kend; ++k ) {
5576 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5577 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5578 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5579 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
5580 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
5581 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
5582 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
5583 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// 4x1 block of C (the guarding statement is not visible in this excerpt).
5589 const size_t kbegin( ( IsUpper_v<MT4> )
5590 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5591 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5592 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
5594 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5597 SIMDType xmm1, xmm2, xmm3, xmm4;
5601 const SIMDType b1( B.load(k,j) );
5602 xmm1 += A.load(i ,k) * b1;
5603 xmm2 += A.load(i+1UL,k) * b1;
5604 xmm3 += A.load(i+2UL,k) * b1;
5605 xmm4 += A.load(i+3UL,k) * b1;
5608 C(i ,j) =
sum( xmm1 ) * scalar;
5609 C(i+1UL,j) =
sum( xmm2 ) * scalar;
5610 C(i+2UL,j) =
sum( xmm3 ) * scalar;
5611 C(i+3UL,j) =
sum( xmm4 ) * scalar;
5613 for( ; remainder && k<kend; ++k ) {
5614 C(i ,j) += A(i ,k) * B(k,j) * scalar;
5615 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
5616 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
5617 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// Two rows of C at a time.
5622 for( ; (i+2UL) <= M; i+=2UL )
// Column range for the row pair, restricted by LOW (end) and SYM/HERM/UPP (begin).
5624 const size_t jend( LOW ? i+2UL : N );
5625 size_t j( SYM || HERM || UPP ? i : 0UL );
// 2x2 block of C.
5627 for( ; (j+2UL) <= jend; j+=2UL )
5629 const size_t kbegin( ( IsUpper_v<MT4> )
5630 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5631 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5632 const size_t kend( ( IsLower_v<MT4> )
5633 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5634 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5636 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5639 SIMDType xmm1, xmm2, xmm3, xmm4;
5643 const SIMDType a1( A.load(i ,k) );
5644 const SIMDType a2( A.load(i+1UL,k) );
5645 const SIMDType b1( B.load(k,j ) );
5646 const SIMDType b2( B.load(k,j+1UL) );
5653 C(i ,j ) =
sum( xmm1 ) * scalar;
5654 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
5655 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
5656 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
5658 for( ; remainder && k<kend; ++k ) {
5659 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
5660 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
5661 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
5662 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// 2x1 block of C (the guarding statement is not visible in this excerpt).
5668 const size_t kbegin( ( IsUpper_v<MT4> )
5669 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5670 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5671 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5673 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5676 SIMDType xmm1, xmm2;
5680 const SIMDType b1( B.load(k,j) );
5681 xmm1 += A.load(i ,k) * b1;
5682 xmm2 += A.load(i+1UL,k) * b1;
5685 C(i ,j) =
sum( xmm1 ) * scalar;
5686 C(i+1UL,j) =
sum( xmm2 ) * scalar;
5688 for( ; remainder && k<kend; ++k ) {
5689 C(i ,j) += A(i ,k) * B(k,j) * scalar;
5690 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Single-row part (the enclosing loop/guard is not visible in this excerpt).
5697 const size_t jend( LOW ? i+1UL : N );
5698 size_t j( SYM || HERM || UPP ? i : 0UL );
// 1x2 block of C.
5700 for( ; (j+2UL) <= jend; j+=2UL )
5702 const size_t kbegin( ( IsUpper_v<MT4> )
5703 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5704 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5705 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5707 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
5710 SIMDType xmm1, xmm2;
5714 const SIMDType a1( A.load(i,k) );
5715 xmm1 += a1 * B.load(k,j );
5716 xmm2 += a1 * B.load(k,j+1UL);
5719 C(i,j ) =
sum( xmm1 ) * scalar;
5720 C(i,j+1UL) =
sum( xmm2 ) * scalar;
5722 for( ; remainder && k<kend; ++k ) {
5723 C(i,j ) += A(i,k) * B(k,j ) * scalar;
5724 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Single element of C; here the k range ends at K (the guarding statement is
// not visible in this excerpt).
5730 const size_t kbegin( ( IsUpper_v<MT4> )
5731 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
5732 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
5734 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
5741 xmm1 += A.load(i,k) * B.load(k,j);
5744 C(i,j) =
sum( xmm1 ) * scalar;
5746 for( ; remainder && k<K; ++k ) {
5747 C(i,j) += A(i,k) * B(k,j) * scalar;
// Fill the elements below the diagonal from their transposed counterparts,
// conjugating when HERM is set (the guarding condition is not visible in this
// excerpt).
5754 for(
size_t j=0UL; j<N; ++j ) {
5755 for(
size_t i=j+1UL; i<M; ++i ) {
5756 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Fallback overload of selectLargeAssignKernel: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and
// forwards all arguments to selectDefaultAssignKernel.
5777 template<
typename MT3
5781 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5782 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5784 selectDefaultAssignKernel( C, A, B, scalar );
// Overload of selectLargeAssignKernel that is enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. It dispatches to one
// of the kernels smmm, hmmm, lmmm, ummm or mmm.
// NOTE(review): the conditions selecting between the five calls are not
// visible in this excerpt (presumably SYM, HERM, LOW, UPP and the general
// case, in that order -- confirm).
5803 template<
typename MT3
5807 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5808 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5811 smmm( C, A, B, scalar );
5813 hmmm( C, A, B, scalar );
// The remaining three kernels receive an additional ST2(0) argument.
5815 lmmm( C, A, B, scalar, ST2(0) );
5817 ummm( C, A, B, scalar, ST2(0) );
5819 mmm( C, A, B, scalar, ST2(0) );
// Fallback overload of selectBlasAssignKernel: selected when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t) and forwards all
// arguments to selectLargeAssignKernel.
5837 template<
typename MT3
5841 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5842 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5844 selectLargeAssignKernel( C, A, B, scalar );
// BLAS-backed overload of selectBlasAssignKernel, enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. It is only compiled when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are non-zero.
// NOTE(review): the statements preceding the two trmm calls are not visible
// in this excerpt.
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 5863 template<
typename MT3
5867 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5868 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
5870 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm on C with A on the left side, scaled by ET(scalar).
5872 if( IsTriangular_v<MT4> ) {
5874 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular right operand: trmm on C with B on the right side, scaled by ET(scalar).
5876 else if( IsTriangular_v<MT5> ) {
5878 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm is called with the factors ET(scalar) and ET(0)
// (presumably alpha and beta, i.e. C = scalar*A*B -- confirm against the gemm wrapper).
5881 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment via a dense temporary: the expression is evaluated serially into
// a temporary of type TmpType, which is passed through ForwardFunctor and then
// assigned to the target.
// NOTE(review): the function signature is not visible in this excerpt.
5899 template<
typename MT
// Temporary type: OppositeType when SO is true, ResultType otherwise.
5905 using TmpType = If_t< SO, OppositeType, ResultType >;
5917 const ForwardFunctor fwd;
5919 const TmpType tmp(
serial( rhs ) );
5920 assign( ~lhs, fwd( tmp ) );
// Addition assignment of the scaled multiplication expression to a dense
// matrix.
// NOTE(review): the body of the early-out branch and the creation of A and B
// are not visible in this excerpt.
5936 template<
typename MT
5938 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Fetch the two operands of the wrapped multiplication expression.
5945 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5946 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special case: empty target or empty inner dimension.
5948 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Dispatch to the kernel selection with the scalar of the expression.
5962 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += A * B * scalar.
// The small kernel is chosen when either operand is diagonal or when
// C.rows() * C.columns() is below DMATTDMATMULT_THRESHOLD; the BLAS kernel
// selection is the second alternative (presumably the else branch, whose
// 'else' line is not visible in this excerpt).
5977 template<
typename MT3
5981 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5983 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
5984 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
5985 selectSmallAddAssignKernel( C, A, B, scalar );
5987 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel, enabled when neither A nor B is
// diagonal. The visible code adds a temporary 'tmp' to C via addAssign.
// NOTE(review): the construction of 'tmp' is not visible in this excerpt.
6005 template<
typename MT3
6009 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6010 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6013 addAssign( C, tmp );
// Default addition assignment kernel, enabled for a row-major target C with a
// non-diagonal A and a diagonal B. Each update is
// C(i,j) += A(i,j) * B(j,j) * scalar, with the column loop unrolled by two.
// NOTE(review): the guard in front of the final single-element update is not
// visible in this excerpt.
6031 template<
typename MT3
6035 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6036 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6038 const size_t M( A.rows() );
6039 const size_t N( B.columns() );
6041 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, derived from the upper/lower structure of A.
6043 const size_t jbegin( ( IsUpper_v<MT4> )
6044 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6046 const size_t jend( ( IsLower_v<MT4> )
6047 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the unrolled part: jbegin plus the column count with
// its lowest bit cleared (largest even number not above jnum).
6051 const size_t jnum( jend - jbegin );
6052 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6054 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6055 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6056 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the element at jpos (the leftover column of an odd-sized range).
6059 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition assignment kernel, enabled for a column-major target C
// with a non-diagonal A and a diagonal B. The iteration space is tiled into
// blocks of BLOCK_SIZE x BLOCK_SIZE; each update is
// C(i,j) += A(i,j) * B(j,j) * scalar.
6079 template<
typename MT3
6083 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6084 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6086 constexpr
size_t block( BLOCK_SIZE );
6088 const size_t M( A.rows() );
6089 const size_t N( B.columns() );
// Outer loops over column blocks [jj,jend) and row blocks [ii,iend).
6091 for(
size_t jj=0UL; jj<N; jj+=block ) {
6092 const size_t jend(
min( N, jj+block ) );
6093 for(
size_t ii=0UL; ii<M; ii+=block ) {
6094 const size_t iend(
min( M, ii+block ) );
6095 for(
size_t j=jj; j<jend; ++j )
// Row range within the current block, clipped by the structure of A.
6097 const size_t ibegin( ( IsLower_v<MT4> )
6098 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
6100 const size_t ipos( ( IsUpper_v<MT4> )
6101 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
6104 for(
size_t i=ibegin; i<ipos; ++i ) {
6105 C(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition assignment kernel, enabled for a row-major target C with a
// diagonal A and a non-diagonal B. The iteration space is tiled into
// blocks of BLOCK_SIZE x BLOCK_SIZE; each update is
// C(i,j) += A(i,i) * B(i,j) * scalar.
6127 template<
typename MT3
6131 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6132 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6134 constexpr
size_t block( BLOCK_SIZE );
6136 const size_t M( A.rows() );
6137 const size_t N( B.columns() );
// Outer loops over row blocks [ii,iend) and column blocks [jj,jend).
6139 for(
size_t ii=0UL; ii<M; ii+=block ) {
6140 const size_t iend(
min( M, ii+block ) );
6141 for(
size_t jj=0UL; jj<N; jj+=block ) {
6142 const size_t jend(
min( N, jj+block ) );
6143 for(
size_t i=ii; i<iend; ++i )
// Column range within the current block, clipped by the structure of B.
6145 const size_t jbegin( ( IsUpper_v<MT5> )
6146 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
6148 const size_t jpos( ( IsLower_v<MT5> )
6149 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
6152 for(
size_t j=jbegin; j<jpos; ++j ) {
6153 C(i,j) += A(i,i) * B(i,j) * scalar;
// Default add-assignment kernel overload, enabled when MT3 is column-major,
// MT4 is diagonal and MT5 is not diagonal.
// Because only A(i,i) is read from A, each update reduces to
//    C(i,j) += A(i,i) * B(i,j) * scalar.
6175 template<
typename MT3
6179 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6180 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6182 const size_t M( A.rows() );
6183 const size_t N( B.columns() );
6185 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the lower/upper structure of MT5
// (the fallback branches of both conditionals are not visible here).
6187 const size_t ibegin( ( IsLower_v<MT5> )
6188 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6190 const size_t iend( ( IsUpper_v<MT5> )
6191 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) holds an even
// number of rows and can be processed two at a time.
6195 const size_t inum( iend - ibegin );
6196 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
6198 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6199 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6200 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for the row at ipos left over by the two-row loop; the guard
// deciding whether it runs is not visible in this excerpt.
6203 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default add-assignment kernel overload, enabled when both MT4 and MT5 are
// diagonal. Only the diagonal of C is touched:
//    C(i,i) += A(i,i) * B(i,i) * scalar   for i in [0, A.rows()).
6223 template<
typename MT3
6227 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6228 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6230 for(
size_t i=0UL; i<A.rows(); ++i ) {
6231 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix add-assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
// It simply forwards to selectDefaultAddAssignKernel().
6250 template<
typename MT3
6254 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6255 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6257 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix add-assignment kernel, vectorized overload for a row-major
// target: enabled when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//
// C is updated in register tiles: two rows (i, i+1) combined with four, two
// and one column(s), followed by a single-row section with the same column
// widths. For every tile the visible code
//   - derives the k range [kbegin,kend) from the upper/lower structure of
//     MT4 and MT5,
//   - loads SIMD vectors from A and B at offset k,
//   - adds sum( xmmN ) * scalar to the matching element of C, and
//   - when `remainder` is set, finishes k in [kpos,kend) element by element.
// NOTE: this excerpt omits several lines (declarations of i and k, the SIMD
// loop headers, some accumulation statements, and the statements that open
// the single-column and single-row sections).
6276 template<
typename MT3
6280 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6281 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder is true when at least one operand type is not padded; only then
// is kpos rounded and a scalar tail loop executed.
6283 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6285 const size_t M( A.rows() );
6286 const size_t N( B.columns() );
6287 const size_t K( A.columns() );
// ---- Two rows (i, i+1) at a time ----
6293 for( ; (i+2UL) <= M; i+=2UL )
// Column range: limited to i+2 when LOW is set, started at i when UPP is set
// (LOW/UPP are the class-level compile-time flags).
6295 const size_t jend( LOW ? i+2UL : N );
6296 size_t j( UPP ? i : 0UL );
// Tile 2x4; skipped entirely when both LOW and UPP are set.
6298 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
// kbegin is masked with size_t(-SIMDSIZE); this rounds down to a multiple of
// SIMDSIZE provided SIMDSIZE is a power of two (assumed -- TODO confirm).
6300 const size_t kbegin( ( IsUpper_v<MT4> )
6301 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6302 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6303 const size_t kend( ( IsLower_v<MT4> )
6304 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
6305 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos: end of the SIMD-processed part of the k range.
6307 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6310 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6314 const SIMDType a1( A.load(i ,k) );
6315 const SIMDType a2( A.load(i+1UL,k) );
6316 const SIMDType b1( B.load(k,j ) );
6317 const SIMDType b2( B.load(k,j+1UL) );
6318 const SIMDType b3( B.load(k,j+2UL) );
6319 const SIMDType b4( B.load(k,j+3UL) );
// Fold each accumulator via sum() and add the scaled result to C.
6330 C(i ,j ) +=
sum( xmm1 ) * scalar;
6331 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6332 C(i ,j+2UL) +=
sum( xmm3 ) * scalar;
6333 C(i ,j+3UL) +=
sum( xmm4 ) * scalar;
6334 C(i+1UL,j ) +=
sum( xmm5 ) * scalar;
6335 C(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
6336 C(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
6337 C(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// Element-wise tail for the k values not covered by the SIMD part.
6339 for( ; remainder && k<kend; ++k ) {
6340 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6341 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6342 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
6343 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
6344 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6345 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6346 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
6347 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// Tile 2x2.
6351 for( ; (j+2UL) <= jend; j+=2UL )
6353 const size_t kbegin( ( IsUpper_v<MT4> )
6354 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6355 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6356 const size_t kend( ( IsLower_v<MT4> )
6357 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6358 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6360 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6363 SIMDType xmm1, xmm2, xmm3, xmm4;
6367 const SIMDType a1( A.load(i ,k) );
6368 const SIMDType a2( A.load(i+1UL,k) );
6369 const SIMDType b1( B.load(k,j ) );
6370 const SIMDType b2( B.load(k,j+1UL) );
6377 C(i ,j ) +=
sum( xmm1 ) * scalar;
6378 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6379 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6380 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6382 for( ; remainder && k<kend; ++k ) {
6383 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6384 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6385 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6386 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// Tile 2x1: rows i, i+1 and the single column j (the statement guarding this
// section is not visible in this excerpt).
6392 const size_t kbegin( ( IsUpper_v<MT4> )
6393 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6394 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6395 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6397 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6400 SIMDType xmm1, xmm2;
6404 const SIMDType b1( B.load(k,j) );
6405 xmm1 += A.load(i ,k) * b1;
6406 xmm2 += A.load(i+1UL,k) * b1;
6409 C(i ,j) +=
sum( xmm1 ) * scalar;
6410 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
6412 for( ; remainder && k<kend; ++k ) {
6413 C(i ,j) += A(i ,k) * B(k,j) * scalar;
6414 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single-row section: only row i is updated from here on (the
// statement opening this section is not visible in this excerpt) ----
6421 const size_t jend( LOW ? i+1UL : N );
6422 size_t j( UPP ? i : 0UL );
// Tile 1x4; skipped entirely when both LOW and UPP are set.
6424 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
6426 const size_t kbegin( ( IsUpper_v<MT4> )
6427 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6428 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6429 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
6431 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6434 SIMDType xmm1, xmm2, xmm3, xmm4;
6438 const SIMDType a1( A.load(i,k) );
6439 xmm1 += a1 * B.load(k,j );
6440 xmm2 += a1 * B.load(k,j+1UL);
6441 xmm3 += a1 * B.load(k,j+2UL);
6442 xmm4 += a1 * B.load(k,j+3UL);
6445 C(i,j ) +=
sum( xmm1 ) * scalar;
6446 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
6447 C(i,j+2UL) +=
sum( xmm3 ) * scalar;
6448 C(i,j+3UL) +=
sum( xmm4 ) * scalar;
6450 for( ; remainder && k<kend; ++k ) {
6451 C(i,j ) += A(i,k) * B(k,j ) * scalar;
6452 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
6453 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
6454 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// Tile 1x2.
6458 for( ; (j+2UL) <= jend; j+=2UL )
6460 const size_t kbegin( ( IsUpper_v<MT4> )
6461 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6462 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6463 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6465 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6468 SIMDType xmm1, xmm2;
6472 const SIMDType a1( A.load(i,k) );
6473 xmm1 += a1 * B.load(k,j );
6474 xmm2 += a1 * B.load(k,j+1UL);
6477 C(i,j ) +=
sum( xmm1 ) * scalar;
6478 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
6480 for( ; remainder && k<kend; ++k ) {
6481 C(i,j ) += A(i,k) * B(k,j ) * scalar;
6482 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Tile 1x1: single element C(i,j); the k range runs up to K here (the
// statement guarding this section is not visible in this excerpt).
6488 const size_t kbegin( ( IsUpper_v<MT4> )
6489 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6490 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6492 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
6499 xmm1 += A.load(i,k) * B.load(k,j);
6502 C(i,j) +=
sum( xmm1 ) * scalar;
6504 for( ; remainder && k<K; ++k ) {
6505 C(i,j) += A(i,k) * B(k,j) * scalar;
// Small-matrix add-assignment kernel, vectorized overload for a
// column-major target: enabled when MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//
// C is updated in register tiles: four rows combined with two and one
// column(s) (only when neither LOW nor UPP is set), then two rows combined
// with two and one column(s), then a single-row section. For every tile the
// visible code derives the k range from the structure of MT4/MT5, loads
// SIMD vectors from A and B, adds sum( xmmN ) * scalar to C, and finishes
// the k range element by element when `remainder` is set.
// NOTE: this excerpt omits several lines (declarations of i, j and k, the
// SIMD loop headers, some accumulation statements, and the statements that
// open the single-column and single-row sections).
6527 template<
typename MT3
6531 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6532 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder is true when at least one operand type is not padded; only then
// is kpos rounded and a scalar tail loop executed.
6534 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
6536 const size_t M( A.rows() );
6537 const size_t N( B.columns() );
6538 const size_t K( A.columns() );
// ---- Four rows at a time; only taken when neither LOW nor UPP is set ----
6544 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// Tile 4x2.
6548 for( ; (j+2UL) <= N; j+=2UL )
// kbegin is masked with size_t(-SIMDSIZE); this rounds down to a multiple of
// SIMDSIZE provided SIMDSIZE is a power of two (assumed -- TODO confirm).
6550 const size_t kbegin( ( IsUpper_v<MT4> )
6551 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6552 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6553 const size_t kend( ( IsLower_v<MT4> )
6554 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
6555 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos: end of the SIMD-processed part of the k range.
6557 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6560 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6564 const SIMDType a1( A.load(i ,k) );
6565 const SIMDType a2( A.load(i+1UL,k) );
6566 const SIMDType a3( A.load(i+2UL,k) );
6567 const SIMDType a4( A.load(i+3UL,k) );
6568 const SIMDType b1( B.load(k,j ) );
6569 const SIMDType b2( B.load(k,j+1UL) );
// Fold each accumulator via sum() and add the scaled result to C.
6580 C(i ,j ) +=
sum( xmm1 ) * scalar;
6581 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6582 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6583 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6584 C(i+2UL,j ) +=
sum( xmm5 ) * scalar;
6585 C(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
6586 C(i+3UL,j ) +=
sum( xmm7 ) * scalar;
6587 C(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// Element-wise tail for the k values not covered by the SIMD part.
6589 for( ; remainder && k<kend; ++k ) {
6590 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6591 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6592 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6593 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
6594 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
6595 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
6596 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
6597 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// Tile 4x1: rows i..i+3 and the single column j (the statement guarding this
// section is not visible in this excerpt).
6603 const size_t kbegin( ( IsUpper_v<MT4> )
6604 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6605 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6606 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
6608 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6611 SIMDType xmm1, xmm2, xmm3, xmm4;
6615 const SIMDType b1( B.load(k,j) );
6616 xmm1 += A.load(i ,k) * b1;
6617 xmm2 += A.load(i+1UL,k) * b1;
6618 xmm3 += A.load(i+2UL,k) * b1;
6619 xmm4 += A.load(i+3UL,k) * b1;
6622 C(i ,j) +=
sum( xmm1 ) * scalar;
6623 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
6624 C(i+2UL,j) +=
sum( xmm3 ) * scalar;
6625 C(i+3UL,j) +=
sum( xmm4 ) * scalar;
6627 for( ; remainder && k<kend; ++k ) {
6628 C(i ,j) += A(i ,k) * B(k,j) * scalar;
6629 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
6630 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
6631 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// ---- Two rows (i, i+1) at a time ----
6636 for( ; (i+2UL) <= M; i+=2UL )
// Column range: limited to i+2 when LOW is set, started at i when UPP is set.
6638 const size_t jend( LOW ? i+2UL : N );
6639 size_t j( UPP ? i : 0UL );
// Tile 2x2.
6641 for( ; (j+2UL) <= jend; j+=2UL )
6643 const size_t kbegin( ( IsUpper_v<MT4> )
6644 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6645 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6646 const size_t kend( ( IsLower_v<MT4> )
6647 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6648 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6650 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6653 SIMDType xmm1, xmm2, xmm3, xmm4;
6657 const SIMDType a1( A.load(i ,k) );
6658 const SIMDType a2( A.load(i+1UL,k) );
6659 const SIMDType b1( B.load(k,j ) );
6660 const SIMDType b2( B.load(k,j+1UL) );
6667 C(i ,j ) +=
sum( xmm1 ) * scalar;
6668 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
6669 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
6670 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
6672 for( ; remainder && k<kend; ++k ) {
6673 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
6674 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
6675 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
6676 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// Tile 2x1: rows i, i+1 and the single column j (the statement guarding this
// section is not visible in this excerpt).
6682 const size_t kbegin( ( IsUpper_v<MT4> )
6683 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6684 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6685 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6687 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6690 SIMDType xmm1, xmm2;
6694 const SIMDType b1( B.load(k,j) );
6695 xmm1 += A.load(i ,k) * b1;
6696 xmm2 += A.load(i+1UL,k) * b1;
6699 C(i ,j) +=
sum( xmm1 ) * scalar;
6700 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
6702 for( ; remainder && k<kend; ++k ) {
6703 C(i ,j) += A(i ,k) * B(k,j) * scalar;
6704 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// ---- Single-row section: only row i is updated from here on (the
// statement opening this section is not visible in this excerpt) ----
6711 const size_t jend( LOW ? i+1UL : N );
6712 size_t j( UPP ? i : 0UL );
// Tile 1x2.
6714 for( ; (j+2UL) <= jend; j+=2UL )
6716 const size_t kbegin( ( IsUpper_v<MT4> )
6717 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6718 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6719 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6721 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
6724 SIMDType xmm1, xmm2;
6728 const SIMDType a1( A.load(i,k) );
6729 xmm1 += a1 * B.load(k,j );
6730 xmm2 += a1 * B.load(k,j+1UL);
6733 C(i,j ) +=
sum( xmm1 ) * scalar;
6734 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
6736 for( ; remainder && k<kend; ++k ) {
6737 C(i,j ) += A(i,k) * B(k,j ) * scalar;
6738 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Tile 1x1: single element C(i,j); the k range runs up to K here (the
// statement guarding this section is not visible in this excerpt).
6744 const size_t kbegin( ( IsUpper_v<MT4> )
6745 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
6746 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
6748 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
6755 xmm1 += A.load(i,k) * B.load(k,j);
6758 C(i,j) +=
sum( xmm1 ) * scalar;
6760 for( ; remainder && k<K; ++k ) {
6761 C(i,j) += A(i,k) * B(k,j) * scalar;
// Large-matrix add-assignment kernel, fallback overload: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
// It simply forwards to selectDefaultAddAssignKernel().
6782 template<
typename MT3
6786 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6787 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6789 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix add-assignment kernel, vectorized overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// Calls one of lmmm(), ummm() or mmm() with `scalar` and ST2(1) as the two
// trailing arguments. The conditions choosing between the three calls are
// not visible in this excerpt (presumably the LOW/UPP flags -- TODO confirm).
// NOTE(review): ST2(1) presumably keeps the existing contents of C so that
// the product is accumulated -- confirm against the mmm() signature.
6808 template<
typename MT3
6812 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6813 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6816 lmmm( C, A, B, scalar, ST2(1) );
6818 ummm( C, A, B, scalar, ST2(1) );
6820 mmm( C, A, B, scalar, ST2(1) );
// BLAS add-assignment kernel, fallback overload: selected when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
// It simply forwards to selectLargeAddAssignKernel().
6838 template<
typename MT3
6842 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6843 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6845 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS add-assignment kernel, BLAS-backed overload: compiled only when
// BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION holds and
// enabled when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true.
//   - MT4 triangular: B is copied into `tmp` via serial(), trmm() is applied
//     from the left with A (lower/upper chosen by IsLower_v<MT4>) and
//     ET(scalar), then `tmp` is added to C.
//   - MT5 triangular: A is copied into `tmp`, trmm() is applied from the
//     right with B, then `tmp` is added to C.
//   - Remaining case (its `else` line is not visible here): gemm() is called
//     with ET(scalar) and ET(1).
// NOTE(review): ET(scalar)/ET(1) presumably map to the BLAS alpha/beta
// factors, i.e. C = scalar*A*B + 1*C -- confirm against the gemm() wrapper.
6850 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6864 template<
typename MT3
6868 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6869 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6871 using ET = ElementType_t<MT3>;
6873 if( IsTriangular_v<MT4> ) {
6874 ResultType_t<MT3> tmp(
serial( B ) );
6875 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6876 addAssign( C, tmp );
6878 else if( IsTriangular_v<MT5> ) {
6879 ResultType_t<MT3> tmp(
serial( A ) );
6880 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
6881 addAssign( C, tmp );
6884 gemm( C, A, B,
ET(scalar),
ET(1) );
// Subtraction assignment of a scaled matrix product (rhs) from a dense
// matrix (lhs).
// Fetches the left and right operands of the wrapped product rhs.matrix_,
// checks for an empty problem (lhs without rows or columns, or a left
// operand without columns), and otherwise dispatches to
// selectSubAssignKernel() with rhs.scalar_.
// NOTE: the consequent of the emptiness check and the definitions of A and B
// are not visible in this excerpt (presumably an early return and the
// evaluated operands -- TODO confirm).
6906 template<
typename MT
6908 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6915 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6916 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6918 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
6932 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the subtraction assignment of the scaled product.
// The small kernel is chosen when either operand type is diagonal or when
// the element count of C (rows * columns) is below DMATTDMATMULT_THRESHOLD;
// the BLAS kernel is the alternative (its `else` line is not visible here).
6947 template<
typename MT3
6951 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6953 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
6954 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
6955 selectSmallSubAssignKernel( C, A, B, scalar );
6957 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel overload for the scaled product.
// Enabled (via the EnableIf_t return type) only when neither MT4 nor MT5 is
// a diagonal matrix type. The visible body hands a temporary `tmp` to
// subAssign( C, tmp ); the construction of `tmp` is not part of this excerpt.
6975 template<
typename MT3
6979 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6980 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6983 subAssign( C, tmp );
// Default subtraction-assignment kernel overload, enabled when MT3 is
// row-major, MT4 is not diagonal and MT5 is diagonal.
// Because only B(j,j) is read from B, each update reduces to
//    C(i,j) -= A(i,j) * B(j,j) * scalar.
7001 template<
typename MT3
7005 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7006 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7008 const size_t M( A.rows() );
7009 const size_t N( B.columns() );
7011 for(
size_t i=0UL; i<M; ++i )
// Column range for row i, narrowed by the upper/lower structure of MT4
// (the fallback branches of both conditionals are not visible here).
7013 const size_t jbegin( ( IsUpper_v<MT4> )
7014 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7016 const size_t jend( ( IsLower_v<MT4> )
7017 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jnum & size_t(-2) clears the lowest bit, so [jbegin,jpos) holds an even
// number of columns and can be processed two at a time.
7021 const size_t jnum( jend - jbegin );
7022 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7024 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7025 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7026 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update for the column at jpos left over by the two-column loop; the guard
// deciding whether it runs is not visible in this excerpt.
7029 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel overload, enabled when MT3 is
// column-major, MT4 is not diagonal and MT5 is diagonal.
// Performs C(i,j) -= A(i,j) * B(j,j) * scalar, visiting C in tiles of
// BLOCK_SIZE columns (jj) by BLOCK_SIZE rows (ii).
7049 template<
typename MT3
7053 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7054 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7056 constexpr
size_t block( BLOCK_SIZE );
7058 const size_t M( A.rows() );
7059 const size_t N( B.columns() );
7061 for(
size_t jj=0UL; jj<N; jj+=block ) {
7062 const size_t jend(
min( N, jj+block ) );
7063 for(
size_t ii=0UL; ii<M; ii+=block ) {
7064 const size_t iend(
min( M, ii+block ) );
7065 for(
size_t j=jj; j<jend; ++j )
// Row range for column j inside the current tile, narrowed by the
// lower/upper structure of MT4 (fallback branches not visible here).
7067 const size_t ibegin( ( IsLower_v<MT4> )
7068 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
7070 const size_t ipos( ( IsUpper_v<MT4> )
7071 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
7074 for(
size_t i=ibegin; i<ipos; ++i ) {
7075 C(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel overload, enabled when MT3 is
// row-major, MT4 is diagonal and MT5 is not diagonal.
// Performs C(i,j) -= A(i,i) * B(i,j) * scalar, visiting C in tiles of
// BLOCK_SIZE rows (ii) by BLOCK_SIZE columns (jj).
7098 template<
typename MT3
7102 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7103 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7105 constexpr
size_t block( BLOCK_SIZE );
7107 const size_t M( A.rows() );
7108 const size_t N( B.columns() );
7110 for(
size_t ii=0UL; ii<M; ii+=block ) {
7111 const size_t iend(
min( M, ii+block ) );
7112 for(
size_t jj=0UL; jj<N; jj+=block ) {
7113 const size_t jend(
min( N, jj+block ) );
7114 for(
size_t i=ii; i<iend; ++i )
// Column range for row i inside the current tile, narrowed by the
// upper/lower structure of MT5 (fallback branches not visible here).
7116 const size_t jbegin( ( IsUpper_v<MT5> )
7117 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
7119 const size_t jpos( ( IsLower_v<MT5> )
7120 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
7123 for(
size_t j=jbegin; j<jpos; ++j ) {
7124 C(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction-assignment kernel overload, enabled when MT3 is
// column-major, MT4 is diagonal and MT5 is not diagonal.
// Because only A(i,i) is read from A, each update reduces to
//    C(i,j) -= A(i,i) * B(i,j) * scalar.
7147 template<
typename MT3
7151 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7152 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7154 const size_t M( A.rows() );
7155 const size_t N( B.columns() );
7157 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed by the lower/upper structure of MT5
// (the fallback branches of both conditionals are not visible here).
7159 const size_t ibegin( ( IsLower_v<MT5> )
7160 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7162 const size_t iend( ( IsUpper_v<MT5> )
7163 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so [ibegin,ipos) holds an even
// number of rows and can be processed two at a time.
7167 const size_t inum( iend - ibegin );
7168 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
7170 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
7171 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
7172 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update for the row at ipos left over by the two-row loop; the guard
// deciding whether it runs is not visible in this excerpt.
7175 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction-assignment kernel overload, enabled when both MT4 and
// MT5 are diagonal. Only the diagonal of C is touched:
//    C(i,i) -= A(i,i) * B(i,i) * scalar   for i in [0, A.rows()).
7195 template<
typename MT3
7199 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7200 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7202 for(
size_t i=0UL; i<A.rows(); ++i ) {
7203 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel, fallback overload: selected
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
// It simply forwards to selectDefaultSubAssignKernel().
7222 template<
typename MT3
7226 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7227 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7229 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small-matrix subtraction-assignment kernel, vectorized overload for a
// row-major target: enabled when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//
// C is updated in register tiles: two rows (i, i+1) combined with four, two
// and one column(s), followed by a single-row section with the same column
// widths. For every tile the visible code
//   - derives the k range [kbegin,kend) from the upper/lower structure of
//     MT4 and MT5,
//   - loads SIMD vectors from A and B at offset k,
//   - subtracts sum( xmmN ) * scalar from the matching element of C, and
//   - when `remainder` is set, finishes k in [kpos,kend) element by element.
// NOTE: this excerpt omits several lines (declarations of i and k, the SIMD
// loop headers, some accumulation statements, and the statements that open
// the single-column and single-row sections).
7248 template<
typename MT3
7252 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7253 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder is true when at least one operand type is not padded; only then
// is kpos rounded and a scalar tail loop executed.
7255 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7257 const size_t M( A.rows() );
7258 const size_t N( B.columns() );
7259 const size_t K( A.columns() );
// ---- Two rows (i, i+1) at a time ----
7265 for( ; (i+2UL) <= M; i+=2UL )
// Column range: limited to i+2 when LOW is set, started at i when UPP is set
// (LOW/UPP are the class-level compile-time flags).
7267 const size_t jend( LOW ? i+2UL : N );
7268 size_t j( UPP ? i : 0UL );
// Tile 2x4; skipped entirely when both LOW and UPP are set.
7270 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
// kbegin is masked with size_t(-SIMDSIZE); this rounds down to a multiple of
// SIMDSIZE provided SIMDSIZE is a power of two (assumed -- TODO confirm).
7272 const size_t kbegin( ( IsUpper_v<MT4> )
7273 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7274 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7275 const size_t kend( ( IsLower_v<MT4> )
7276 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
7277 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// kpos: end of the SIMD-processed part of the k range.
7279 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7282 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7286 const SIMDType a1( A.load(i ,k) );
7287 const SIMDType a2( A.load(i+1UL,k) );
7288 const SIMDType b1( B.load(k,j ) );
7289 const SIMDType b2( B.load(k,j+1UL) );
7290 const SIMDType b3( B.load(k,j+2UL) );
7291 const SIMDType b4( B.load(k,j+3UL) );
// Fold each accumulator via sum() and subtract the scaled result from C.
7302 C(i ,j ) -=
sum( xmm1 ) * scalar;
7303 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7304 C(i ,j+2UL) -=
sum( xmm3 ) * scalar;
7305 C(i ,j+3UL) -=
sum( xmm4 ) * scalar;
7306 C(i+1UL,j ) -=
sum( xmm5 ) * scalar;
7307 C(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
7308 C(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
7309 C(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
// Element-wise tail for the k values not covered by the SIMD part.
7311 for( ; remainder && k<kend; ++k ) {
7312 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7313 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7314 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
7315 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
7316 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7317 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7318 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
7319 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
// Tile 2x2.
7323 for( ; (j+2UL) <= jend; j+=2UL )
7325 const size_t kbegin( ( IsUpper_v<MT4> )
7326 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7327 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7328 const size_t kend( ( IsLower_v<MT4> )
7329 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7330 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7332 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7335 SIMDType xmm1, xmm2, xmm3, xmm4;
7339 const SIMDType a1( A.load(i ,k) );
7340 const SIMDType a2( A.load(i+1UL,k) );
7341 const SIMDType b1( B.load(k,j ) );
7342 const SIMDType b2( B.load(k,j+1UL) );
7349 C(i ,j ) -=
sum( xmm1 ) * scalar;
7350 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7351 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7352 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7354 for( ; remainder && k<kend; ++k ) {
7355 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7356 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7357 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7358 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// Tile 2x1: rows i, i+1 and the single column j (the statement guarding this
// section is not visible in this excerpt).
7364 const size_t kbegin( ( IsUpper_v<MT4> )
7365 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7366 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7367 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7369 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7372 SIMDType xmm1, xmm2;
7376 const SIMDType b1( B.load(k,j) );
7377 xmm1 += A.load(i ,k) * b1;
7378 xmm2 += A.load(i+1UL,k) * b1;
7381 C(i ,j) -=
sum( xmm1 ) * scalar;
7382 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
7384 for( ; remainder && k<kend; ++k ) {
7385 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7386 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// ---- Single-row section: only row i is updated from here on (the
// statement opening this section is not visible in this excerpt) ----
7393 const size_t jend( LOW ? i+1UL : N );
7394 size_t j( UPP ? i : 0UL );
// Tile 1x4; skipped entirely when both LOW and UPP are set.
7396 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
7398 const size_t kbegin( ( IsUpper_v<MT4> )
7399 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7400 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7401 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
7403 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7406 SIMDType xmm1, xmm2, xmm3, xmm4;
7410 const SIMDType a1( A.load(i,k) );
7411 xmm1 += a1 * B.load(k,j );
7412 xmm2 += a1 * B.load(k,j+1UL);
7413 xmm3 += a1 * B.load(k,j+2UL);
7414 xmm4 += a1 * B.load(k,j+3UL);
7417 C(i,j ) -=
sum( xmm1 ) * scalar;
7418 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
7419 C(i,j+2UL) -=
sum( xmm3 ) * scalar;
7420 C(i,j+3UL) -=
sum( xmm4 ) * scalar;
7422 for( ; remainder && k<kend; ++k ) {
7423 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7424 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
7425 C(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
7426 C(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
// Tile 1x2.
7430 for( ; (j+2UL) <= jend; j+=2UL )
7432 const size_t kbegin( ( IsUpper_v<MT4> )
7433 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7434 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7435 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7437 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7440 SIMDType xmm1, xmm2;
7444 const SIMDType a1( A.load(i,k) );
7445 xmm1 += a1 * B.load(k,j );
7446 xmm2 += a1 * B.load(k,j+1UL);
7449 C(i,j ) -=
sum( xmm1 ) * scalar;
7450 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
7452 for( ; remainder && k<kend; ++k ) {
7453 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7454 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// Tile 1x1: single element C(i,j); the k range runs up to K here (the
// statement guarding this section is not visible in this excerpt).
7460 const size_t kbegin( ( IsUpper_v<MT4> )
7461 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7462 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7464 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
7471 xmm1 += A.load(i,k) * B.load(k,j);
7474 C(i,j) -=
sum( xmm1 ) * scalar;
7476 for( ; remainder && k<K; ++k ) {
7477 C(i,j) -= A(i,k) * B(k,j) * scalar;
// Small-matrix subtraction-assignment kernel, vectorized overload for a
// column-major target: enabled when MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//
// C is updated in register tiles: four rows combined with two and one
// column(s) (only when neither LOW nor UPP is set), then two rows combined
// with two and one column(s), then a single-row section. For every tile the
// visible code derives the k range from the structure of MT4/MT5, loads
// SIMD vectors from A and B, subtracts sum( xmmN ) * scalar from C, and
// finishes the k range element by element when `remainder` is set.
// NOTE: this excerpt omits several lines (declarations of i, j and k, the
// SIMD loop headers, some accumulation statements, and the statements that
// open the single-column and single-row sections).
7499 template<
typename MT3
7503 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7504 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder is true when at least one operand type is not padded; only then
// is kpos rounded and a scalar tail loop executed.
7506 constexpr
bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7508 const size_t M( A.rows() );
7509 const size_t N( B.columns() );
7510 const size_t K( A.columns() );
// ---- Four rows at a time; only taken when neither LOW nor UPP is set ----
7516 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// Tile 4x2.
7520 for( ; (j+2UL) <= N; j+=2UL )
// kbegin is masked with size_t(-SIMDSIZE); this rounds down to a multiple of
// SIMDSIZE provided SIMDSIZE is a power of two (assumed -- TODO confirm).
7522 const size_t kbegin( ( IsUpper_v<MT4> )
7523 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7524 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7525 const size_t kend( ( IsLower_v<MT4> )
7526 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
7527 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// kpos: end of the SIMD-processed part of the k range.
7529 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7532 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7537 const SIMDType a1( A.load(i ,k) );
7538 const SIMDType a2( A.load(i+1UL,k) );
7539 const SIMDType a3( A.load(i+2UL,k) );
7540 const SIMDType a4( A.load(i+3UL,k) );
7541 const SIMDType b1( B.load(k,j ) );
7542 const SIMDType b2( B.load(k,j+1UL) );
// Fold each accumulator via sum() and subtract the scaled result from C.
7553 C(i ,j ) -=
sum( xmm1 ) * scalar;
7554 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7555 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7556 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7557 C(i+2UL,j ) -=
sum( xmm5 ) * scalar;
7558 C(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
7559 C(i+3UL,j ) -=
sum( xmm7 ) * scalar;
7560 C(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// Element-wise tail for the k values not covered by the SIMD part.
7562 for( ; remainder && k<kend; ++k ) {
7563 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7564 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7565 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7566 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
7567 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
7568 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
7569 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
7570 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
// Tile 4x1: rows i..i+3 and the single column j (the statement guarding this
// section is not visible in this excerpt).
7576 const size_t kbegin( ( IsUpper_v<MT4> )
7577 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7578 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7579 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
7581 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7584 SIMDType xmm1, xmm2, xmm3, xmm4;
7588 const SIMDType b1( B.load(k,j) );
7589 xmm1 += A.load(i ,k) * b1;
7590 xmm2 += A.load(i+1UL,k) * b1;
7591 xmm3 += A.load(i+2UL,k) * b1;
7592 xmm4 += A.load(i+3UL,k) * b1;
7595 C(i ,j) -=
sum( xmm1 ) * scalar;
7596 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
7597 C(i+2UL,j) -=
sum( xmm3 ) * scalar;
7598 C(i+3UL,j) -=
sum( xmm4 ) * scalar;
7600 for( ; remainder && k<kend; ++k ) {
7601 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7602 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
7603 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
7604 C(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
// ---- Two rows (i, i+1) at a time ----
7609 for( ; (i+2UL) <= M; i+=2UL )
// Column range: limited to i+2 when LOW is set, started at i when UPP is set.
7611 const size_t jend( LOW ? i+2UL : N );
7612 size_t j( UPP ? i : 0UL );
// Tile 2x2.
7614 for( ; (j+2UL) <= jend; j+=2UL )
7616 const size_t kbegin( ( IsUpper_v<MT4> )
7617 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7618 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7619 const size_t kend( ( IsLower_v<MT4> )
7620 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7621 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
7623 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7626 SIMDType xmm1, xmm2, xmm3, xmm4;
7630 const SIMDType a1( A.load(i ,k) );
7631 const SIMDType a2( A.load(i+1UL,k) );
7632 const SIMDType b1( B.load(k,j ) );
7633 const SIMDType b2( B.load(k,j+1UL) );
7640 C(i ,j ) -=
sum( xmm1 ) * scalar;
7641 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
7642 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
7643 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
7645 for( ; remainder && k<kend; ++k ) {
7646 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
7647 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
7648 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
7649 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
// Tile 2x1: rows i, i+1 and the single column j (the statement guarding this
// section is not visible in this excerpt).
7655 const size_t kbegin( ( IsUpper_v<MT4> )
7656 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7657 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7658 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
7660 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7663 SIMDType xmm1, xmm2;
7667 const SIMDType b1( B.load(k,j) );
7668 xmm1 += A.load(i ,k) * b1;
7669 xmm2 += A.load(i+1UL,k) * b1;
7672 C(i ,j) -=
sum( xmm1 ) * scalar;
7673 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
7675 for( ; remainder && k<kend; ++k ) {
7676 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
7677 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
// ---- Single-row section: only row i is updated from here on (the
// statement opening this section is not visible in this excerpt) ----
7684 const size_t jend( LOW ? i+1UL : N );
7685 size_t j( UPP ? i : 0UL );
// Tile 1x2.
7687 for( ; (j+2UL) <= jend; j+=2UL )
7689 const size_t kbegin( ( IsUpper_v<MT4> )
7690 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7691 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7692 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
7694 const size_t kpos( remainder ? ( kend &
size_t(-
SIMDSIZE) ) : kend );
7697 SIMDType xmm1, xmm2;
7701 const SIMDType a1( A.load(i,k) );
7702 xmm1 += a1 * B.load(k,j );
7703 xmm2 += a1 * B.load(k,j+1UL);
7706 C(i,j ) -=
sum( xmm1 ) * scalar;
7707 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
7709 for( ; remainder && k<kend; ++k ) {
7710 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
7711 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
// Tile 1x1: single element C(i,j); the k range runs up to K here (the
// statement guarding this section is not visible in this excerpt).
7717 const size_t kbegin( ( IsUpper_v<MT4> )
7718 ?( ( IsLower_v<MT5> ?
max( i, j ) : i ) &
size_t(-
SIMDSIZE) )
7719 :( IsLower_v<MT5> ? ( j &
size_t(-
SIMDSIZE) ) : 0UL ) );
7721 const size_t kpos( remainder ? ( K &
size_t(-
SIMDSIZE) ) : K );
7728 xmm1 += A.load(i,k) * B.load(k,j);
7731 C(i,j) -=
sum( xmm1 ) * scalar;
7733 for( ; remainder && k<K; ++k ) {
7734 C(i,j) -= A(i,k) * B(k,j) * scalar;
// Large-matrix subtraction-assignment kernel, fallback overload: selected
// when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
// It simply forwards to selectDefaultSubAssignKernel().
7755 template<
typename MT3
7759 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7760 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7762 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel, vectorized overload.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines are elided, including the conditions that choose between the calls below.)
//
// This overload takes part in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t).
//   C      - target matrix
//   A, B   - operands of the product
//   scalar - scaling factor; it is negated before being handed to the kernels
7781 template<
typename MT3
7785 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7786 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Exactly one of lmmm / ummm / mmm is expected to run; the selecting branches are not
// visible here (presumably keyed on the LOW / UPP flags - TODO confirm).
// NOTE(review): the last two arguments look like alpha = -scalar and beta = 1, i.e.
// C = C - scalar*A*B - confirm against the kernel declarations.
7789 lmmm( C, A, B, -scalar, ST2(1) );
7791 ummm( C, A, B, -scalar, ST2(1) );
7793 mmm( C, A, B, -scalar, ST2(1) );
// BLAS-based subtraction-assignment kernel, fallback overload.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines are elided.)
//
// This overload takes part in overload resolution only when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
//   C      - target matrix
//   A, B   - operands of the product
//   scalar - scaling factor
7811 template<
typename MT3
7815 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7816 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// BLAS is not applicable for these types: forward all arguments unchanged to the
// large-matrix kernel selection.
7818 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-based subtraction-assignment kernel, BLAS overload.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines are elided; the #if guard and the template header were fused onto one
// line by the extraction.)
//
// Guarded by BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION and enabled
// only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t).
//   C      - target matrix
//   A, B   - operands of the product
//   scalar - scaling factor, converted to the element type of C before the BLAS calls
7823 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7837 template<
typename MT3
7841 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7842 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// All scalars handed to BLAS are cast to the element type of the target matrix.
7844 using ET = ElementType_t<MT3>;
// Triangular left operand: copy B (serially evaluated) into a temporary, run trmm on it
// with A on the left side and the positive scalar, then subtract the temporary from C.
// NOTE(review): relies on trmm overwriting tmp with scalar*A*tmp - confirm in the
// BLAS wrapper documentation.
7846 if( IsTriangular_v<MT4> ) {
7847 ResultType_t<MT3> tmp(
serial( B ) );
7848 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7849 subAssign( C, tmp );
// Triangular right operand: same scheme, with a copy of A and B applied from the right.
7851 else if( IsTriangular_v<MT5> ) {
7852 ResultType_t<MT3> tmp(
serial( A ) );
7853 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7854 subAssign( C, tmp );
// General case: a single gemm call; the scalar is negated and the second factor is 1,
// so the product is subtracted from the existing content of C (assuming the usual
// alpha/beta meaning of the last two gemm arguments).
7857 gemm( C, A, B,
ET(-scalar),
ET(1) );
// Schur-product assignment of a DMatScalarMultExpr to a dense matrix (friend function).
// (Listing excerpt: the leading numbers are the original header's line numbers and
// most of the body is elided.)
//   lhs - target dense matrix
//   rhs - scaled multiplication expression
7879 template<
typename MT
7881 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// The construction of `tmp` is not visible here; presumably it is an evaluated copy of
// rhs - TODO confirm. The call is applied to the unwrapped target (~lhs).
7893 schurAssign( ~lhs, tmp );
// Fragment of an SFINAE-constrained function template; its name line is elided in this
// listing (presumably one of the SMP assignment overloads - TODO confirm).
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
7924 template<
typename MT
7927 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped matrix/matrix multiplication (rhs.matrix_).
7934 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7935 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Two separate degenerate cases: an empty target, and a zero inner dimension
// (left.columns() == 0UL). The branch bodies are elided here.
7937 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
7940 else if( left.columns() == 0UL ) {
// Fragment of an SFINAE-constrained function template; its name line is elided in this
// listing (presumably an SMP assignment overload for a different target kind - TODO confirm).
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
7974 template<
typename MT
7977 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Temporary type: OppositeType when SO is true, ResultType otherwise.
7981 using TmpType = If_t< SO, OppositeType, ResultType >;
7993 const ForwardFunctor fwd;
// The expression is evaluated into a temporary of that type; how `tmp` and `fwd` are
// used afterwards is not visible in this excerpt.
7995 const TmpType tmp( rhs );
// Fragment of an SFINAE-constrained function template; its name line is elided in this
// listing (presumably the SMP addition assignment - TODO confirm).
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
8015 template<
typename MT
8018 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped matrix/matrix multiplication (rhs.matrix_).
8025 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8026 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Single combined degenerate-case check: empty target or zero inner dimension.
// The branch body is elided here.
8028 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Fragment of an SFINAE-constrained function template; its name line is elided in this
// listing (presumably the SMP subtraction assignment - TODO confirm).
// Enabled only when IsEvaluationRequired_v<MT,MT1,MT2> is true.
8065 template<
typename MT
8068 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped matrix/matrix multiplication (rhs.matrix_).
8075 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8076 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Single combined degenerate-case check: empty target or zero inner dimension.
// The branch body is elided here.
8078 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
8112 template<
typename MT
8195 template<
typename MT1
8197 inline decltype(
auto)
// declsym() overload for DMatTDMatMultExpr.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines, e.g. the remaining template parameters and the braces, are elided.)
//   dm - multiplication expression to re-declare
// Returns a new expression over the same two operands with the SF template flag set to
// true; HF, LF and UF are carried over unchanged.
8245 template<
typename MT1
8251 inline decltype(
auto)
declsym( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Only the flag changes; the operands are taken over from dm as they are.
8259 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
8260 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declherm() overload for DMatTDMatMultExpr.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines, e.g. the remaining template parameters and the braces, are elided.)
//   dm - multiplication expression to re-declare
// Returns a new expression over the same two operands with the HF template flag set to
// true; SF, LF and UF are carried over unchanged.
8291 template<
typename MT1
8297 inline decltype(
auto)
declherm( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Only the flag changes; the operands are taken over from dm as they are.
8305 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
8306 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decllow() overload for DMatTDMatMultExpr.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines, e.g. the remaining template parameters and the braces, are elided.)
//   dm - multiplication expression to re-declare
// Returns a new expression over the same two operands with the LF template flag set to
// true; SF, HF and UF are carried over unchanged.
8337 template<
typename MT1
8343 inline decltype(
auto)
decllow( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Only the flag changes; the operands are taken over from dm as they are.
8351 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
8352 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declupp() overload for DMatTDMatMultExpr.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines, e.g. the remaining template parameters and the braces, are elided.)
//   dm - multiplication expression to re-declare
// Returns a new expression over the same two operands with the UF template flag set to
// true; SF, HF and LF are carried over unchanged.
8383 template<
typename MT1
8389 inline decltype(
auto)
declupp( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Only the flag changes; the operands are taken over from dm as they are.
8397 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
8398 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decldiag() overload for DMatTDMatMultExpr.
// (Listing excerpt: the leading numbers are the original header's line numbers and
// some lines, e.g. the remaining template parameters and the braces, are elided.)
//   dm - multiplication expression to re-declare
// Returns a new expression over the same two operands with BOTH the LF and the UF
// template flags set to true; SF and HF are carried over unchanged. With LF and UF both
// true, the class's LOW and UPP flags are both true.
8429 template<
typename MT1
8435 inline decltype(
auto)
decldiag( const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Only the flags change; the operands are taken over from dm as they are.
8443 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
8444 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size trait specialization for DMatTDMatMultExpr, dimension index 0.
// (Listing excerpt: the leading numbers are the original header's line numbers; the
// class body is elided.)
// The compile-time size along index 0 is inherited from the left operand type MT1
// (index 0 presumably denotes the rows - confirm against the Size trait).
8460 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8461 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
8462 :
public Size<MT1,0UL>
// Size trait specialization for DMatTDMatMultExpr, dimension index 1.
// (Listing excerpt: the leading numbers are the original header's line numbers; the
// class body is elided.)
// The compile-time size along index 1 is inherited from the right operand type MT2
// (index 1 presumably denotes the columns - confirm against the Size trait).
8465 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8466 struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
8467 :
public Size<MT2,1UL>
// IsAligned trait specialization for DMatTDMatMultExpr.
// (Listing excerpt: the leading numbers are the original header's line numbers; the
// class body is elided.)
// The expression counts as aligned only if both operand types are aligned:
// the value is IsAligned_v<MT1> && IsAligned_v<MT2>.
8483 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
8484 struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
8485 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatTDMatMultExpr.h:288
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:426
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:172
Header file for basic type definitions.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:170
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template.The If_t alias declaration provides a convenien...
Definition: If.h:109
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
Header file for the declherm trait.
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:271
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:532
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:374
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:266
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:522
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1002
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:596
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:158
Header file for the Computation base class.
If_t< IsExpression_v< MT1 >, const MT1, const MT1 &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:274
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:431
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.The ElementType_t alias declaration provide...
Definition: Aliases.h:170
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:476
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:390
Header file for the IsComplexDouble type trait.
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:310
Constraint on the data type.
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:155
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:564
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
Generic wrapper for the decllow() function.
Definition: DeclLow.h:58
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:444
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2146
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:169
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:167
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1002
Header file for the IsLower type trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatTDMatMultExpr.h:301
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:280
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:432
Header file for the IsAligned type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:552
Generic wrapper for the null function.
Definition: Noop.h:59
Header file for the IsTriangular type trait.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:270
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:161
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:268
Header file for the exception macros of the math module.
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:477
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:604
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:468
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:454
Base class for all matrix/matrix multiplication expression templates.The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
Header file for the MatScalarMultExpr base class.
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:264
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:173
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:421
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:165
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:420
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:267
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:58
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemm). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:576
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatTDMatMultExpr.h:295
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:410
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:58
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:283
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:269
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:325
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:66
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:171
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:453
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:440
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:58
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:156
Expression object for dense matrix-transpose dense matrix multiplications.The DMatTDMatMultExpr class...
Definition: DMatTDMatMultExpr.h:144
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:586
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
Header file for the IsUpper type trait.
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1326
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:58
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:400
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:542
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:170
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the DeclSym functor.
If_t< IsExpression_v< MT2 >, const MT2, const MT2 &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:277
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:464
Header file for the TrueType type/value trait base class.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:160
Header file for the IsExpression type trait class.
Header file for the function trace functionality.