35#ifndef _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_DMATTDMATMULTEXPR_H_
137template<
typename MT1
144 :
public MatMatMultExpr< DenseMatrix< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Compile-time switch for the left-hand operand type MT1: true when either
// IsComputation_v<MT1> or RequiresEvaluation_v<MT1> holds.
// NOTE(review): the name suggests the left operand is evaluated into a
// temporary before the kernels run; the use site is not visible here - confirm.
159 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Same test as evaluateLeft, applied to the right-hand operand type MT2.
164 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the boolean template arguments SF, HF, LF and UF.
// NOTE(review): presumably SF/HF/LF/UF stand for symmetric/Hermitian/lower/upper
// result flags - confirm against the class template documentation.
//
// SYM: SF is set and none of HF, LF, UF is set.
168 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set (SF is not consulted).
169 static constexpr bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is set together with SF or HF.
170 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is set together with SF or HF.
// LOW and UPP can both be true (e.g. LF and UF both set); several kernel loops
// in this file are guarded by `!( LOW && UPP )` for that combination.
171 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
180 template<
typename T1,
typename T2,
typename T3 >
190 template<
typename T1,
typename T2,
typename T3 >
191 static constexpr bool UseBlasKernel_v =
194 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
195 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
196 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
197 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
198 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
199 IsBLASCompatible_v< ElementType_t<T1> > &&
200 IsBLASCompatible_v< ElementType_t<T2> > &&
201 IsBLASCompatible_v< ElementType_t<T3> > &&
212 template<
typename T1,
typename T2,
typename T3 >
213 static constexpr bool UseVectorizedDefaultKernel_v =
214 ( useOptimizedKernels &&
215 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
216 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
217 IsSIMDCombinable_v< ElementType_t<T1>
288 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
289 MT1::simdEnabled && MT2::simdEnabled &&
290 HasSIMDAdd_v<ET1,ET2> &&
291 HasSIMDMult_v<ET1,ET2> );
328 if( IsDiagonal_v<MT1> ) {
331 else if( IsDiagonal_v<MT2> ) {
334 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
335 const size_t begin( ( IsUpper_v<MT1> )
336 ?( ( IsLower_v<MT2> )
337 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
338 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
339 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
340 :( ( IsLower_v<MT2> )
341 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
343 const size_t end( ( IsLower_v<MT1> )
344 ?( ( IsUpper_v<MT2> )
345 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
346 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
347 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
348 :( ( IsUpper_v<MT2> )
349 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
350 :(
lhs_.columns() ) ) );
374 if( i >=
lhs_.rows() ) {
377 if( j >=
rhs_.columns() ) {
389 inline size_t rows() const noexcept {
400 return rhs_.columns();
430 template<
typename T >
431 inline bool canAlias(
const T* alias )
const noexcept {
432 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
442 template<
typename T >
443 inline bool isAliased(
const T* alias )
const noexcept {
444 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
454 return lhs_.isAligned() &&
rhs_.isAligned();
465 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
467 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
468 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD ) &&
469 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
492 template<
typename MT
501 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
504 else if( rhs.
lhs_.columns() == 0UL ) {
519 DMatTDMatMultExpr::selectAssignKernel( *lhs, A, B );
535 template<
typename MT3
538 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
540 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
541 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
542 selectSmallAssignKernel( C, A, B );
544 selectBlasAssignKernel( C, A, B );
563 template<
typename MT3
566 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
567 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
569 const size_t M( A.rows() );
570 const size_t N( B.columns() );
571 const size_t K( A.columns() );
575 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
576 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
578 const size_t iend( ( IsStrictlyUpper_v<MT4> )
579 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
583 for(
size_t i=0UL; i<ibegin; ++i ) {
584 for(
size_t j=0UL; j<N; ++j ) {
588 for(
size_t i=ibegin; i<iend; ++i )
590 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
591 ?( ( IsStrictlyUpper_v<MT4> )
592 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
593 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
594 :( ( IsStrictlyUpper_v<MT5> )
597 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
598 ?( ( IsStrictlyLower_v<MT4> )
599 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
600 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
601 :( ( IsStrictlyLower_v<MT5> )
602 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
603 :(
LOW ? i+1UL : N ) ) );
606 for(
size_t j=0UL; j<N; ++j ) {
614 for(
size_t j=(
SYM ||
HERM ? i : 0UL ); j<jbegin; ++j ) {
617 for(
size_t j=jbegin; j<jend; ++j )
619 const size_t kbegin( ( IsUpper_v<MT4> )
620 ?( ( IsLower_v<MT5> )
621 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
622 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
623 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
624 :( ( IsLower_v<MT5> )
625 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
627 const size_t kend( ( IsLower_v<MT4> )
628 ?( ( IsUpper_v<MT5> )
629 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
630 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
631 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
632 :( ( IsUpper_v<MT5> )
633 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
637 C(i,j) = A(i,kbegin) * B(kbegin,j);
638 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
639 C(i,j) += A(i,k) * B(k,j);
642 for(
size_t j=jend; j<N; ++j ) {
646 for(
size_t i=iend; i<M; ++i ) {
647 for(
size_t j=0UL; j<N; ++j ) {
653 for(
size_t i=1UL; i<M; ++i ) {
654 for(
size_t j=0UL; j<i; ++j ) {
655 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
677 template<
typename MT3
680 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
681 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
683 const size_t M( A.rows() );
684 const size_t N( B.columns() );
685 const size_t K( A.columns() );
689 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
690 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
692 const size_t jend( ( IsStrictlyLower_v<MT5> )
693 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
697 for(
size_t j=0UL; j<jbegin; ++j ) {
698 for(
size_t i=0UL; i<M; ++i ) {
702 for(
size_t j=jbegin; j<jend; ++j )
704 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
705 ?( ( IsStrictlyLower_v<MT4> )
706 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
707 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
708 :( ( IsStrictlyLower_v<MT4> )
711 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
712 ?( ( IsStrictlyUpper_v<MT4> )
713 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
714 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
715 :( ( IsStrictlyUpper_v<MT4> )
716 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
717 :(
UPP ? j+1UL : M ) ) );
720 for(
size_t i=0UL; i<M; ++i ) {
728 for(
size_t i=(
SYM ||
HERM ? j : 0UL ); i<ibegin; ++i ) {
731 for(
size_t i=ibegin; i<iend; ++i )
733 const size_t kbegin( ( IsUpper_v<MT4> )
734 ?( ( IsLower_v<MT5> )
735 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
736 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
737 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
738 :( ( IsLower_v<MT5> )
739 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
741 const size_t kend( ( IsLower_v<MT4> )
742 ?( ( IsUpper_v<MT5> )
743 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
744 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
745 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
746 :( ( IsUpper_v<MT5> )
747 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
751 C(i,j) = A(i,kbegin) * B(kbegin,j);
752 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
753 C(i,j) += A(i,k) * B(k,j);
756 for(
size_t i=iend; i<M; ++i ) {
760 for(
size_t j=jend; j<N; ++j ) {
761 for(
size_t i=0UL; i<M; ++i ) {
767 for(
size_t j=1UL; j<N; ++j ) {
768 for(
size_t i=0UL; i<j; ++i ) {
769 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
791 template<
typename MT3
794 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
795 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
797 const size_t M( A.rows() );
798 const size_t N( B.columns() );
800 for(
size_t i=0UL; i<M; ++i )
802 const size_t jbegin( ( IsUpper_v<MT4> )
803 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
805 const size_t jend( ( IsLower_v<MT4> )
806 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
810 if( IsUpper_v<MT4> ) {
811 for(
size_t j=0UL; j<jbegin; ++j ) {
815 for(
size_t j=jbegin; j<jend; ++j ) {
816 C(i,j) = A(i,j) * B(j,j);
818 if( IsLower_v<MT4> ) {
819 for(
size_t j=jend; j<N; ++j ) {
842 template<
typename MT3
845 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
846 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
848 constexpr size_t block( BLOCK_SIZE );
850 const size_t M( A.rows() );
851 const size_t N( B.columns() );
853 for(
size_t jj=0UL; jj<N; jj+=block ) {
854 const size_t jend(
min( N, jj+block ) );
855 for(
size_t ii=0UL; ii<M; ii+=block ) {
856 const size_t iend(
min( M, ii+block ) );
857 for(
size_t j=jj; j<jend; ++j )
859 const size_t ibegin( ( IsLower_v<MT4> )
860 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
862 const size_t ipos( ( IsUpper_v<MT4> )
863 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
866 if( IsLower_v<MT4> ) {
867 for(
size_t i=ii; i<ibegin; ++i ) {
871 for(
size_t i=ibegin; i<ipos; ++i ) {
872 C(i,j) = A(i,j) * B(j,j);
874 if( IsUpper_v<MT4> ) {
875 for(
size_t i=ipos; i<iend; ++i ) {
900 template<
typename MT3
903 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
904 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
906 constexpr size_t block( BLOCK_SIZE );
908 const size_t M( A.rows() );
909 const size_t N( B.columns() );
911 for(
size_t ii=0UL; ii<M; ii+=block ) {
912 const size_t iend(
min( M, ii+block ) );
913 for(
size_t jj=0UL; jj<N; jj+=block ) {
914 const size_t jend(
min( N, jj+block ) );
915 for(
size_t i=ii; i<iend; ++i )
917 const size_t jbegin( ( IsUpper_v<MT5> )
918 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
920 const size_t jpos( ( IsLower_v<MT5> )
921 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
924 if( IsUpper_v<MT5> ) {
925 for(
size_t j=jj; j<jbegin; ++j ) {
929 for(
size_t j=jbegin; j<jpos; ++j ) {
930 C(i,j) = A(i,i) * B(i,j);
932 if( IsLower_v<MT5> ) {
933 for(
size_t j=jpos; j<jend; ++j ) {
958 template<
typename MT3
961 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
962 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
964 const size_t M( A.rows() );
965 const size_t N( B.columns() );
967 for(
size_t j=0UL; j<N; ++j )
969 const size_t ibegin( ( IsLower_v<MT5> )
970 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
972 const size_t iend( ( IsUpper_v<MT5> )
973 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
977 if( IsLower_v<MT5> ) {
978 for(
size_t i=0UL; i<ibegin; ++i ) {
982 for(
size_t i=ibegin; i<iend; ++i ) {
983 C(i,j) = A(i,i) * B(i,j);
985 if( IsUpper_v<MT5> ) {
986 for(
size_t i=iend; i<M; ++i ) {
1009 template<
typename MT3
1012 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1013 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
1017 for(
size_t i=0UL; i<A.rows(); ++i ) {
1018 C(i,i) = A(i,i) * B(i,i);
1038 template<
typename MT3
1041 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1042 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1044 selectDefaultAssignKernel( C, A, B );
1064 template<
typename MT3
1067 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1068 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1070 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1072 const size_t M( A.rows() );
1073 const size_t N( B.columns() );
1074 const size_t K( A.columns() );
1080 for( ; !(
LOW &&
UPP ) && (i+3UL) <= M; i+=3UL )
1082 const size_t jend(
LOW ? i+3UL : N );
1087 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
1088 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
1089 C(i+2UL,j) =
HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
1095 reset( C(i+1UL,j) );
1096 reset( C(i+2UL,j) );
1100 for( ; (j+3UL) <= jend; j+=3UL )
1102 const size_t kbegin( ( IsUpper_v<MT4> )
1105 const size_t kend( ( IsLower_v<MT4> )
1106 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
1107 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
1134 a2 = A.load(i+1UL,k);
1135 a3 = A.load(i+2UL,k);
1137 b2 = B.load(k,j+1UL);
1138 b3 = B.load(k,j+2UL);
1150 C(i ,j ) =
sum( xmm1 );
1151 C(i ,j+1UL) =
sum( xmm2 );
1152 C(i ,j+2UL) =
sum( xmm3 );
1153 C(i+1UL,j ) =
sum( xmm4 );
1154 C(i+1UL,j+1UL) =
sum( xmm5 );
1155 C(i+1UL,j+2UL) =
sum( xmm6 );
1156 C(i+2UL,j ) =
sum( xmm7 );
1157 C(i+2UL,j+1UL) =
sum( xmm8 );
1158 C(i+2UL,j+2UL) =
sum( xmm9 );
1160 for( ; remainder && k<kend; ++k ) {
1161 C(i ,j ) += A(i ,k) * B(k,j );
1162 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1163 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1164 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1165 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1166 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1167 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1168 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1169 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
1184 for( ++k; k<kend; ++k ) {
1185 value1 += A(i ,k) * B(k,j );
1186 value2 += A(i ,k) * B(k,j+1UL);
1187 value3 += A(i ,k) * B(k,j+2UL);
1188 value4 += A(i+1UL,k) * B(k,j );
1189 value5 += A(i+1UL,k) * B(k,j+1UL);
1190 value6 += A(i+1UL,k) * B(k,j+2UL);
1191 value7 += A(i+2UL,k) * B(k,j );
1192 value8 += A(i+2UL,k) * B(k,j+1UL);
1193 value9 += A(i+2UL,k) * B(k,j+2UL);
1197 C(i ,j+1UL) = value2;
1198 C(i ,j+2UL) = value3;
1199 C(i+1UL,j ) = value4;
1200 C(i+1UL,j+1UL) = value5;
1201 C(i+1UL,j+2UL) = value6;
1202 C(i+2UL,j ) = value7;
1203 C(i+2UL,j+1UL) = value8;
1204 C(i+2UL,j+2UL) = value9;
1209 reset( C(i ,j+1UL) );
1210 reset( C(i ,j+2UL) );
1211 reset( C(i+1UL,j ) );
1212 reset( C(i+1UL,j+1UL) );
1213 reset( C(i+1UL,j+2UL) );
1214 reset( C(i+2UL,j ) );
1215 reset( C(i+2UL,j+1UL) );
1216 reset( C(i+2UL,j+2UL) );
1220 for( ; (j+2UL) <= jend; j+=2UL )
1222 const size_t kbegin( ( IsUpper_v<MT4> )
1225 const size_t kend( ( IsLower_v<MT4> )
1226 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
1227 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1250 a2 = A.load(i+1UL,k);
1251 a3 = A.load(i+2UL,k);
1253 b2 = B.load(k,j+1UL);
1262 C(i ,j ) =
sum( xmm1 );
1263 C(i ,j+1UL) =
sum( xmm2 );
1264 C(i+1UL,j ) =
sum( xmm3 );
1265 C(i+1UL,j+1UL) =
sum( xmm4 );
1266 C(i+2UL,j ) =
sum( xmm5 );
1267 C(i+2UL,j+1UL) =
sum( xmm6 );
1269 for( ; remainder && k<kend; ++k ) {
1270 C(i ,j ) += A(i ,k) * B(k,j );
1271 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1272 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1273 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1274 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1275 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1287 for( ++k; k<kend; ++k ) {
1288 value1 += A(i ,k) * B(k,j );
1289 value2 += A(i ,k) * B(k,j+1UL);
1290 value3 += A(i+1UL,k) * B(k,j );
1291 value4 += A(i+1UL,k) * B(k,j+1UL);
1292 value5 += A(i+2UL,k) * B(k,j );
1293 value6 += A(i+2UL,k) * B(k,j+1UL);
1297 C(i ,j+1UL) = value2;
1298 C(i+1UL,j ) = value3;
1299 C(i+1UL,j+1UL) = value4;
1300 C(i+2UL,j ) = value5;
1301 C(i+2UL,j+1UL) = value6;
1306 reset( C(i ,j+1UL) );
1307 reset( C(i+1UL,j ) );
1308 reset( C(i+1UL,j+1UL) );
1309 reset( C(i+2UL,j ) );
1310 reset( C(i+2UL,j+1UL) );
1316 const size_t kbegin( ( IsUpper_v<MT4> )
1319 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
1329 SIMDType xmm1( A.load(i ,k) * b1 );
1330 SIMDType xmm2( A.load(i+1UL,k) * b1 );
1331 SIMDType xmm3( A.load(i+2UL,k) * b1 );
1335 xmm1 += A.load(i ,k) * b1;
1336 xmm2 += A.load(i+1UL,k) * b1;
1337 xmm3 += A.load(i+2UL,k) * b1;
1340 C(i ,j) =
sum( xmm1 );
1341 C(i+1UL,j) =
sum( xmm2 );
1342 C(i+2UL,j) =
sum( xmm3 );
1344 for( ; remainder && k<kend; ++k ) {
1345 C(i ,j) += A(i ,k) * B(k,j);
1346 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1347 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
1356 for( ++k; k<kend; ++k ) {
1357 value1 += A(i ,k) * B(k,j);
1358 value2 += A(i+1UL,k) * B(k,j);
1359 value3 += A(i+2UL,k) * B(k,j);
1363 C(i+1UL,j) = value2;
1364 C(i+2UL,j) = value3;
1369 reset( C(i+1UL,j) );
1370 reset( C(i+2UL,j) );
1379 reset( C(i+1UL,j) );
1380 reset( C(i+2UL,j) );
1385 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
1387 const size_t jend(
LOW ? i+2UL : N );
1392 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
1393 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
1399 reset( C(i+1UL,j) );
1403 for( ; (j+4UL) <= jend; j+=4UL )
1405 const size_t kbegin( ( IsUpper_v<MT4> )
1408 const size_t kend( ( IsLower_v<MT4> )
1409 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
1410 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
1436 a2 = A.load(i+1UL,k);
1438 b2 = B.load(k,j+1UL);
1439 b3 = B.load(k,j+2UL);
1440 b4 = B.load(k,j+3UL);
1451 C(i ,j ) =
sum( xmm1 );
1452 C(i ,j+1UL) =
sum( xmm2 );
1453 C(i ,j+2UL) =
sum( xmm3 );
1454 C(i ,j+3UL) =
sum( xmm4 );
1455 C(i+1UL,j ) =
sum( xmm5 );
1456 C(i+1UL,j+1UL) =
sum( xmm6 );
1457 C(i+1UL,j+2UL) =
sum( xmm7 );
1458 C(i+1UL,j+3UL) =
sum( xmm8 );
1460 for( ; remainder && k<kend; ++k ) {
1461 C(i ,j ) += A(i ,k) * B(k,j );
1462 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1463 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
1464 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
1465 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1466 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1467 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
1468 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
1482 for( ++k; k<kend; ++k ) {
1483 value1 += A(i ,k) * B(k,j );
1484 value2 += A(i ,k) * B(k,j+1UL);
1485 value3 += A(i ,k) * B(k,j+2UL);
1486 value4 += A(i ,k) * B(k,j+3UL);
1487 value5 += A(i+1UL,k) * B(k,j );
1488 value6 += A(i+1UL,k) * B(k,j+1UL);
1489 value7 += A(i+1UL,k) * B(k,j+2UL);
1490 value8 += A(i+1UL,k) * B(k,j+3UL);
1494 C(i ,j+1UL) = value2;
1495 C(i ,j+2UL) = value3;
1496 C(i ,j+3UL) = value4;
1497 C(i+1UL,j ) = value5;
1498 C(i+1UL,j+1UL) = value6;
1499 C(i+1UL,j+2UL) = value7;
1500 C(i+1UL,j+3UL) = value8;
1505 reset( C(i ,j+1UL) );
1506 reset( C(i ,j+2UL) );
1507 reset( C(i ,j+3UL) );
1508 reset( C(i+1UL,j ) );
1509 reset( C(i+1UL,j+1UL) );
1510 reset( C(i+1UL,j+2UL) );
1511 reset( C(i+1UL,j+3UL) );
1515 for( ; (j+2UL) <= jend; j+=2UL )
1517 const size_t kbegin( ( IsUpper_v<MT4> )
1520 const size_t kend( ( IsLower_v<MT4> )
1521 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
1522 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1542 a2 = A.load(i+1UL,k);
1544 b2 = B.load(k,j+1UL);
1551 C(i ,j ) =
sum( xmm1 );
1552 C(i ,j+1UL) =
sum( xmm2 );
1553 C(i+1UL,j ) =
sum( xmm3 );
1554 C(i+1UL,j+1UL) =
sum( xmm4 );
1556 for( ; remainder && k<kend; ++k ) {
1557 C(i ,j ) += A(i ,k) * B(k,j );
1558 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1559 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1560 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1570 for( ++k; k<kend; ++k ) {
1571 value1 += A(i ,k) * B(k,j );
1572 value2 += A(i ,k) * B(k,j+1UL);
1573 value3 += A(i+1UL,k) * B(k,j );
1574 value4 += A(i+1UL,k) * B(k,j+1UL);
1578 C(i ,j+1UL) = value2;
1579 C(i+1UL,j ) = value3;
1580 C(i+1UL,j+1UL) = value4;
1585 reset( C(i ,j+1UL) );
1586 reset( C(i+1UL,j ) );
1587 reset( C(i+1UL,j+1UL) );
1593 const size_t kbegin( ( IsUpper_v<MT4> )
1596 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
1606 SIMDType xmm1( A.load(i ,k) * b1 );
1607 SIMDType xmm2( A.load(i+1UL,k) * b1 );
1611 xmm1 += A.load(i ,k) * b1;
1612 xmm2 += A.load(i+1UL,k) * b1;
1615 C(i ,j) =
sum( xmm1 );
1616 C(i+1UL,j) =
sum( xmm2 );
1618 for( ; remainder && k<kend; ++k ) {
1619 C(i ,j) += A(i ,k) * B(k,j);
1620 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
1628 for( ++k; k<kend; ++k ) {
1629 value1 += A(i ,k) * B(k,j);
1630 value2 += A(i+1UL,k) * B(k,j);
1634 C(i+1UL,j) = value2;
1639 reset( C(i+1UL,j) );
1648 reset( C(i+1UL,j) );
1655 const size_t jend(
LOW ? i+1UL : N );
1660 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1669 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
1671 const size_t kbegin( ( IsUpper_v<MT4> )
1674 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
1684 SIMDType xmm1( a1 * B.load(k,j ) );
1685 SIMDType xmm2( a1 * B.load(k,j+1UL) );
1686 SIMDType xmm3( a1 * B.load(k,j+2UL) );
1687 SIMDType xmm4( a1 * B.load(k,j+3UL) );
1691 xmm1 += a1 * B.load(k,j );
1692 xmm2 += a1 * B.load(k,j+1UL);
1693 xmm3 += a1 * B.load(k,j+2UL);
1694 xmm4 += a1 * B.load(k,j+3UL);
1697 C(i,j ) =
sum( xmm1 );
1698 C(i,j+1UL) =
sum( xmm2 );
1699 C(i,j+2UL) =
sum( xmm3 );
1700 C(i,j+3UL) =
sum( xmm4 );
1702 for( ; remainder && k<kend; ++k ) {
1703 C(i,j ) += A(i,k) * B(k,j );
1704 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1705 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
1706 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
1716 for( ++k; k<kend; ++k ) {
1717 value1 += A(i,k) * B(k,j );
1718 value2 += A(i,k) * B(k,j+1UL);
1719 value3 += A(i,k) * B(k,j+2UL);
1720 value4 += A(i,k) * B(k,j+3UL);
1724 C(i,j+1UL) = value2;
1725 C(i,j+2UL) = value3;
1726 C(i,j+3UL) = value4;
1731 reset( C(i,j+1UL) );
1732 reset( C(i,j+2UL) );
1733 reset( C(i,j+3UL) );
1737 for( ; !(
LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
1739 const size_t kbegin( ( IsUpper_v<MT4> )
1742 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
1752 SIMDType xmm1( a1 * B.load(k,j ) );
1753 SIMDType xmm2( a1 * B.load(k,j+1UL) );
1757 xmm1 += a1 * B.load(k,j );
1758 xmm2 += a1 * B.load(k,j+1UL);
1761 C(i,j ) =
sum( xmm1 );
1762 C(i,j+1UL) =
sum( xmm2 );
1764 for( ; remainder && k<kend; ++k ) {
1765 C(i,j ) += A(i,k) * B(k,j );
1766 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
1774 for( ++k; k<kend; ++k ) {
1775 value1 += A(i,k) * B(k,j );
1776 value2 += A(i,k) * B(k,j+1UL);
1780 C(i,j+1UL) = value2;
1785 reset( C(i,j+1UL) );
1791 const size_t kbegin( ( IsUpper_v<MT4> )
1802 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
1805 xmm1 += A.load(i,k) * B.load(k,j);
1808 C(i,j) =
sum( xmm1 );
1810 for( ; remainder && k<K; ++k ) {
1811 C(i,j) += A(i,k) * B(k,j);
1818 for( ++k; k<K; ++k ) {
1819 value += A(i,k) * B(k,j);
1857 template<
typename MT3
1860 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1861 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1863 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
1865 const size_t M( A.rows() );
1866 const size_t N( B.columns() );
1867 const size_t K( A.columns() );
1873 for( ; !(
LOW &&
UPP ) && (i+4UL) <= M; i+=4UL )
1875 const size_t jend(
LOW ? i+4UL : N );
1880 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
1881 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
1882 C(i+2UL,j) =
HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
1883 C(i+3UL,j) =
HERM ?
conj( C(j,i+3UL) ) : C(j,i+3UL);
1889 reset( C(i+1UL,j) );
1890 reset( C(i+2UL,j) );
1891 reset( C(i+3UL,j) );
1895 for( ; (j+2UL) <= jend; j+=2UL )
1897 const size_t kbegin( ( IsUpper_v<MT4> )
1900 const size_t kend( ( IsLower_v<MT4> )
1901 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
1902 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
1928 a2 = A.load(i+1UL,k);
1929 a3 = A.load(i+2UL,k);
1930 a4 = A.load(i+3UL,k);
1932 b2 = B.load(k,j+1UL);
1943 C(i ,j ) =
sum( xmm1 );
1944 C(i ,j+1UL) =
sum( xmm2 );
1945 C(i+1UL,j ) =
sum( xmm3 );
1946 C(i+1UL,j+1UL) =
sum( xmm4 );
1947 C(i+2UL,j ) =
sum( xmm5 );
1948 C(i+2UL,j+1UL) =
sum( xmm6 );
1949 C(i+3UL,j ) =
sum( xmm7 );
1950 C(i+3UL,j+1UL) =
sum( xmm8 );
1952 for( ; remainder && k<kend; ++k ) {
1953 C(i ,j ) += A(i ,k) * B(k,j );
1954 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
1955 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
1956 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
1957 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
1958 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
1959 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
1960 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
1974 for( ++k; k<kend; ++k ) {
1975 value1 += A(i ,k) * B(k,j );
1976 value2 += A(i ,k) * B(k,j+1UL);
1977 value3 += A(i+1UL,k) * B(k,j );
1978 value4 += A(i+1UL,k) * B(k,j+1UL);
1979 value5 += A(i+2UL,k) * B(k,j );
1980 value6 += A(i+2UL,k) * B(k,j+1UL);
1981 value7 += A(i+3UL,k) * B(k,j );
1982 value8 += A(i+3UL,k) * B(k,j+1UL);
1986 C(i ,j+1UL) = value2;
1987 C(i+1UL,j ) = value3;
1988 C(i+1UL,j+1UL) = value4;
1989 C(i+2UL,j ) = value5;
1990 C(i+2UL,j+1UL) = value6;
1991 C(i+3UL,j ) = value7;
1992 C(i+3UL,j+1UL) = value8;
1997 reset( C(i ,j+1UL) );
1998 reset( C(i+1UL,j ) );
1999 reset( C(i+1UL,j+1UL) );
2000 reset( C(i+2UL,j ) );
2001 reset( C(i+2UL,j+1UL) );
2002 reset( C(i+3UL,j ) );
2003 reset( C(i+3UL,j+1UL) );
2009 const size_t kbegin( ( IsUpper_v<MT4> )
2012 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
2022 SIMDType xmm1( A.load(i ,k) * b1 );
2023 SIMDType xmm2( A.load(i+1UL,k) * b1 );
2024 SIMDType xmm3( A.load(i+2UL,k) * b1 );
2025 SIMDType xmm4( A.load(i+3UL,k) * b1 );
2029 xmm1 += A.load(i ,k) * b1;
2030 xmm2 += A.load(i+1UL,k) * b1;
2031 xmm3 += A.load(i+2UL,k) * b1;
2032 xmm4 += A.load(i+3UL,k) * b1;
2035 C(i ,j) =
sum( xmm1 );
2036 C(i+1UL,j) =
sum( xmm2 );
2037 C(i+2UL,j) =
sum( xmm3 );
2038 C(i+3UL,j) =
sum( xmm4 );
2040 for( ; remainder && k<kend; ++k ) {
2041 C(i ,j) += A(i ,k) * B(k,j);
2042 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2043 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2044 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
2054 for( ++k; k<kend; ++k ) {
2055 value1 += A(i ,k) * B(k,j);
2056 value2 += A(i+1UL,k) * B(k,j);
2057 value3 += A(i+2UL,k) * B(k,j);
2058 value4 += A(i+3UL,k) * B(k,j);
2062 C(i+1UL,j) = value2;
2063 C(i+2UL,j) = value3;
2064 C(i+3UL,j) = value4;
2069 reset( C(i+1UL,j) );
2070 reset( C(i+2UL,j) );
2071 reset( C(i+3UL,j) );
2080 reset( C(i+1UL,j) );
2081 reset( C(i+2UL,j) );
2082 reset( C(i+3UL,j) );
2087 for( ; !(
LOW &&
UPP ) && (i+3UL) <= M; i+=3UL )
2089 const size_t jend(
LOW ? i+3UL : N );
2094 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
2095 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
2096 C(i+2UL,j) =
HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
2102 reset( C(i+1UL,j) );
2103 reset( C(i+2UL,j) );
2107 for( ; (j+3UL) <= jend; j+=3UL )
2109 const size_t kbegin( ( IsUpper_v<MT4> )
2112 const size_t kend( ( IsLower_v<MT4> )
2113 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
2114 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
2141 a2 = A.load(i+1UL,k);
2142 a3 = A.load(i+2UL,k);
2144 b2 = B.load(k,j+1UL);
2145 b3 = B.load(k,j+2UL);
2157 C(i ,j ) =
sum( xmm1 );
2158 C(i ,j+1UL) =
sum( xmm2 );
2159 C(i ,j+2UL) =
sum( xmm3 );
2160 C(i+1UL,j ) =
sum( xmm4 );
2161 C(i+1UL,j+1UL) =
sum( xmm5 );
2162 C(i+1UL,j+2UL) =
sum( xmm6 );
2163 C(i+2UL,j ) =
sum( xmm7 );
2164 C(i+2UL,j+1UL) =
sum( xmm8 );
2165 C(i+2UL,j+2UL) =
sum( xmm9 );
2167 for( ; remainder && k<kend; ++k ) {
2168 C(i ,j ) += A(i ,k) * B(k,j );
2169 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2170 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
2171 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2172 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2173 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
2174 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2175 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2176 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
2191 for( ++k; k<kend; ++k ) {
2192 value1 += A(i ,k) * B(k,j );
2193 value2 += A(i ,k) * B(k,j+1UL);
2194 value3 += A(i ,k) * B(k,j+2UL);
2195 value4 += A(i+1UL,k) * B(k,j );
2196 value5 += A(i+1UL,k) * B(k,j+1UL);
2197 value6 += A(i+1UL,k) * B(k,j+2UL);
2198 value7 += A(i+2UL,k) * B(k,j );
2199 value8 += A(i+2UL,k) * B(k,j+1UL);
2200 value9 += A(i+2UL,k) * B(k,j+2UL);
2204 C(i ,j+1UL) = value2;
2205 C(i ,j+2UL) = value3;
2206 C(i+1UL,j ) = value4;
2207 C(i+1UL,j+1UL) = value5;
2208 C(i+1UL,j+2UL) = value6;
2209 C(i+2UL,j ) = value7;
2210 C(i+2UL,j+1UL) = value8;
2211 C(i+2UL,j+2UL) = value9;
2216 reset( C(i ,j+1UL) );
2217 reset( C(i ,j+2UL) );
2218 reset( C(i+1UL,j ) );
2219 reset( C(i+1UL,j+1UL) );
2220 reset( C(i+1UL,j+2UL) );
2221 reset( C(i+2UL,j ) );
2222 reset( C(i+2UL,j+1UL) );
2223 reset( C(i+2UL,j+2UL) );
2227 for( ; (j+2UL) <= jend; j+=2UL )
2229 const size_t kbegin( ( IsUpper_v<MT4> )
2232 const size_t kend( ( IsLower_v<MT4> )
2233 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
2234 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2257 a2 = A.load(i+1UL,k);
2258 a3 = A.load(i+2UL,k);
2260 b2 = B.load(k,j+1UL);
2269 C(i ,j ) =
sum( xmm1 );
2270 C(i ,j+1UL) =
sum( xmm2 );
2271 C(i+1UL,j ) =
sum( xmm3 );
2272 C(i+1UL,j+1UL) =
sum( xmm4 );
2273 C(i+2UL,j ) =
sum( xmm5 );
2274 C(i+2UL,j+1UL) =
sum( xmm6 );
2276 for( ; remainder && k<kend; ++k ) {
2277 C(i ,j ) += A(i ,k) * B(k,j );
2278 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2279 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2280 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2281 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
2282 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
2294 for( ++k; k<kend; ++k ) {
2295 value1 += A(i ,k) * B(k,j );
2296 value2 += A(i ,k) * B(k,j+1UL);
2297 value3 += A(i+1UL,k) * B(k,j );
2298 value4 += A(i+1UL,k) * B(k,j+1UL);
2299 value5 += A(i+2UL,k) * B(k,j );
2300 value6 += A(i+2UL,k) * B(k,j+1UL);
2304 C(i ,j+1UL) = value2;
2305 C(i+1UL,j ) = value3;
2306 C(i+1UL,j+1UL) = value4;
2307 C(i+2UL,j ) = value5;
2308 C(i+2UL,j+1UL) = value6;
2313 reset( C(i ,j+1UL) );
2314 reset( C(i+1UL,j ) );
2315 reset( C(i+1UL,j+1UL) );
2316 reset( C(i+2UL,j ) );
2317 reset( C(i+2UL,j+1UL) );
2323 const size_t kbegin( ( IsUpper_v<MT4> )
2326 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
2336 SIMDType xmm1( A.load(i ,k) * b1 );
2337 SIMDType xmm2( A.load(i+1UL,k) * b1 );
2338 SIMDType xmm3( A.load(i+2UL,k) * b1 );
2342 xmm1 += A.load(i ,k) * b1;
2343 xmm2 += A.load(i+1UL,k) * b1;
2344 xmm3 += A.load(i+2UL,k) * b1;
2347 C(i ,j) =
sum( xmm1 );
2348 C(i+1UL,j) =
sum( xmm2 );
2349 C(i+2UL,j) =
sum( xmm3 );
2351 for( ; remainder && k<kend; ++k ) {
2352 C(i ,j) += A(i ,k) * B(k,j);
2353 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2354 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
2363 for( ++k; k<kend; ++k ) {
2364 value1 += A(i ,k) * B(k,j);
2365 value2 += A(i+1UL,k) * B(k,j);
2366 value3 += A(i+2UL,k) * B(k,j);
2370 C(i+1UL,j) = value2;
2371 C(i+2UL,j) = value3;
2376 reset( C(i+1UL,j) );
2377 reset( C(i+2UL,j) );
2386 reset( C(i+1UL,j) );
2387 reset( C(i+2UL,j) );
2392 for( ; (i+2UL) <= M; i+=2UL )
2394 const size_t jend(
LOW ? i+2UL : N );
2399 C(i ,j) =
HERM ?
conj( C(j,i ) ) : C(j,i );
2400 C(i+1UL,j) =
HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
2406 reset( C(i+1UL,j) );
2410 for( ; (j+2UL) <= jend; j+=2UL )
2412 const size_t kbegin( ( IsUpper_v<MT4> )
2415 const size_t kend( ( IsLower_v<MT4> )
2416 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
2417 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
2437 a2 = A.load(i+1UL,k);
2439 b2 = B.load(k,j+1UL);
2446 C(i ,j ) =
sum( xmm1 );
2447 C(i ,j+1UL) =
sum( xmm2 );
2448 C(i+1UL,j ) =
sum( xmm3 );
2449 C(i+1UL,j+1UL) =
sum( xmm4 );
2451 for( ; remainder && k<kend; ++k ) {
2452 C(i ,j ) += A(i ,k) * B(k,j );
2453 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
2454 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
2455 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
2465 for( ++k; k<kend; ++k ) {
2466 value1 += A(i ,k) * B(k,j );
2467 value2 += A(i ,k) * B(k,j+1UL);
2468 value3 += A(i+1UL,k) * B(k,j );
2469 value4 += A(i+1UL,k) * B(k,j+1UL);
2473 C(i ,j+1UL) = value2;
2474 C(i+1UL,j ) = value3;
2475 C(i+1UL,j+1UL) = value4;
2480 reset( C(i ,j+1UL) );
2481 reset( C(i+1UL,j ) );
2482 reset( C(i+1UL,j+1UL) );
2488 const size_t kbegin( ( IsUpper_v<MT4> )
2491 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
2501 SIMDType xmm1( A.load(i ,k) * b1 );
2502 SIMDType xmm2( A.load(i+1UL,k) * b1 );
2506 xmm1 += A.load(i ,k) * b1;
2507 xmm2 += A.load(i+1UL,k) * b1;
2510 C(i ,j) =
sum( xmm1 );
2511 C(i+1UL,j) =
sum( xmm2 );
2513 for( ; remainder && k<kend; ++k ) {
2514 C(i ,j) += A(i ,k) * B(k,j);
2515 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2523 for( ++k; k<kend; ++k ) {
2524 value1 += A(i ,k) * B(k,j);
2525 value2 += A(i+1UL,k) * B(k,j);
2529 C(i+1UL,j) = value2;
2534 reset( C(i+1UL,j) );
2543 reset( C(i+1UL,j) );
2550 const size_t jend(
LOW ? i+1UL : N );
2555 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
2564 for( ; !(
LOW &&
UPP ) && (j+2UL) <= jend; j+=2UL )
2566 const size_t kbegin( ( IsUpper_v<MT4> )
2569 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
2579 SIMDType xmm1( a1 * B.load(k,j ) );
2580 SIMDType xmm2( a1 * B.load(k,j+1UL) );
2584 xmm1 += a1 * B.load(k,j );
2585 xmm2 += a1 * B.load(k,j+1UL);
2588 C(i,j ) =
sum( xmm1 );
2589 C(i,j+1UL) =
sum( xmm2 );
2591 for( ; remainder && k<kend; ++k ) {
2592 C(i,j ) += A(i,k) * B(k,j );
2593 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2601 for( ++k; k<kend; ++k ) {
2602 value1 += A(i,k) * B(k,j );
2603 value2 += A(i,k) * B(k,j+1UL);
2607 C(i,j+1UL) = value2;
2612 reset( C(i,j+1UL) );
2618 const size_t kbegin( ( IsUpper_v<MT4> )
2629 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
2632 xmm1 += A.load(i,k) * B.load(k,j);
2635 C(i,j) =
sum( xmm1 );
2637 for( ; remainder && k<K; ++k ) {
2638 C(i,j) += A(i,k) * B(k,j);
2645 for( ++k; k<K; ++k ) {
2646 value += A(i,k) * B(k,j);
// Fallback overload of the large-matrix assignment kernel. It takes part in
// overload resolution only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is
// false (DisableIf_t) and simply forwards C, A and B to
// selectDefaultAssignKernel().
2683 template<
typename MT3
2686 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2687 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2689 selectDefaultAssignKernel( C, A, B );
// Counterpart of the fallback overload: this selectLargeAssignKernel() is
// enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true
// (EnableIf_t).
// NOTE(review): the body of this overload is not part of this excerpt.
2709 template<
typename MT3
2712 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2713 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Fallback overload of the BLAS-based assignment kernel. It is selected when
// UseBlasKernel_v<MT3,MT4,MT5> is false (DisableIf_t) and delegates the work
// to selectLargeAssignKernel().
2743 template<
typename MT3
2746 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2747 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2749 selectLargeAssignKernel( C, A, B );
2755#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based assignment kernel, enabled only when UseBlasKernel_v<MT3,MT4,MT5>
// is true (EnableIf_t). Depending on the compile-time structure of the
// operands it calls trmm() (one operand triangular) or gemm().
// NOTE(review): several lines of the body (e.g. whatever initializes C before
// the trmm() calls, and the branch keyword in front of gemm()) are not part of
// this excerpt.
2769 template<
typename MT3
2772 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2773 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// ET is the element type of the target matrix; it is used for the scalar factors.
2775 using ET = ElementType_t<MT3>;
// Left operand triangular: trmm() with A on the left (CblasLeft); the
// lower/upper flag is chosen from IsLower_v<MT4>.
2777 if( IsTriangular_v<MT4> ) {
2779 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Right operand triangular: trmm() with B on the right (CblasRight); the
// lower/upper flag is chosen from IsLower_v<MT5>.
2781 else if( IsTriangular_v<MT5> ) {
2783 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// Presumably the remaining (non-triangular) case: gemm() with factors ET(1) and ET(0).
2786 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix target.
// The expression is first evaluated serially into a temporary, which is then
// passed through the forward functor and assigned to the target.
// NOTE(review): the lines between the visible statements are not part of
// this excerpt.
2806 template<
typename MT
2808 friend inline void assign( SparseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Temporary type: OppositeType when SO is true, ResultType otherwise.
2812 using TmpType = If_t< SO, OppositeType, ResultType >;
2824 const ForwardFunctor fwd;
// Evaluate the whole expression serially, then assign the forwarded temporary.
2826 const TmpType tmp(
serial( rhs ) );
2827 assign( *lhs, fwd( tmp ) );
// Addition assignment of the multiplication expression to a dense matrix.
// After a check for empty dimensions the work is handed to
// selectAddAssignKernel().
// NOTE(review): the body of the emptiness check and the definitions of A and
// B are not part of this excerpt.
2845 template<
typename MT
2847 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Condition is true when the target has no rows or columns, or when the left
// operand has no columns (presumably an early exit - body not visible here).
2854 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2868 DMatTDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
// Kernel dispatcher for the addition assignment. The small kernel is used
// when either operand is diagonal or when the number of elements of C
// (rows * columns) is below DMATTDMATMULT_THRESHOLD; the BLAS kernel call
// below is presumably the alternative branch (the line between the two calls
// is not part of this excerpt).
2884 template<
typename MT3
2887 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2889 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
2890 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
2891 selectSmallAddAssignKernel( C, A, B );
2893 selectBlasAddAssignKernel( C, A, B );
// Default addition-assignment kernel for a row-major target C where neither A
// nor B is diagonal (see the EnableIf_t condition). It iterates rows of C in
// the outer loop and columns in the inner loop and accumulates
// C(i,j) += A(i,k) * B(k,j) over a k range that is narrowed by the
// compile-time triangular structure of A and B.
// NOTE(review): several lines of the body (the fall-through values of the
// range expressions, braces, and any guards) are not part of this excerpt.
2912 template<
typename MT3
2915 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2916 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A.
2918 const size_t M( A.rows() );
2919 const size_t N( B.columns() );
2920 const size_t K( A.columns() );
// Row range: starts at 1 or 2 for a strictly lower A (2 if B is strictly lower
// as well and M > 1) and ends at M-1 or M-2 for a strictly upper A.
2924 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
2925 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
2927 const size_t iend( ( IsStrictlyUpper_v<MT4> )
2928 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
2932 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i, derived from the upper/lower structure of A and B
// and from the class-level UPP/LOW flags.
2934 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
2935 ?( ( IsStrictlyUpper_v<MT4> )
2936 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
2937 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
2938 :( ( IsStrictlyUpper_v<MT5> )
2939 ?(
UPP ?
max( i, 1UL ) : 1UL )
2940 :(
UPP ? i : 0UL ) ) );
2941 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
2942 ?( ( IsStrictlyLower_v<MT4> )
2943 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
2944 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
2945 :( ( IsStrictlyLower_v<MT5> )
2946 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
2947 :(
LOW ? i+1UL : N ) ) );
// With a LOW/UPP restriction the column range can be inverted; skip such rows.
2949 if( (
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
2952 for(
size_t j=jbegin; j<jend; ++j )
// k range for element (i,j): the lower bound depends on an upper A and/or a
// lower B, the upper bound on a lower A and/or an upper B.
2954 const size_t kbegin( ( IsUpper_v<MT4> )
2955 ?( ( IsLower_v<MT5> )
2956 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2957 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2958 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2959 :( ( IsLower_v<MT5> )
2960 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2962 const size_t kend( ( IsLower_v<MT4> )
2963 ?( ( IsUpper_v<MT5> )
2964 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
2965 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2966 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2967 :( ( IsUpper_v<MT5> )
2968 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// kpos marks the end of the part handled two products at a time
// (prevMultiple presumably rounds knum down to a multiple of 2 - TODO confirm).
2972 const size_t knum( kend - kbegin );
2973 const size_t kpos( kbegin +
prevMultiple( knum, 2UL ) );
// Main loop: two products per iteration.
2976 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
2977 C(i,j) += A(i,k ) * B(k ,j);
2978 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Product at index kpos. NOTE(review): the condition guarding this statement
// is not visible in this excerpt.
2981 C(i,j) += A(i,kpos) * B(kpos,j);
// Default addition-assignment kernel for a column-major target C where neither
// A nor B is diagonal (see the EnableIf_t condition). It iterates columns of C
// in the outer loop and rows in the inner loop and accumulates
// C(i,j) += A(i,k) * B(k,j) over a k range that is narrowed by the
// compile-time triangular structure of A and B.
// NOTE(review): several lines of the body (the fall-through values of the
// range expressions, braces, and any guards) are not part of this excerpt.
3003 template<
typename MT3
3006 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3007 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A.
3009 const size_t M( A.rows() );
3010 const size_t N( B.columns() );
3011 const size_t K( A.columns() );
// Column range: starts at 1 or 2 for a strictly upper B (2 if A is strictly
// upper as well and N > 1) and ends at N-1 or N-2 for a strictly lower B.
3015 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
3016 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
3018 const size_t jend( ( IsStrictlyLower_v<MT5> )
3019 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
3023 for(
size_t j=jbegin; j<jend; ++j )
// Row range of column j, derived from the lower/upper structure of A and B
// and from the class-level LOW/UPP flags.
3025 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
3026 ?( ( IsStrictlyLower_v<MT4> )
3027 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
3028 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3029 :( ( IsStrictlyLower_v<MT4> )
3030 ?(
LOW ?
max( j, 1UL ) : 1UL )
3031 :(
LOW ? j : 0UL ) ) );
3032 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
3033 ?( ( IsStrictlyUpper_v<MT4> )
3034 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
3035 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
3036 :( ( IsStrictlyUpper_v<MT4> )
3037 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
3038 :(
UPP ? j+1UL : M ) ) );
// With a LOW/UPP restriction the row range can be inverted; skip such columns.
3040 if( (
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
3043 for(
size_t i=ibegin; i<iend; ++i )
// k range for element (i,j): the lower bound depends on an upper A and/or a
// lower B, the upper bound on a lower A and/or an upper B.
3045 const size_t kbegin( ( IsUpper_v<MT4> )
3046 ?( ( IsLower_v<MT5> )
3047 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3048 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3049 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3050 :( ( IsLower_v<MT5> )
3051 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3053 const size_t kend( ( IsLower_v<MT4> )
3054 ?( ( IsUpper_v<MT5> )
3055 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
3056 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3057 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3058 :( ( IsUpper_v<MT5> )
3059 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// kpos marks the end of the part handled two products at a time
// (prevMultiple presumably rounds knum down to a multiple of 2 - TODO confirm).
3063 const size_t knum( kend - kbegin );
3064 const size_t kpos( kbegin +
prevMultiple( knum, 2UL ) );
// Main loop: two products per iteration.
3067 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
3068 C(i,j) += A(i,k ) * B(k ,j);
3069 C(i,j) += A(i,k+1UL) * B(k+1UL,j);
// Product at index kpos. NOTE(review): the condition guarding this statement
// is not visible in this excerpt.
3072 C(i,j) += A(i,kpos) * B(kpos,j);
// Default addition-assignment kernel for a row-major target C with a general
// A and a diagonal B (see the EnableIf_t condition). Only the diagonal
// element B(j,j) contributes to column j, so each update is
// C(i,j) += A(i,j) * B(j,j).
// NOTE(review): the fall-through values of jbegin/jend and the guard around
// the final statement are not part of this excerpt.
3094 template<
typename MT3
3097 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3098 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3100 const size_t M( A.rows() );
3101 const size_t N( B.columns() );
3103 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, restricted by the upper/lower structure of A.
3105 const size_t jbegin( ( IsUpper_v<MT4> )
3106 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3108 const size_t jend( ( IsLower_v<MT4> )
3109 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part processed two columns at a time
// (prevMultiple presumably rounds jnum down to a multiple of 2 - TODO confirm).
3113 const size_t jnum( jend - jbegin );
3114 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Main loop: two columns per iteration.
3117 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3118 C(i,j ) += A(i,j ) * B(j ,j );
3119 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Update for column jpos (guard not visible in this excerpt).
3122 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default addition-assignment kernel for a column-major target C with a
// general A and a diagonal B (see the EnableIf_t condition). The index space
// is traversed in BLOCK_SIZE x BLOCK_SIZE tiles (column blocks outermost) and
// each update is C(i,j) += A(i,j) * B(j,j).
// NOTE(review): the fall-through values of ibegin/ipos are not part of this
// excerpt.
3143 template<
typename MT3
3146 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3147 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3149 constexpr size_t block( BLOCK_SIZE );
3151 const size_t M( A.rows() );
3152 const size_t N( B.columns() );
// Tile loops: jj/ii are the tile origins, jend/iend the clamped tile ends.
3154 for(
size_t jj=0UL; jj<N; jj+=block ) {
3155 const size_t jend(
min( N, jj+block ) );
3156 for(
size_t ii=0UL; ii<M; ii+=block ) {
3157 const size_t iend(
min( M, ii+block ) );
3158 for(
size_t j=jj; j<jend; ++j )
// Row range inside the tile, clipped by the lower/upper structure of A.
3160 const size_t ibegin( ( IsLower_v<MT4> )
3161 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
3163 const size_t ipos( ( IsUpper_v<MT4> )
3164 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
3167 for(
size_t i=ibegin; i<ipos; ++i ) {
3168 C(i,j) += A(i,j) * B(j,j);
// Default addition-assignment kernel for a row-major target C with a diagonal
// A and a general B (see the EnableIf_t condition). The index space is
// traversed in BLOCK_SIZE x BLOCK_SIZE tiles (row blocks outermost) and each
// update is C(i,j) += A(i,i) * B(i,j).
// NOTE(review): the fall-through values of jbegin/jpos are not part of this
// excerpt.
3191 template<
typename MT3
3194 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3195 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3197 constexpr size_t block( BLOCK_SIZE );
3199 const size_t M( A.rows() );
3200 const size_t N( B.columns() );
// Tile loops: ii/jj are the tile origins, iend/jend the clamped tile ends.
3202 for(
size_t ii=0UL; ii<M; ii+=block ) {
3203 const size_t iend(
min( M, ii+block ) );
3204 for(
size_t jj=0UL; jj<N; jj+=block ) {
3205 const size_t jend(
min( N, jj+block ) );
3206 for(
size_t i=ii; i<iend; ++i )
// Column range inside the tile, clipped by the upper/lower structure of B.
3208 const size_t jbegin( ( IsUpper_v<MT5> )
3209 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
3211 const size_t jpos( ( IsLower_v<MT5> )
3212 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
3215 for(
size_t j=jbegin; j<jpos; ++j ) {
3216 C(i,j) += A(i,i) * B(i,j);
// Default addition-assignment kernel for a column-major target C with a
// diagonal A and a general B (see the EnableIf_t condition). Only the diagonal
// element A(i,i) contributes to row i, so each update is
// C(i,j) += A(i,i) * B(i,j).
// NOTE(review): the fall-through values of ibegin/iend and the guard around
// the final statement are not part of this excerpt.
3239 template<
typename MT3
3242 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3243 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3245 const size_t M( A.rows() );
3246 const size_t N( B.columns() );
3248 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, restricted by the lower/upper structure of B.
3250 const size_t ibegin( ( IsLower_v<MT5> )
3251 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3253 const size_t iend( ( IsUpper_v<MT5> )
3254 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part processed two rows at a time
// (prevMultiple presumably rounds inum down to a multiple of 2 - TODO confirm).
3258 const size_t inum( iend - ibegin );
3259 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Main loop: two rows per iteration.
3262 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3263 C(i ,j) += A(i ,i ) * B(i ,j);
3264 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
// Update for row ipos (guard not visible in this excerpt).
3267 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
// Default addition-assignment kernel for the case that both A and B are
// diagonal (see the EnableIf_t condition). Only the diagonal of C is touched:
// C(i,i) += A(i,i) * B(i,i) for every row index i of A.
3288 template<
typename MT3
3291 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3292 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3294 for(
size_t i=0UL; i<A.rows(); ++i ) {
3295 C(i,i) += A(i,i) * B(i,i);
// Fallback overload of the small-matrix addition-assignment kernel. It is
// selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false
// (DisableIf_t) and forwards C, A and B to selectDefaultAddAssignKernel().
3315 template<
typename MT3
3318 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3319 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3321 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel for a row-major target C
// (enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true).
//
// The visible code updates C in register tiles: strips of 3 rows (tiles of
// 3x3, 3x2 and 3x1 elements), strips of 2 rows (2x4, 2x2, 2x1) and single rows
// (1x4, 1x2, 1x1). For each tile the products A(i,k) * B(k,j) are accumulated
// over k in SIMD registers (xmm1, xmm2, ...), reduced with sum() and added to
// C; scalar loops handle the k indices that are not covered by SIMD loads.
//
// NOTE(review): many lines of the body are not part of this excerpt (the
// declaration of i and k, the SIMD main loops, kpos, several loop headers, the
// branch that chooses between the SIMD path and the scalar-only loop, braces).
// The comments below describe only what the visible lines show.
3341 template<
typename MT3
3344 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3345 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Scalar remainder loops are needed unless both operands are padded.
3347 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
// M = rows of A, N = columns of B, K = columns of A.
3349 const size_t M( A.rows() );
3350 const size_t N( B.columns() );
3351 const size_t K( A.columns() );
// ---- Strips of 3 rows (skipped entirely when both LOW and UPP are set) ----
3357 for( ; !(
LOW &&
UPP ) && (i+3UL) <= M; i+=3UL )
// Column range of the strip: limited to i+3 for LOW, starting at i for UPP.
3359 const size_t jend(
LOW ? i+3UL : N );
3360 size_t j(
UPP ? i : 0UL );
// 3x3 tiles.
3362 for( ; (j+3UL) <= jend; j+=3UL )
3364 const size_t kbegin( ( IsUpper_v<MT4> )
3367 const size_t kend( ( IsLower_v<MT4> )
3368 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
3369 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
3396 a2 = A.load(i+1UL,k);
3397 a3 = A.load(i+2UL,k);
3399 b2 = B.load(k,j+1UL);
3400 b3 = B.load(k,j+2UL);
// Reduce the nine SIMD accumulators and add them to the 3x3 tile of C.
3412 C(i ,j ) +=
sum( xmm1 );
3413 C(i ,j+1UL) +=
sum( xmm2 );
3414 C(i ,j+2UL) +=
sum( xmm3 );
3415 C(i+1UL,j ) +=
sum( xmm4 );
3416 C(i+1UL,j+1UL) +=
sum( xmm5 );
3417 C(i+1UL,j+2UL) +=
sum( xmm6 );
3418 C(i+2UL,j ) +=
sum( xmm7 );
3419 C(i+2UL,j+1UL) +=
sum( xmm8 );
3420 C(i+2UL,j+2UL) +=
sum( xmm9 );
// Scalar remainder after the SIMD part (only when 'remainder' is true).
3422 for( ; remainder && k<kend; ++k ) {
3423 C(i ,j ) += A(i ,k) * B(k,j );
3424 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3425 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3426 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3427 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3428 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3429 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3430 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3431 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
// Purely scalar loop over k (presumably the branch taken when no SIMD
// iteration is possible - the selecting condition is not visible here).
3436 for( ; k<kend; ++k ) {
3437 C(i ,j ) += A(i ,k) * B(k,j );
3438 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3439 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3440 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3441 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3442 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3443 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3444 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3445 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
// 3x2 tiles.
3450 for( ; (j+2UL) <= jend; j+=2UL )
3452 const size_t kbegin( ( IsUpper_v<MT4> )
3455 const size_t kend( ( IsLower_v<MT4> )
3456 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
3457 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3480 a2 = A.load(i+1UL,k);
3481 a3 = A.load(i+2UL,k);
3483 b2 = B.load(k,j+1UL);
3492 C(i ,j ) +=
sum( xmm1 );
3493 C(i ,j+1UL) +=
sum( xmm2 );
3494 C(i+1UL,j ) +=
sum( xmm3 );
3495 C(i+1UL,j+1UL) +=
sum( xmm4 );
3496 C(i+2UL,j ) +=
sum( xmm5 );
3497 C(i+2UL,j+1UL) +=
sum( xmm6 );
// Scalar remainder after the SIMD part.
3499 for( ; remainder && k<kend; ++k ) {
3500 C(i ,j ) += A(i ,k) * B(k,j );
3501 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3502 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3503 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3504 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3505 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
// Purely scalar loop over k (selecting condition not visible here).
3510 for( ; k<kend; ++k ) {
3511 C(i ,j ) += A(i ,k) * B(k,j );
3512 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3513 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3514 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3515 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3516 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
// 3x1 tiles: a single column of the strip (the enclosing loop/branch header
// is not part of this excerpt).
3523 const size_t kbegin( ( IsUpper_v<MT4> )
3526 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
3536 SIMDType xmm1( A.load(i ,k) * b1 );
3537 SIMDType xmm2( A.load(i+1UL,k) * b1 );
3538 SIMDType xmm3( A.load(i+2UL,k) * b1 );
3542 xmm1 += A.load(i ,k) * b1;
3543 xmm2 += A.load(i+1UL,k) * b1;
3544 xmm3 += A.load(i+2UL,k) * b1;
3547 C(i ,j) +=
sum( xmm1 );
3548 C(i+1UL,j) +=
sum( xmm2 );
3549 C(i+2UL,j) +=
sum( xmm3 );
// Scalar remainder after the SIMD part.
3551 for( ; remainder && k<kend; ++k ) {
3552 C(i ,j) += A(i ,k) * B(k,j);
3553 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3554 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
// Purely scalar loop over k (selecting condition not visible here).
3559 for( ; k<kend; ++k ) {
3560 C(i ,j) += A(i ,k) * B(k,j);
3561 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
3562 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
// ---- Strips of 2 rows (skipped entirely when both LOW and UPP are set) ----
3568 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
// Column range of the strip: limited to i+2 for LOW, starting at i for UPP.
3570 const size_t jend(
LOW ? i+2UL : N );
3571 size_t j(
UPP ? i : 0UL );
// 2x4 tiles.
3573 for( ; (j+4UL) <= jend; j+=4UL )
3575 const size_t kbegin( ( IsUpper_v<MT4> )
3578 const size_t kend( ( IsLower_v<MT4> )
3579 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
3580 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
3606 a2 = A.load(i+1UL,k);
3608 b2 = B.load(k,j+1UL);
3609 b3 = B.load(k,j+2UL);
3610 b4 = B.load(k,j+3UL);
3621 C(i ,j ) +=
sum( xmm1 );
3622 C(i ,j+1UL) +=
sum( xmm2 );
3623 C(i ,j+2UL) +=
sum( xmm3 );
3624 C(i ,j+3UL) +=
sum( xmm4 );
3625 C(i+1UL,j ) +=
sum( xmm5 );
3626 C(i+1UL,j+1UL) +=
sum( xmm6 );
3627 C(i+1UL,j+2UL) +=
sum( xmm7 );
3628 C(i+1UL,j+3UL) +=
sum( xmm8 );
// Scalar remainder after the SIMD part.
3630 for( ; remainder && k<kend; ++k ) {
3631 C(i ,j ) += A(i ,k) * B(k,j );
3632 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3633 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3634 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
3635 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3636 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3637 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3638 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// Purely scalar loop over k (selecting condition not visible here).
3643 for( ; k<kend; ++k ) {
3644 C(i ,j ) += A(i ,k) * B(k,j );
3645 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3646 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
3647 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL);
3648 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3649 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3650 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
3651 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL);
// 2x2 tiles.
3656 for( ; (j+2UL) <= jend; j+=2UL )
3658 const size_t kbegin( ( IsUpper_v<MT4> )
3661 const size_t kend( ( IsLower_v<MT4> )
3662 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
3663 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3683 a2 = A.load(i+1UL,k);
3685 b2 = B.load(k,j+1UL);
3692 C(i ,j ) +=
sum( xmm1 );
3693 C(i ,j+1UL) +=
sum( xmm2 );
3694 C(i+1UL,j ) +=
sum( xmm3 );
3695 C(i+1UL,j+1UL) +=
sum( xmm4 );
// Scalar remainder after the SIMD part.
3697 for( ; remainder && k<kend; ++k ) {
3698 C(i ,j ) += A(i ,k) * B(k,j );
3699 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3700 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3701 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// Purely scalar loop over k (selecting condition not visible here).
3706 for( ; k<kend; ++k ) {
3707 C(i ,j ) += A(i ,k) * B(k,j );
3708 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3709 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3710 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tiles: a single column of the strip (enclosing header not visible here).
3717 const size_t kbegin( ( IsUpper_v<MT4> )
3720 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
3730 SIMDType xmm1( A.load(i ,k) * b1 );
3731 SIMDType xmm2( A.load(i+1UL,k) * b1 );
3735 xmm1 += A.load(i ,k) * b1;
3736 xmm2 += A.load(i+1UL,k) * b1;
3739 C(i ,j) +=
sum( xmm1 );
3740 C(i+1UL,j) +=
sum( xmm2 );
// Scalar remainder after the SIMD part.
3742 for( ; remainder && k<kend; ++k ) {
3743 C(i ,j) += A(i ,k) * B(k,j);
3744 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Purely scalar loop over k (selecting condition not visible here).
3749 for( ; k<kend; ++k ) {
3750 C(i ,j) += A(i ,k) * B(k,j);
3751 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single rows (the header of the row loop is not part of this excerpt) ----
// Column range of the row: limited to i+1 for LOW, starting at i for UPP.
3759 const size_t jend(
LOW ? i+1UL : N );
3760 size_t j(
UPP ? i : 0UL );
// 1x4 tiles (skipped when both LOW and UPP are set).
3762 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
3764 const size_t kbegin( ( IsUpper_v<MT4> )
3767 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
3777 SIMDType xmm1( a1 * B.load(k,j ) );
3778 SIMDType xmm2( a1 * B.load(k,j+1UL) );
3779 SIMDType xmm3( a1 * B.load(k,j+2UL) );
3780 SIMDType xmm4( a1 * B.load(k,j+3UL) );
3784 xmm1 += a1 * B.load(k,j );
3785 xmm2 += a1 * B.load(k,j+1UL);
3786 xmm3 += a1 * B.load(k,j+2UL);
3787 xmm4 += a1 * B.load(k,j+3UL);
3790 C(i,j ) +=
sum( xmm1 );
3791 C(i,j+1UL) +=
sum( xmm2 );
3792 C(i,j+2UL) +=
sum( xmm3 );
3793 C(i,j+3UL) +=
sum( xmm4 );
// Scalar remainder after the SIMD part.
3795 for( ; remainder && k<kend; ++k ) {
3796 C(i,j ) += A(i,k) * B(k,j );
3797 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3798 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
3799 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
// Purely scalar loop over k (selecting condition not visible here).
3804 for( ; k<kend; ++k ) {
3805 C(i,j ) += A(i,k) * B(k,j );
3806 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
3807 C(i,j+2UL) += A(i,k) * B(k,j+2UL);
3808 C(i,j+3UL) += A(i,k) * B(k,j+3UL);
// 1x2 tiles.
3813 for( ; (j+2UL) <= jend; j+=2UL )
3815 const size_t kbegin( ( IsUpper_v<MT4> )
3818 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
3828 SIMDType xmm1( a1 * B.load(k,j ) );
3829 SIMDType xmm2( a1 * B.load(k,j+1UL) );
3833 xmm1 += a1 * B.load(k,j );
3834 xmm2 += a1 * B.load(k,j+1UL);
3837 C(i,j ) +=
sum( xmm1 );
3838 C(i,j+1UL) +=
sum( xmm2 );
// Scalar remainder after the SIMD part.
3840 for( ; remainder && k<kend; ++k ) {
3841 C(i,j ) += A(i,k) * B(k,j );
3842 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Purely scalar loop over k (selecting condition not visible here).
3847 for( ; k<kend; ++k ) {
3848 C(i,j ) += A(i,k) * B(k,j );
3849 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1: a single element of C (enclosing header not visible here). Note that
// the visible remainder loop runs up to K rather than a kend.
3856 const size_t kbegin( ( IsUpper_v<MT4> )
3867 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
3870 xmm1 += A.load(i,k) * B.load(k,j);
3873 C(i,j) +=
sum( xmm1 );
3875 for( ; remainder && k<K; ++k ) {
3876 C(i,j) += A(i,k) * B(k,j);
// Scalar update of C(i,j) (its enclosing loop/branch is not visible here).
3882 C(i,j) += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel for a column-major target
// C (enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true).
//
// The visible code updates C in register tiles: strips of 4 rows (tiles of 4x2
// and 4x1 elements) and strips of 3 rows (3x3, 3x2, 3x1) - both only when
// neither LOW nor UPP is set - followed by strips of 2 rows (2x2, 2x1) and
// single rows (1x2, 1x1). For each tile the products A(i,k) * B(k,j) are
// accumulated over k in SIMD registers (xmm1, xmm2, ...), reduced with sum()
// and added to C; scalar loops handle the k indices not covered by SIMD loads.
//
// NOTE(review): many lines of the body are not part of this excerpt (the
// declaration of i, j and k, the SIMD main loops, kpos, several loop headers,
// the branch that chooses between the SIMD path and the scalar-only loop,
// braces). The comments below describe only what the visible lines show.
3906 template<
typename MT3
3909 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3910 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Scalar remainder loops are needed unless both operands are padded.
3912 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
// M = rows of A, N = columns of B, K = columns of A.
3914 const size_t M( A.rows() );
3915 const size_t N( B.columns() );
3916 const size_t K( A.columns() );
// ---- Strips of 4 rows (only when neither LOW nor UPP is set) ----
3922 for( ; !
LOW && !
UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tiles over the full column range [0,N).
3926 for( ; (j+2UL) <= N; j+=2UL )
3928 const size_t kbegin( ( IsUpper_v<MT4> )
3931 const size_t kend( ( IsLower_v<MT4> )
3932 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
3933 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
3959 a2 = A.load(i+1UL,k);
3960 a3 = A.load(i+2UL,k);
3961 a4 = A.load(i+3UL,k);
3963 b2 = B.load(k,j+1UL);
// Reduce the eight SIMD accumulators and add them to the 4x2 tile of C.
3974 C(i ,j ) +=
sum( xmm1 );
3975 C(i ,j+1UL) +=
sum( xmm2 );
3976 C(i+1UL,j ) +=
sum( xmm3 );
3977 C(i+1UL,j+1UL) +=
sum( xmm4 );
3978 C(i+2UL,j ) +=
sum( xmm5 );
3979 C(i+2UL,j+1UL) +=
sum( xmm6 );
3980 C(i+3UL,j ) +=
sum( xmm7 );
3981 C(i+3UL,j+1UL) +=
sum( xmm8 );
// Scalar remainder after the SIMD part (only when 'remainder' is true).
3983 for( ; remainder && k<kend; ++k ) {
3984 C(i ,j ) += A(i ,k) * B(k,j );
3985 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3986 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
3987 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
3988 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
3989 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
3990 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
3991 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// Purely scalar loop over k (presumably the branch taken when no SIMD
// iteration is possible - the selecting condition is not visible here).
3996 for( ; k<kend; ++k ) {
3997 C(i ,j ) += A(i ,k) * B(k,j );
3998 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
3999 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4000 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4001 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4002 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4003 C(i+3UL,j ) += A(i+3UL,k) * B(k,j );
4004 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL);
// 4x1 tiles: a single column of the strip (enclosing header not visible here).
4011 const size_t kbegin( ( IsUpper_v<MT4> )
4014 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
4024 SIMDType xmm1( A.load(i ,k) * b1 );
4025 SIMDType xmm2( A.load(i+1UL,k) * b1 );
4026 SIMDType xmm3( A.load(i+2UL,k) * b1 );
4027 SIMDType xmm4( A.load(i+3UL,k) * b1 );
4031 xmm1 += A.load(i ,k) * b1;
4032 xmm2 += A.load(i+1UL,k) * b1;
4033 xmm3 += A.load(i+2UL,k) * b1;
4034 xmm4 += A.load(i+3UL,k) * b1;
4037 C(i ,j) +=
sum( xmm1 );
4038 C(i+1UL,j) +=
sum( xmm2 );
4039 C(i+2UL,j) +=
sum( xmm3 );
4040 C(i+3UL,j) +=
sum( xmm4 );
// Scalar remainder after the SIMD part.
4042 for( ; remainder && k<kend; ++k ) {
4043 C(i ,j) += A(i ,k) * B(k,j);
4044 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4045 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
4046 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
// Purely scalar loop over k (selecting condition not visible here).
4051 for( ; k<kend; ++k ) {
4052 C(i ,j) += A(i ,k) * B(k,j);
4053 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4054 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
4055 C(i+3UL,j) += A(i+3UL,k) * B(k,j);
// ---- Strips of 3 rows (only when neither LOW nor UPP is set) ----
4061 for( ; !
LOW && !
UPP && (i+3UL) <= M; i+=3UL )
// 3x3 tiles over the full column range [0,N).
4065 for( ; (j+3UL) <= N; j+=3UL )
4067 const size_t kbegin( ( IsUpper_v<MT4> )
4070 const size_t kend( ( IsLower_v<MT4> )
4071 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
4072 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
4099 a2 = A.load(i+1UL,k);
4100 a3 = A.load(i+2UL,k);
4102 b2 = B.load(k,j+1UL);
4103 b3 = B.load(k,j+2UL);
4115 C(i ,j ) +=
sum( xmm1 );
4116 C(i ,j+1UL) +=
sum( xmm2 );
4117 C(i ,j+2UL) +=
sum( xmm3 );
4118 C(i+1UL,j ) +=
sum( xmm4 );
4119 C(i+1UL,j+1UL) +=
sum( xmm5 );
4120 C(i+1UL,j+2UL) +=
sum( xmm6 );
4121 C(i+2UL,j ) +=
sum( xmm7 );
4122 C(i+2UL,j+1UL) +=
sum( xmm8 );
4123 C(i+2UL,j+2UL) +=
sum( xmm9 );
// Scalar remainder after the SIMD part.
4125 for( ; remainder && k<kend; ++k ) {
4126 C(i ,j ) += A(i ,k) * B(k,j );
4127 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4128 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
4129 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4130 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4131 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
4132 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4133 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4134 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
// Purely scalar loop over k (selecting condition not visible here).
4139 for( ; k<kend; ++k ) {
4140 C(i ,j ) += A(i ,k) * B(k,j );
4141 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4142 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL);
4143 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4144 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4145 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL);
4146 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4147 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
4148 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL);
// 3x2 tiles.
4153 for( ; (j+2UL) <= N; j+=2UL )
4155 const size_t kbegin( ( IsUpper_v<MT4> )
4158 const size_t kend( ( IsLower_v<MT4> )
4159 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
4160 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
4183 a2 = A.load(i+1UL,k);
4184 a3 = A.load(i+2UL,k);
4186 b2 = B.load(k,j+1UL);
4195 C(i ,j ) +=
sum( xmm1 );
4196 C(i ,j+1UL) +=
sum( xmm2 );
4197 C(i+1UL,j ) +=
sum( xmm3 );
4198 C(i+1UL,j+1UL) +=
sum( xmm4 );
4199 C(i+2UL,j ) +=
sum( xmm5 );
4200 C(i+2UL,j+1UL) +=
sum( xmm6 );
// Scalar remainder after the SIMD part.
4202 for( ; remainder && k<kend; ++k ) {
4203 C(i ,j ) += A(i ,k) * B(k,j );
4204 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4205 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4206 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4207 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4208 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
// Purely scalar loop over k (selecting condition not visible here).
4213 for( ; k<kend; ++k ) {
4214 C(i ,j ) += A(i ,k) * B(k,j );
4215 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4216 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4217 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
4218 C(i+2UL,j ) += A(i+2UL,k) * B(k,j );
4219 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL);
// 3x1 tiles: a single column of the strip (enclosing header not visible here).
4226 const size_t kbegin( ( IsUpper_v<MT4> )
4229 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
4239 SIMDType xmm1( A.load(i ,k) * b1 );
4240 SIMDType xmm2( A.load(i+1UL,k) * b1 );
4241 SIMDType xmm3( A.load(i+2UL,k) * b1 );
4245 xmm1 += A.load(i ,k) * b1;
4246 xmm2 += A.load(i+1UL,k) * b1;
4247 xmm3 += A.load(i+2UL,k) * b1;
4250 C(i ,j) +=
sum( xmm1 );
4251 C(i+1UL,j) +=
sum( xmm2 );
4252 C(i+2UL,j) +=
sum( xmm3 );
// Scalar remainder after the SIMD part.
4254 for( ; remainder && k<kend; ++k ) {
4255 C(i ,j) += A(i ,k) * B(k,j);
4256 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4257 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
// Purely scalar loop over k (selecting condition not visible here).
4262 for( ; k<kend; ++k ) {
4263 C(i ,j) += A(i ,k) * B(k,j);
4264 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
4265 C(i+2UL,j) += A(i+2UL,k) * B(k,j);
// ---- Strips of 2 rows (no LOW/UPP condition on this loop) ----
4271 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the strip: limited to i+2 for LOW, starting at i for UPP.
4273 const size_t jend(
LOW ? i+2UL : N );
4274 size_t j(
UPP ? i : 0UL );
// 2x2 tiles.
4276 for( ; (j+2UL) <= jend; j+=2UL )
4278 const size_t kbegin( ( IsUpper_v<MT4> )
4281 const size_t kend( ( IsLower_v<MT4> )
4282 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
4283 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
4303 a2 = A.load(i+1UL,k);
4305 b2 = B.load(k,j+1UL);
4312 C(i ,j ) +=
sum( xmm1 );
4313 C(i ,j+1UL) +=
sum( xmm2 );
4314 C(i+1UL,j ) +=
sum( xmm3 );
4315 C(i+1UL,j+1UL) +=
sum( xmm4 );
// Scalar remainder after the SIMD part.
4317 for( ; remainder && k<kend; ++k ) {
4318 C(i ,j ) += A(i ,k) * B(k,j );
4319 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4320 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4321 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// Purely scalar loop over k (selecting condition not visible here).
4326 for( ; k<kend; ++k ) {
4327 C(i ,j ) += A(i ,k) * B(k,j );
4328 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL);
4329 C(i+1UL,j ) += A(i+1UL,k) * B(k,j );
4330 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL);
// 2x1 tiles: a single column of the strip (enclosing header not visible here).
4337 const size_t kbegin( ( IsUpper_v<MT4> )
4340 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
4350 SIMDType xmm1( A.load(i ,k) * b1 );
4351 SIMDType xmm2( A.load(i+1UL,k) * b1 );
4355 xmm1 += A.load(i ,k) * b1;
4356 xmm2 += A.load(i+1UL,k) * b1;
4359 C(i ,j) +=
sum( xmm1 );
4360 C(i+1UL,j) +=
sum( xmm2 );
// Scalar remainder after the SIMD part.
4362 for( ; remainder && k<kend; ++k ) {
4363 C(i ,j) += A(i ,k) * B(k,j);
4364 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// Purely scalar loop over k (selecting condition not visible here).
4369 for( ; k<kend; ++k ) {
4370 C(i ,j) += A(i ,k) * B(k,j);
4371 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
// ---- Single rows (the header of the row loop is not part of this excerpt) ----
// Column range of the row: limited to i+1 for LOW, starting at i for UPP.
4379 const size_t jend(
LOW ? i+1UL : N );
4380 size_t j(
UPP ? i : 0UL );
// 1x2 tiles.
4382 for( ; (j+2UL) <= jend; j+=2UL )
4384 const size_t kbegin( ( IsUpper_v<MT4> )
4387 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
4397 SIMDType xmm1( a1 * B.load(k,j ) );
4398 SIMDType xmm2( a1 * B.load(k,j+1UL) );
4402 xmm1 += a1 * B.load(k,j );
4403 xmm2 += a1 * B.load(k,j+1UL);
4406 C(i,j ) +=
sum( xmm1 );
4407 C(i,j+1UL) +=
sum( xmm2 );
// Scalar remainder after the SIMD part.
4409 for( ; remainder && k<kend; ++k ) {
4410 C(i,j ) += A(i,k) * B(k,j );
4411 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Purely scalar loop over k (selecting condition not visible here).
4416 for( ; k<kend; ++k ) {
4417 C(i,j ) += A(i,k) * B(k,j );
4418 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// 1x1: a single element of C (enclosing header not visible here). Note that
// the visible remainder loop runs up to K rather than a kend.
4425 const size_t kbegin( ( IsUpper_v<MT4> )
4436 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
4439 xmm1 += A.load(i,k) * B.load(k,j);
4442 C(i,j) +=
sum( xmm1 );
4444 for( ; remainder && k<K; ++k ) {
4445 C(i,j) += A(i,k) * B(k,j);
// Scalar update of C(i,j) (its enclosing loop/branch is not visible here).
4451 C(i,j) += A(i,k) * B(k,j);
// Fallback overload of the large-matrix addition-assignment kernel. It is
// selected when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false
// (DisableIf_t) and forwards C, A and B to selectDefaultAddAssignKernel().
4474 template<
typename MT3
4477 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4478 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4480 selectDefaultAddAssignKernel( C, A, B );
// Counterpart of the fallback overload: this selectLargeAddAssignKernel() is
// enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true
// (EnableIf_t).
// NOTE(review): the body of this overload is not part of this excerpt.
4500 template<
typename MT3
4503 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4504 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Fallback overload of the BLAS-based addition-assignment kernel. It is
// selected when UseBlasKernel_v<MT3,MT4,MT5> is false (DisableIf_t) and
// delegates the work to selectLargeAddAssignKernel().
4530 template<
typename MT3
4533 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4534 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4536 selectLargeAddAssignKernel( C, A, B );
4542#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel, enabled only when
// UseBlasKernel_v<MT3,MT4,MT5> is true (EnableIf_t). For a triangular operand
// the product is built in a temporary via trmm() and then added to C;
// the gemm() call is presumably the remaining (non-triangular) case.
// NOTE(review): the branch keyword in front of gemm() and the braces are not
// part of this excerpt.
4556 template<
typename MT3
4559 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4560 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// ET is the element type of the target matrix; it is used for the scalar factors.
4562 using ET = ElementType_t<MT3>;
// Left operand triangular: copy B (serially) into tmp, call trmm() with A on
// the left (CblasLeft), then add tmp to C.
4564 if( IsTriangular_v<MT4> ) {
4565 ResultType_t<MT3> tmp(
serial( B ) );
4566 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4567 addAssign( C, tmp );
// Right operand triangular: copy A (serially) into tmp, call trmm() with B on
// the right (CblasRight), then add tmp to C.
4569 else if( IsTriangular_v<MT5> ) {
4570 ResultType_t<MT3> tmp(
serial( A ) );
4571 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4572 addAssign( C, tmp );
// gemm() with both scalar factors ET(1); presumably alpha = beta = 1 so that
// the product is accumulated onto C - TODO confirm against the gemm() wrapper.
4575 gemm( C, A, B, ET(1), ET(1) );
// Subtraction assignment of the multiplication expression to a dense matrix.
// After a check for empty dimensions the work is handed to
// selectSubAssignKernel().
// NOTE(review): the body of the emptiness check and the definitions of A and
// B are not part of this excerpt.
4599 template<
typename MT
4601 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
// Condition is true when the target has no rows or columns, or when the left
// operand has no columns (presumably an early exit - body not visible here).
4608 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4622 DMatTDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
// Kernel dispatcher for the subtraction assignment. The small kernel is used
// when either operand is diagonal or when the number of elements of C
// (rows * columns) is below DMATTDMATMULT_THRESHOLD; the BLAS kernel call
// below is presumably the alternative branch (the line between the two calls
// is not part of this excerpt).
4638 template<
typename MT3
4641 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4643 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
4644 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
4645 selectSmallSubAssignKernel( C, A, B );
4647 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel for a row-major target C where neither
// A nor B is diagonal (see the EnableIf_t condition). It iterates rows of C in
// the outer loop and columns in the inner loop and computes
// C(i,j) -= A(i,k) * B(k,j) over a k range that is narrowed by the
// compile-time triangular structure of A and B.
// NOTE(review): several lines of the body (the fall-through values of the
// range expressions, braces, and any guards) are not part of this excerpt.
4666 template<
typename MT3
4669 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4670 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M = rows of A, N = columns of B, K = columns of A.
4672 const size_t M( A.rows() );
4673 const size_t N( B.columns() );
4674 const size_t K( A.columns() );
// Row range: starts at 1 or 2 for a strictly lower A (2 if B is strictly lower
// as well and M > 1) and ends at M-1 or M-2 for a strictly upper A.
4678 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
4679 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
4681 const size_t iend( ( IsStrictlyUpper_v<MT4> )
4682 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
4686 for(
size_t i=ibegin; i<iend; ++i )
// Column range of row i, derived from the upper/lower structure of A and B
// and from the class-level UPP/LOW flags.
4688 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4689 ?( ( IsStrictlyUpper_v<MT4> )
4690 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
4691 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
4692 :( ( IsStrictlyUpper_v<MT5> )
4693 ?(
UPP ?
max( i, 1UL ) : 1UL )
4694 :(
UPP ? i : 0UL ) ) );
4695 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
4696 ?( ( IsStrictlyLower_v<MT4> )
4697 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
4698 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
4699 :( ( IsStrictlyLower_v<MT5> )
4700 ?(
LOW ?
min(i+1UL,N-1UL) : N-1UL )
4701 :(
LOW ? i+1UL : N ) ) );
// With a LOW/UPP restriction the column range can be inverted; skip such rows.
4703 if( (
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
4706 for(
size_t j=jbegin; j<jend; ++j )
// k range for element (i,j): the lower bound depends on an upper A and/or a
// lower B, the upper bound on a lower A and/or an upper B.
4708 const size_t kbegin( ( IsUpper_v<MT4> )
4709 ?( ( IsLower_v<MT5> )
4710 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4711 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4712 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4713 :( ( IsLower_v<MT5> )
4714 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4716 const size_t kend( ( IsLower_v<MT4> )
4717 ?( ( IsUpper_v<MT5> )
4718 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4719 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4720 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4721 :( ( IsUpper_v<MT5> )
4722 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// kpos marks the end of the part handled two products at a time
// (prevMultiple presumably rounds knum down to a multiple of 2 - TODO confirm).
4726 const size_t knum( kend - kbegin );
4727 const size_t kpos( kbegin +
prevMultiple( knum, 2UL ) );
// Main loop: two products per iteration.
4730 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
4731 C(i,j) -= A(i,k ) * B(k ,j);
4732 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Product at index kpos. NOTE(review): the condition guarding this statement
// is not visible in this excerpt.
4735 C(i,j) -= A(i,kpos) * B(kpos,j);
// Default kernel for the subtraction assignment of the product A * B to C.
// Per the EnableIf_t condition below, this overload is selected when the target
// C is column-major and neither A (MT4) nor B (MT5) is diagonal.
// Every visited element is updated as C(i,j) -= A(i,k) * B(k,j).
4757 template<
typename MT3
4760 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4761 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Dimensions: rows of A, columns of B, and the shared inner dimension (columns of A).
4763 const size_t M( A.rows() );
4764 const size_t N( B.columns() );
4765 const size_t K( A.columns() );
// Column range: starts at 1 (or 2) when B (and A) are strictly upper, and ends
// at N-1 (or N-2) when B (and A) are strictly lower.
4769 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
4770 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
4772 const size_t jend( ( IsStrictlyLower_v<MT5> )
4773 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
// Outer loop over columns, inner loop over rows.
4777 for(
size_t j=jbegin; j<jend; ++j )
// Row range for column j: derived from the lower/upper structure of both
// operands and from the LOW/UPP flags of the enclosing expression.
4779 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
4780 ?( ( IsStrictlyLower_v<MT4> )
4781 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
4782 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4783 :( ( IsStrictlyLower_v<MT4> )
4784 ?(
LOW ?
max( j, 1UL ) : 1UL )
4785 :(
LOW ? j : 0UL ) ) );
4786 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
4787 ?( ( IsStrictlyUpper_v<MT4> )
4788 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
4789 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
4790 :( ( IsStrictlyUpper_v<MT4> )
4791 ?(
UPP ?
min(j+1UL,M-1UL) : M-1UL )
4792 :(
UPP ? j+1UL : M ) ) );
// With LOW or UPP set the computed row range can be inverted; skip such columns.
4794 if( (
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
4797 for(
size_t i=ibegin; i<iend; ++i )
// Range of the inner index k for element (i,j), restricted by the triangular
// structure of A and B.
4799 const size_t kbegin( ( IsUpper_v<MT4> )
4800 ?( ( IsLower_v<MT5> )
4801 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4802 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4803 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4804 :( ( IsLower_v<MT5> )
4805 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
4807 const size_t kend( ( IsLower_v<MT4> )
4808 ?( ( IsUpper_v<MT5> )
4809 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
4810 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
4811 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
4812 :( ( IsUpper_v<MT5> )
4813 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// kpos bounds the part of [kbegin,kend) that is processed two k-steps at a time.
4817 const size_t knum( kend - kbegin );
4818 const size_t kpos( kbegin +
prevMultiple( knum, 2UL ) );
4821 for(
size_t k=kbegin; k<kpos; k+=2UL ) {
4822 C(i,j) -= A(i,k ) * B(k ,j);
4823 C(i,j) -= A(i,k+1UL) * B(k+1UL,j);
// Leftover term at k == kpos.
// NOTE(review): the condition guarding this statement is not visible here;
// presumably it only runs when one element remains -- confirm.
4826 C(i,j) -= A(i,kpos) * B(kpos,j);
// Default kernel for the subtraction assignment of A * B to C, selected (see the
// EnableIf_t condition) when C is row-major, A (MT4) is not diagonal and B (MT5)
// is diagonal. Only the diagonal of B is read: C(i,j) -= A(i,j) * B(j,j).
4848 template<
typename MT3
4851 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4852 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
4854 const size_t M( A.rows() );
4855 const size_t N( B.columns() );
4857 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, restricted by the upper/lower structure of A.
4859 const size_t jbegin( ( IsUpper_v<MT4> )
4860 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
4862 const size_t jend( ( IsLower_v<MT4> )
4863 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos bounds the part of [jbegin,jend) that is processed two columns at a time.
4867 const size_t jnum( jend - jbegin );
4868 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
4871 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
4872 C(i,j ) -= A(i,j ) * B(j ,j );
4873 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at j == jpos.
// NOTE(review): the condition guarding this statement is not visible here -- confirm.
4876 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default kernel for the subtraction assignment of A * B to C, selected (see the
// EnableIf_t condition) when C is column-major, A (MT4) is not diagonal and
// B (MT5) is diagonal. Only the diagonal of B is read: C(i,j) -= A(i,j) * B(j,j).
4897 template<
typename MT3
4900 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4901 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Step width of the jj/ii loops below.
4903 constexpr size_t block( BLOCK_SIZE );
4905 const size_t M( A.rows() );
4906 const size_t N( B.columns() );
// Visit C in chunks of at most block x block elements: columns [jj,jend), rows [ii,iend).
4908 for(
size_t jj=0UL; jj<N; jj+=block ) {
4909 const size_t jend(
min( N, jj+block ) );
4910 for(
size_t ii=0UL; ii<M; ii+=block ) {
4911 const size_t iend(
min( M, ii+block ) );
4912 for(
size_t j=jj; j<jend; ++j )
// Row range inside the current chunk, clipped by the lower/upper structure of A.
4914 const size_t ibegin( ( IsLower_v<MT4> )
4915 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
4917 const size_t ipos( ( IsUpper_v<MT4> )
4918 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
4921 for(
size_t i=ibegin; i<ipos; ++i ) {
4922 C(i,j) -= A(i,j) * B(j,j);
// Default kernel for the subtraction assignment of A * B to C, selected (see the
// EnableIf_t condition) when C is row-major, A (MT4) is diagonal and B (MT5) is
// not diagonal. Only the diagonal of A is read: C(i,j) -= A(i,i) * B(i,j).
4945 template<
typename MT3
4948 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4949 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Step width of the ii/jj loops below.
4951 constexpr size_t block( BLOCK_SIZE );
4953 const size_t M( A.rows() );
4954 const size_t N( B.columns() );
// Visit C in chunks of at most block x block elements: rows [ii,iend), columns [jj,jend).
4956 for(
size_t ii=0UL; ii<M; ii+=block ) {
4957 const size_t iend(
min( M, ii+block ) );
4958 for(
size_t jj=0UL; jj<N; jj+=block ) {
4959 const size_t jend(
min( N, jj+block ) );
4960 for(
size_t i=ii; i<iend; ++i )
// Column range inside the current chunk, clipped by the upper/lower structure of B.
4962 const size_t jbegin( ( IsUpper_v<MT5> )
4963 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
4965 const size_t jpos( ( IsLower_v<MT5> )
4966 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
4969 for(
size_t j=jbegin; j<jpos; ++j ) {
4970 C(i,j) -= A(i,i) * B(i,j);
// Default kernel for the subtraction assignment of A * B to C, selected (see the
// EnableIf_t condition) when C is column-major, A (MT4) is diagonal and B (MT5)
// is not diagonal. Only the diagonal of A is read: C(i,j) -= A(i,i) * B(i,j).
4993 template<
typename MT3
4996 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4997 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
4999 const size_t M( A.rows() );
5000 const size_t N( B.columns() );
5002 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, restricted by the lower/upper structure of B.
5004 const size_t ibegin( ( IsLower_v<MT5> )
5005 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5007 const size_t iend( ( IsUpper_v<MT5> )
5008 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos bounds the part of [ibegin,iend) that is processed two rows at a time.
5012 const size_t inum( iend - ibegin );
5013 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
5016 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
5017 C(i ,j) -= A(i ,i ) * B(i ,j);
5018 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row at i == ipos.
// NOTE(review): the condition guarding this statement is not visible here -- confirm.
5021 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default kernel for the subtraction assignment of A * B to C, selected (see the
// EnableIf_t condition) when both A (MT4) and B (MT5) are diagonal.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i).
5042 template<
typename MT3
5045 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5046 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5048 for(
size_t i=0UL; i<A.rows(); ++i ) {
5049 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction assignment kernel, fallback overload.
// Active when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t);
// it simply forwards to selectDefaultSubAssignKernel().
5069 template<
typename MT3
5072 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5073 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
5075 selectDefaultSubAssignKernel( C, A, B );
// Small-matrix subtraction assignment kernel, vectorized overload for a row-major
// target. Active when C is row-major and UseVectorizedDefaultKernel_v is true.
// C is processed in tiles (3x3, 3x2, 3x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1): per tile
// the products over k are accumulated in SIMDType variables (xmm*), folded with
// sum() and subtracted from the corresponding elements of C.
// NOTE(review): declarations of i, k, kpos, a1 and b1 and several branch/brace
// lines are not visible in this excerpt.
5095 template<
typename MT3
5098 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5099 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// The "remainder && k<kend" loops below are only active if at least one operand is not padded.
5101 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5103 const size_t M( A.rows() );
5104 const size_t N( B.columns() );
5105 const size_t K( A.columns() );
// --- Blocks of three rows (skipped entirely when both LOW and UPP are set) ---
5111 for( ; !(
LOW &&
UPP ) && (i+3UL) <= M; i+=3UL )
// Column range of the row block: ends at i+3 for LOW, starts at i for UPP.
5113 const size_t jend(
LOW ? i+3UL : N );
5114 size_t j(
UPP ? i : 0UL );
// 3x3 tiles.
5116 for( ; (j+3UL) <= jend; j+=3UL )
5118 const size_t kbegin( ( IsUpper_v<MT4> )
5121 const size_t kend( ( IsLower_v<MT4> )
5122 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
5123 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
5150 a2 = A.load(i+1UL,k);
5151 a3 = A.load(i+2UL,k);
5153 b2 = B.load(k,j+1UL);
5154 b3 = B.load(k,j+2UL);
// Fold the nine accumulators into the 3x3 tile of C.
5166 C(i ,j ) -=
sum( xmm1 );
5167 C(i ,j+1UL) -=
sum( xmm2 );
5168 C(i ,j+2UL) -=
sum( xmm3 );
5169 C(i+1UL,j ) -=
sum( xmm4 );
5170 C(i+1UL,j+1UL) -=
sum( xmm5 );
5171 C(i+1UL,j+2UL) -=
sum( xmm6 );
5172 C(i+2UL,j ) -=
sum( xmm7 );
5173 C(i+2UL,j+1UL) -=
sum( xmm8 );
5174 C(i+2UL,j+2UL) -=
sum( xmm9 );
// Element-wise handling of the k values left after the vectorized part.
5176 for( ; remainder && k<kend; ++k ) {
5177 C(i ,j ) -= A(i ,k) * B(k,j );
5178 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5179 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5180 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5181 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5182 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5183 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5184 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5185 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
// Purely element-wise path over k.
// NOTE(review): the branch condition enclosing this loop is not visible here.
5190 for( ; k<kend; ++k ) {
5191 C(i ,j ) -= A(i ,k) * B(k,j );
5192 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5193 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5194 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5195 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5196 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5197 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5198 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5199 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
// 3x2 tiles.
5204 for( ; (j+2UL) <= jend; j+=2UL )
5206 const size_t kbegin( ( IsUpper_v<MT4> )
5209 const size_t kend( ( IsLower_v<MT4> )
5210 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
5211 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5234 a2 = A.load(i+1UL,k);
5235 a3 = A.load(i+2UL,k);
5237 b2 = B.load(k,j+1UL);
5246 C(i ,j ) -=
sum( xmm1 );
5247 C(i ,j+1UL) -=
sum( xmm2 );
5248 C(i+1UL,j ) -=
sum( xmm3 );
5249 C(i+1UL,j+1UL) -=
sum( xmm4 );
5250 C(i+2UL,j ) -=
sum( xmm5 );
5251 C(i+2UL,j+1UL) -=
sum( xmm6 );
5253 for( ; remainder && k<kend; ++k ) {
5254 C(i ,j ) -= A(i ,k) * B(k,j );
5255 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5256 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5257 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5258 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5259 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5264 for( ; k<kend; ++k ) {
5265 C(i ,j ) -= A(i ,k) * B(k,j );
5266 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5267 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5268 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5269 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5270 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
// 3x1: three rows against a single column j (its loop/branch header is not visible here).
5277 const size_t kbegin( ( IsUpper_v<MT4> )
5280 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
5290 SIMDType xmm1( A.load(i ,k) * b1 );
5291 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5292 SIMDType xmm3( A.load(i+2UL,k) * b1 );
5296 xmm1 += A.load(i ,k) * b1;
5297 xmm2 += A.load(i+1UL,k) * b1;
5298 xmm3 += A.load(i+2UL,k) * b1;
5301 C(i ,j) -=
sum( xmm1 );
5302 C(i+1UL,j) -=
sum( xmm2 );
5303 C(i+2UL,j) -=
sum( xmm3 );
5305 for( ; remainder && k<kend; ++k ) {
5306 C(i ,j) -= A(i ,k) * B(k,j);
5307 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5308 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5313 for( ; k<kend; ++k ) {
5314 C(i ,j) -= A(i ,k) * B(k,j);
5315 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5316 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
// --- Blocks of two rows (skipped entirely when both LOW and UPP are set) ---
5322 for( ; !(
LOW &&
UPP ) && (i+2UL) <= M; i+=2UL )
5324 const size_t jend(
LOW ? i+2UL : N );
5325 size_t j(
UPP ? i : 0UL );
// 2x4 tiles.
5327 for( ; (j+4UL) <= jend; j+=4UL )
5329 const size_t kbegin( ( IsUpper_v<MT4> )
5332 const size_t kend( ( IsLower_v<MT4> )
5333 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
5334 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
5360 a2 = A.load(i+1UL,k);
5362 b2 = B.load(k,j+1UL);
5363 b3 = B.load(k,j+2UL);
5364 b4 = B.load(k,j+3UL);
5375 C(i ,j ) -=
sum( xmm1 );
5376 C(i ,j+1UL) -=
sum( xmm2 );
5377 C(i ,j+2UL) -=
sum( xmm3 );
5378 C(i ,j+3UL) -=
sum( xmm4 );
5379 C(i+1UL,j ) -=
sum( xmm5 );
5380 C(i+1UL,j+1UL) -=
sum( xmm6 );
5381 C(i+1UL,j+2UL) -=
sum( xmm7 );
5382 C(i+1UL,j+3UL) -=
sum( xmm8 );
5384 for( ; remainder && k<kend; ++k ) {
5385 C(i ,j ) -= A(i ,k) * B(k,j );
5386 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5387 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5388 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
5389 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5390 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5391 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5392 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
5397 for( ; k<kend; ++k ) {
5398 C(i ,j ) -= A(i ,k) * B(k,j );
5399 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5400 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5401 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL);
5402 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5403 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5404 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5405 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL);
// 2x2 tiles.
5410 for( ; (j+2UL) <= jend; j+=2UL )
5412 const size_t kbegin( ( IsUpper_v<MT4> )
5415 const size_t kend( ( IsLower_v<MT4> )
5416 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
5417 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5437 a2 = A.load(i+1UL,k);
5439 b2 = B.load(k,j+1UL);
5446 C(i ,j ) -=
sum( xmm1 );
5447 C(i ,j+1UL) -=
sum( xmm2 );
5448 C(i+1UL,j ) -=
sum( xmm3 );
5449 C(i+1UL,j+1UL) -=
sum( xmm4 );
5451 for( ; remainder && k<kend; ++k ) {
5452 C(i ,j ) -= A(i ,k) * B(k,j );
5453 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5454 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5455 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5460 for( ; k<kend; ++k ) {
5461 C(i ,j ) -= A(i ,k) * B(k,j );
5462 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5463 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5464 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1: two rows against a single column j (its loop/branch header is not visible here).
5471 const size_t kbegin( ( IsUpper_v<MT4> )
5474 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
5484 SIMDType xmm1( A.load(i ,k) * b1 );
5485 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5489 xmm1 += A.load(i ,k) * b1;
5490 xmm2 += A.load(i+1UL,k) * b1;
5493 C(i ,j) -=
sum( xmm1 );
5494 C(i+1UL,j) -=
sum( xmm2 );
5496 for( ; remainder && k<kend; ++k ) {
5497 C(i ,j) -= A(i ,k) * B(k,j);
5498 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5503 for( ; k<kend; ++k ) {
5504 C(i ,j) -= A(i ,k) * B(k,j);
5505 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// --- Single row i (the enclosing loop header is not visible in this excerpt) ---
5513 const size_t jend(
LOW ? i+1UL : N );
5514 size_t j(
UPP ? i : 0UL );
// 1x4 tiles (skipped when both LOW and UPP are set).
5516 for( ; !(
LOW &&
UPP ) && (j+4UL) <= jend; j+=4UL )
5518 const size_t kbegin( ( IsUpper_v<MT4> )
5521 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
5531 SIMDType xmm1( a1 * B.load(k,j ) );
5532 SIMDType xmm2( a1 * B.load(k,j+1UL) );
5533 SIMDType xmm3( a1 * B.load(k,j+2UL) );
5534 SIMDType xmm4( a1 * B.load(k,j+3UL) );
5538 xmm1 += a1 * B.load(k,j );
5539 xmm2 += a1 * B.load(k,j+1UL);
5540 xmm3 += a1 * B.load(k,j+2UL);
5541 xmm4 += a1 * B.load(k,j+3UL);
5544 C(i,j ) -=
sum( xmm1 );
5545 C(i,j+1UL) -=
sum( xmm2 );
5546 C(i,j+2UL) -=
sum( xmm3 );
5547 C(i,j+3UL) -=
sum( xmm4 );
5549 for( ; remainder && k<kend; ++k ) {
5550 C(i,j ) -= A(i,k) * B(k,j );
5551 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5552 C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
5553 C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
5558 for( ; k<kend; ++k ) {
5559 C(i,j ) -= A(i,k) * B(k,j );
5560 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5561 C(i,j+2UL) -= A(i,k) * B(k,j+2UL);
5562 C(i,j+3UL) -= A(i,k) * B(k,j+3UL);
// 1x2 tiles.
5567 for( ; (j+2UL) <= jend; j+=2UL )
5569 const size_t kbegin( ( IsUpper_v<MT4> )
5572 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
5582 SIMDType xmm1( a1 * B.load(k,j ) );
5583 SIMDType xmm2( a1 * B.load(k,j+1UL) );
5587 xmm1 += a1 * B.load(k,j );
5588 xmm2 += a1 * B.load(k,j+1UL);
5591 C(i,j ) -=
sum( xmm1 );
5592 C(i,j+1UL) -=
sum( xmm2 );
5594 for( ; remainder && k<kend; ++k ) {
5595 C(i,j ) -= A(i,k) * B(k,j );
5596 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
5601 for( ; k<kend; ++k ) {
5602 C(i,j ) -= A(i,k) * B(k,j );
5603 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1: single element C(i,j); here the inner index runs up to K.
5610 const size_t kbegin( ( IsUpper_v<MT4> )
5621 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
5624 xmm1 += A.load(i,k) * B.load(k,j);
5627 C(i,j) -=
sum( xmm1 );
5629 for( ; remainder && k<K; ++k ) {
5630 C(i,j) -= A(i,k) * B(k,j);
5636 C(i,j) -= A(i,k) * B(k,j);
// Small-matrix subtraction assignment kernel, vectorized overload for a
// column-major target. Active when C is column-major and
// UseVectorizedDefaultKernel_v is true.
// C is processed in tiles (4x2, 4x1, 3x3, 3x2, 3x1, 2x2, 2x1, 1x2, 1x1): per tile
// the products over k are accumulated in SIMDType variables (xmm*), folded with
// sum() and subtracted from the corresponding elements of C.
// NOTE(review): declarations of i, j, k, kpos, a1 and b1 and several branch/brace
// lines are not visible in this excerpt.
5660 template<
typename MT3
5663 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
5664 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// The "remainder && k<kend" loops below are only active if at least one operand is not padded.
5666 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
5668 const size_t M( A.rows() );
5669 const size_t N( B.columns() );
5670 const size_t K( A.columns() );
// --- Blocks of four rows (only when neither LOW nor UPP is set) ---
5676 for( ; !
LOW && !
UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tiles.
5680 for( ; (j+2UL) <= N; j+=2UL )
5682 const size_t kbegin( ( IsUpper_v<MT4> )
5685 const size_t kend( ( IsLower_v<MT4> )
5686 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
5687 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5713 a2 = A.load(i+1UL,k);
5714 a3 = A.load(i+2UL,k);
5715 a4 = A.load(i+3UL,k);
5717 b2 = B.load(k,j+1UL);
// Fold the eight accumulators into the 4x2 tile of C.
5728 C(i ,j ) -=
sum( xmm1 );
5729 C(i ,j+1UL) -=
sum( xmm2 );
5730 C(i+1UL,j ) -=
sum( xmm3 );
5731 C(i+1UL,j+1UL) -=
sum( xmm4 );
5732 C(i+2UL,j ) -=
sum( xmm5 );
5733 C(i+2UL,j+1UL) -=
sum( xmm6 );
5734 C(i+3UL,j ) -=
sum( xmm7 );
5735 C(i+3UL,j+1UL) -=
sum( xmm8 );
// Element-wise handling of the k values left after the vectorized part.
5737 for( ; remainder && k<kend; ++k ) {
5738 C(i ,j ) -= A(i ,k) * B(k,j );
5739 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5740 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5741 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5742 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5743 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5744 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
5745 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// Purely element-wise path over k.
// NOTE(review): the branch condition enclosing this loop is not visible here.
5750 for( ; k<kend; ++k ) {
5751 C(i ,j ) -= A(i ,k) * B(k,j );
5752 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5753 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5754 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5755 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5756 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5757 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j );
5758 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL);
// 4x1: four rows against a single column j (its loop/branch header is not visible here).
5765 const size_t kbegin( ( IsUpper_v<MT4> )
5768 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
5778 SIMDType xmm1( A.load(i ,k) * b1 );
5779 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5780 SIMDType xmm3( A.load(i+2UL,k) * b1 );
5781 SIMDType xmm4( A.load(i+3UL,k) * b1 );
5785 xmm1 += A.load(i ,k) * b1;
5786 xmm2 += A.load(i+1UL,k) * b1;
5787 xmm3 += A.load(i+2UL,k) * b1;
5788 xmm4 += A.load(i+3UL,k) * b1;
5791 C(i ,j) -=
sum( xmm1 );
5792 C(i+1UL,j) -=
sum( xmm2 );
5793 C(i+2UL,j) -=
sum( xmm3 );
5794 C(i+3UL,j) -=
sum( xmm4 );
5796 for( ; remainder && k<kend; ++k ) {
5797 C(i ,j) -= A(i ,k) * B(k,j);
5798 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5799 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5800 C(i+3UL,j) -= A(i+3UL,k) * B(k,j);
5805 for( ; k<kend; ++k ) {
5806 C(i ,j) -= A(i ,k) * B(k,j);
5807 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
5808 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
5809 C(i+3UL,j) -= A(i+3UL,k) * B(k,j);
// --- Blocks of three rows (only when neither LOW nor UPP is set) ---
5815 for( ; !
LOW && !
UPP && (i+3UL) <= M; i+=3UL )
// 3x3 tiles.
5819 for( ; (j+3UL) <= N; j+=3UL )
5821 const size_t kbegin( ( IsUpper_v<MT4> )
5824 const size_t kend( ( IsLower_v<MT4> )
5825 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
5826 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
5853 a2 = A.load(i+1UL,k);
5854 a3 = A.load(i+2UL,k);
5856 b2 = B.load(k,j+1UL);
5857 b3 = B.load(k,j+2UL);
5869 C(i ,j ) -=
sum( xmm1 );
5870 C(i ,j+1UL) -=
sum( xmm2 );
5871 C(i ,j+2UL) -=
sum( xmm3 );
5872 C(i+1UL,j ) -=
sum( xmm4 );
5873 C(i+1UL,j+1UL) -=
sum( xmm5 );
5874 C(i+1UL,j+2UL) -=
sum( xmm6 );
5875 C(i+2UL,j ) -=
sum( xmm7 );
5876 C(i+2UL,j+1UL) -=
sum( xmm8 );
5877 C(i+2UL,j+2UL) -=
sum( xmm9 );
5879 for( ; remainder && k<kend; ++k ) {
5880 C(i ,j ) -= A(i ,k) * B(k,j );
5881 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5882 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5883 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5884 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5885 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5886 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5887 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5888 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
5893 for( ; k<kend; ++k ) {
5894 C(i ,j ) -= A(i ,k) * B(k,j );
5895 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5896 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL);
5897 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5898 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5899 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL);
5900 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5901 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5902 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL);
// 3x2 tiles.
5907 for( ; (j+2UL) <= N; j+=2UL )
5909 const size_t kbegin( ( IsUpper_v<MT4> )
5912 const size_t kend( ( IsLower_v<MT4> )
5913 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
5914 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
5937 a2 = A.load(i+1UL,k);
5938 a3 = A.load(i+2UL,k);
5940 b2 = B.load(k,j+1UL);
5949 C(i ,j ) -=
sum( xmm1 );
5950 C(i ,j+1UL) -=
sum( xmm2 );
5951 C(i+1UL,j ) -=
sum( xmm3 );
5952 C(i+1UL,j+1UL) -=
sum( xmm4 );
5953 C(i+2UL,j ) -=
sum( xmm5 );
5954 C(i+2UL,j+1UL) -=
sum( xmm6 );
5956 for( ; remainder && k<kend; ++k ) {
5957 C(i ,j ) -= A(i ,k) * B(k,j );
5958 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5959 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5960 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5961 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5962 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
5967 for( ; k<kend; ++k ) {
5968 C(i ,j ) -= A(i ,k) * B(k,j );
5969 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
5970 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
5971 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
5972 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j );
5973 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL);
// 3x1: three rows against a single column j (its loop/branch header is not visible here).
5980 const size_t kbegin( ( IsUpper_v<MT4> )
5983 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
5993 SIMDType xmm1( A.load(i ,k) * b1 );
5994 SIMDType xmm2( A.load(i+1UL,k) * b1 );
5995 SIMDType xmm3( A.load(i+2UL,k) * b1 );
5999 xmm1 += A.load(i ,k) * b1;
6000 xmm2 += A.load(i+1UL,k) * b1;
6001 xmm3 += A.load(i+2UL,k) * b1;
6004 C(i ,j) -=
sum( xmm1 );
6005 C(i+1UL,j) -=
sum( xmm2 );
6006 C(i+2UL,j) -=
sum( xmm3 );
6008 for( ; remainder && k<kend; ++k ) {
6009 C(i ,j) -= A(i ,k) * B(k,j);
6010 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6011 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
6016 for( ; k<kend; ++k ) {
6017 C(i ,j) -= A(i ,k) * B(k,j);
6018 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6019 C(i+2UL,j) -= A(i+2UL,k) * B(k,j);
// --- Blocks of two rows (also used for LOW/UPP results) ---
6025 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the row block: ends at i+2 for LOW, starts at i for UPP.
6027 const size_t jend(
LOW ? i+2UL : N );
6028 size_t j(
UPP ? i : 0UL );
// 2x2 tiles.
6030 for( ; (j+2UL) <= jend; j+=2UL )
6032 const size_t kbegin( ( IsUpper_v<MT4> )
6035 const size_t kend( ( IsLower_v<MT4> )
6036 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
6037 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
6057 a2 = A.load(i+1UL,k);
6059 b2 = B.load(k,j+1UL);
6066 C(i ,j ) -=
sum( xmm1 );
6067 C(i ,j+1UL) -=
sum( xmm2 );
6068 C(i+1UL,j ) -=
sum( xmm3 );
6069 C(i+1UL,j+1UL) -=
sum( xmm4 );
6071 for( ; remainder && k<kend; ++k ) {
6072 C(i ,j ) -= A(i ,k) * B(k,j );
6073 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
6074 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
6075 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
6080 for( ; k<kend; ++k ) {
6081 C(i ,j ) -= A(i ,k) * B(k,j );
6082 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL);
6083 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j );
6084 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL);
// 2x1: two rows against a single column j (its loop/branch header is not visible here).
6091 const size_t kbegin( ( IsUpper_v<MT4> )
6094 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
6104 SIMDType xmm1( A.load(i ,k) * b1 );
6105 SIMDType xmm2( A.load(i+1UL,k) * b1 );
6109 xmm1 += A.load(i ,k) * b1;
6110 xmm2 += A.load(i+1UL,k) * b1;
6113 C(i ,j) -=
sum( xmm1 );
6114 C(i+1UL,j) -=
sum( xmm2 );
6116 for( ; remainder && k<kend; ++k ) {
6117 C(i ,j) -= A(i ,k) * B(k,j);
6118 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
6123 for( ; k<kend; ++k ) {
6124 C(i ,j) -= A(i ,k) * B(k,j);
6125 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// --- Single row i (the enclosing loop header is not visible in this excerpt) ---
6133 const size_t jend(
LOW ? i+1UL : N );
6134 size_t j(
UPP ? i : 0UL );
// 1x2 tiles.
6136 for( ; (j+2UL) <= jend; j+=2UL )
6138 const size_t kbegin( ( IsUpper_v<MT4> )
6141 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
6151 SIMDType xmm1( a1 * B.load(k,j ) );
6152 SIMDType xmm2( a1 * B.load(k,j+1UL) );
6156 xmm1 += a1 * B.load(k,j );
6157 xmm2 += a1 * B.load(k,j+1UL);
6160 C(i,j ) -=
sum( xmm1 );
6161 C(i,j+1UL) -=
sum( xmm2 );
6163 for( ; remainder && k<kend; ++k ) {
6164 C(i,j ) -= A(i,k) * B(k,j );
6165 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
6170 for( ; k<kend; ++k ) {
6171 C(i,j ) -= A(i,k) * B(k,j );
6172 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// 1x1: single element C(i,j); here the inner index runs up to K.
6179 const size_t kbegin( ( IsUpper_v<MT4> )
6190 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
6193 xmm1 += A.load(i,k) * B.load(k,j);
6196 C(i,j) -=
sum( xmm1 );
6198 for( ; remainder && k<K; ++k ) {
6199 C(i,j) -= A(i,k) * B(k,j);
6205 C(i,j) -= A(i,k) * B(k,j);
// Large-matrix subtraction assignment kernel, fallback overload.
// Active when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t);
// it simply forwards to selectDefaultSubAssignKernel().
6228 template<
typename MT3
6231 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6232 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
6234 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the body of this overload is not visible in this excerpt.
6254 template<
typename MT3
6257 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6258 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS-based subtraction assignment kernel, fallback overload.
// Active when UseBlasKernel_v<MT3,MT4,MT5> is false (DisableIf_t); it forwards
// to selectLargeSubAssignKernel().
6284 template<
typename MT3
6287 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6288 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6290 selectLargeSubAssignKernel( C, A, B );
6296#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction assignment kernel, enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
//  - A triangular: a copy of B is multiplied from the left by A via trmm(),
//    then subtracted from C.
//  - B triangular: a copy of A is multiplied from the right by B via trmm(),
//    then subtracted from C.
//  - otherwise: a single gemm() call with scalar arguments ET(-1) and ET(1).
6310 template<
typename MT3
6313 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
6314 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
6316 using ET = ElementType_t<MT3>;
6318 if( IsTriangular_v<MT4> ) {
// tmp starts as a copy of B and is passed to trmm() together with A.
6319 ResultType_t<MT3> tmp(
serial( B ) );
6320 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
6321 subAssign( C, tmp );
6323 else if( IsTriangular_v<MT5> ) {
// tmp starts as a copy of A and is passed to trmm() together with B.
6324 ResultType_t<MT3> tmp(
serial( A ) );
6325 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
6326 subAssign( C, tmp );
// General case.
// NOTE(review): ET(-1)/ET(1) are presumably the alpha/beta factors of the
// gemm wrapper (C = -A*B + C) -- confirm against its signature.
6329 gemm( C, A, B, ET(-1), ET(1) );
// Schur product assignment of a DMatTDMatMultExpr to a dense matrix.
// The visible part forwards to schurAssign() on the dereferenced target with `tmp`.
// NOTE(review): the construction of `tmp` is not visible in this excerpt;
// presumably it holds the evaluated product -- confirm.
6353 template<
typename MT
6355 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatTDMatMultExpr& rhs )
6367 schurAssign( *lhs, tmp );
6399 template<
typename MT
6402 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6409 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
6412 else if( rhs.lhs_.columns() == 0UL ) {
6447 template<
typename MT
6450 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6454 using TmpType = If_t< SO, OppositeType, ResultType >;
6466 const ForwardFunctor fwd;
6468 const TmpType tmp( rhs );
6490 template<
typename MT
6493 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6500 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6539 template<
typename MT
6542 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
6549 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
6585 template<
typename MT
6645template<
typename MT1
6652class DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
6653 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
6654 ,
private Computation
6659 using MMM = DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
6661 using RES = ResultType_t<MMM>;
6662 using RT1 = ResultType_t<MT1>;
6663 using RT2 = ResultType_t<MT2>;
6664 using ET1 = ElementType_t<RT1>;
6665 using ET2 = ElementType_t<RT2>;
6666 using CT1 = CompositeType_t<MT1>;
6667 using CT2 = CompositeType_t<MT2>;
6672 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
6677 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
6681 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
6682 static constexpr bool HERM = ( HF && !( LF || UF ) );
6683 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
6684 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
6692 template<
typename T1,
typename T2,
typename T3 >
6693 static constexpr bool IsEvaluationRequired_v = ( evaluateLeft || evaluateRight );
6700 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6701 static constexpr bool UseBlasKernel_v =
6703 !SYM && !HERM && !LOW && !UPP &&
6704 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
6705 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
6706 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
6707 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
6708 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6709 IsBLASCompatible_v< ElementType_t<T1> > &&
6710 IsBLASCompatible_v< ElementType_t<T2> > &&
6711 IsBLASCompatible_v< ElementType_t<T3> > &&
6712 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
6713 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
6714 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
6721 template<
typename T1,
typename T2,
typename T3,
typename T4 >
6722 static constexpr bool UseVectorizedDefaultKernel_v =
6723 ( useOptimizedKernels &&
6724 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
6725 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
6726 IsSIMDCombinable_v< ElementType_t<T1>
6730 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
6731 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
6738 using ForwardFunctor =
If_t< HERM
6754 using This = DMatScalarMultExpr<MMM,ST,false>;
6757 using BaseType = MatScalarMultExpr< DenseMatrix<This,false> >;
6761 , DeclHermTrait< MultTrait_t<RES,ST> >
6763 , DeclSymTrait< MultTrait_t<RES,ST> >
6766 , DeclDiagTrait< MultTrait_t<RES,ST> >
6767 , DeclLowTrait< MultTrait_t<RES,ST> > >
6769 , DeclUppTrait< MultTrait_t<RES,ST> >
6770 , MultTrait<RES,ST> > > > >::Type;
6775 using SIMDType = SIMDTrait_t<ElementType>;
6780 using LeftOperand =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
6786 using LT = If_t< evaluateLeft, const RT1, CT1 >;
6789 using RT = If_t< evaluateRight, const RT2, CT2 >;
6795 ( !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2> &&
6796 MT1::simdEnabled && MT2::simdEnabled &&
6797 IsSIMDCombinable_v<ET1,ET2,ST> &&
6798 HasSIMDAdd_v<ET1,ET2> &&
6799 HasSIMDMult_v<ET1,ET2> );
6803 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
6849 if( j >=
matrix_.columns() ) {
6852 return (*
this)(i,j);
6861 inline size_t rows()
const {
6871 inline size_t columns()
const {
// Returns whether the expression can alias with the given address.
// The query is delegated to the wrapped matrix multiplication expression (matrix_).
6902 template<
typename T >
6903 inline bool canAlias(
const T* alias )
const {
6904 return matrix_.canAlias( alias );
// Returns whether the expression is aliased with the given address.
// The query is delegated to the wrapped matrix multiplication expression (matrix_).
6914 template<
typename T >
6915 inline bool isAliased(
const T* alias )
const {
6916 return matrix_.isAliased( alias );
6937 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
6939 (
rows() *
columns() < DMATTDMATMULT_THRESHOLD ) ) &&
6940 (
rows() *
columns() >= SMP_DMATTDMATMULT_THRESHOLD );
6962 template<
typename MT
6971 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6972 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
6974 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
6977 else if( left.columns() == 0UL ) {
6992 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel dispatch for the assignment of the scaled product (scalar * A * B) to C.
// The small kernel is chosen when either operand is diagonal or when the number
// of elements of C is below DMATTDMATMULT_THRESHOLD; the BLAS kernel selector is
// the alternative.
// NOTE(review): the `else` line between the two calls is not visible in this excerpt.
7007 template<
typename MT3
7011 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7013 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
7014 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
7015 selectSmallAssignKernel( C, A, B, scalar );
7017 selectBlasAssignKernel( C, A, B, scalar );
// Default kernel for the assignment of the scaled product to C. Per the
// EnableIf_t condition this overload is selected when C is row-major and neither
// A (MT4) nor B (MT5) is diagonal.
// NOTE(review): the bodies of the loops outside [ibegin,iend) x [jbegin,jend) and
// every use of `scalar` are not visible in this excerpt; presumably those loops
// reset elements of C and the result is scaled after accumulation -- confirm.
7035 template<
typename MT3
7039 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7040 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Dimensions: rows of A, columns of B, and the shared inner dimension (columns of A).
7042 const size_t M( A.rows() );
7043 const size_t N( B.columns() );
7044 const size_t K( A.columns() );
// Row range that receives computed values: starts at 1 (or 2) when A (and B) are
// strictly lower, ends at M-1 (or M-2) when A (and B) are strictly upper.
7048 const size_t ibegin( ( IsStrictlyLower_v<MT4> )
7049 ?( ( IsStrictlyLower_v<MT5> && M > 1UL ) ? 2UL : 1UL )
7051 const size_t iend( ( IsStrictlyUpper_v<MT4> )
7052 ?( ( IsStrictlyUpper_v<MT5> && M > 1UL ) ? M-2UL : M-1UL )
// Rows before ibegin (loop body not visible here).
7056 for(
size_t i=0UL; i<ibegin; ++i ) {
7057 for(
size_t j=0UL; j<N; ++j ) {
7061 for(
size_t i=ibegin; i<iend; ++i )
// Column range for row i: derived from the upper/lower structure of both
// operands and from the SYM/HERM/LOW/UPP flags of the enclosing expression.
7063 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7064 ?( ( IsStrictlyUpper_v<MT4> )
7065 ?( IsStrictlyUpper_v<MT5> ? i+2UL : i+1UL )
7066 :( IsStrictlyUpper_v<MT5> ? i+1UL : i ) )
7067 :( ( IsStrictlyUpper_v<MT5> )
7068 ?( SYM || HERM || UPP ?
max( i, 1UL ) : 1UL )
7069 :( SYM || HERM || UPP ? i : 0UL ) ) );
7070 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
7071 ?( ( IsStrictlyLower_v<MT4> )
7072 ?( IsStrictlyLower_v<MT5> ? i-1UL : i )
7073 :( IsStrictlyLower_v<MT5> ? i : i+1UL ) )
7074 :( ( IsStrictlyLower_v<MT5> )
7075 ?( LOW ?
min(i+1UL,N-1UL) : N-1UL )
7076 :( LOW ? i+1UL : N ) ) );
// Inverted column range: the whole row is handled by the loop below (body not visible here).
7078 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) ) {
7079 for(
size_t j=0UL; j<N; ++j ) {
// Columns before jbegin; for SYM/HERM the loop starts at the diagonal (body not visible here).
7087 for(
size_t j=( SYM || HERM ? i : 0UL ); j<jbegin; ++j ) {
7090 for(
size_t j=jbegin; j<jend; ++j )
// Range of the inner index k for element (i,j), restricted by the triangular
// structure of A and B.
7092 const size_t kbegin( ( IsUpper_v<MT4> )
7093 ?( ( IsLower_v<MT5> )
7094 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7095 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7096 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7097 :( ( IsLower_v<MT5> )
7098 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7100 const size_t kend( ( IsLower_v<MT4> )
7101 ?( ( IsUpper_v<MT5> )
7102 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
7103 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7104 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7105 :( ( IsUpper_v<MT5> )
7106 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first term initializes C(i,j); the remaining terms are accumulated.
7110 C(i,j) = A(i,kbegin) * B(kbegin,j);
7111 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
7112 C(i,j) += A(i,k) * B(k,j);
// Columns from jend on (body not visible here).
7116 for(
size_t j=jend; j<N; ++j ) {
// Rows from iend on (body not visible here).
7120 for(
size_t i=iend; i<M; ++i ) {
7121 for(
size_t j=0UL; j<N; ++j ) {
// Copy the strictly upper part of C into the strictly lower part, conjugated for HERM.
// NOTE(review): the condition guarding this section is not visible here.
7127 for(
size_t i=1UL; i<M; ++i ) {
7128 for(
size_t j=0UL; j<i; ++j ) {
7129 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel, enabled when the target C is column-major and neither
// A nor B is diagonal (see the EnableIf_t condition below). C is traversed column by
// column and every element is formed from the products A(i,k) * B(k,j).
// NOTE(review): 'scalar' is not used in the statements visible here; presumably it is
// applied to the computed elements elsewhere in this function - confirm.
7150 template<
typename MT3
7154 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7155 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M and N bound the row and column loops over C; K is the number of columns of A.
7157 const size_t M( A.rows() );
7158 const size_t N( B.columns() );
7159 const size_t K( A.columns() );
// Column range [jbegin,jend) of the main loop, narrowed when the operands are strictly
// upper/lower triangular. The 'N > 1UL' guards keep the fixed offsets within bounds.
7163 const size_t jbegin( ( IsStrictlyUpper_v<MT5> )
7164 ?( ( IsStrictlyUpper_v<MT4> && N > 1UL ) ? 2UL : 1UL )
7166 const size_t jend( ( IsStrictlyLower_v<MT5> )
7167 ?( ( IsStrictlyLower_v<MT4> && N > 1UL ) ? N-2UL : N-1UL )
// Leading columns [0,jbegin) are handled by their own loop over all rows.
7171 for(
size_t j=0UL; j<jbegin; ++j ) {
7172 for(
size_t i=0UL; i<M; ++i ) {
// Main loop over the columns [jbegin,jend).
7176 for(
size_t j=jbegin; j<jend; ++j )
// Row range [ibegin,iend) for column j, derived from the lower/upper structure of
// A and B and from the SYM/HERM/LOW/UPP flags of the expression.
7178 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
7179 ?( ( IsStrictlyLower_v<MT4> )
7180 ?( IsStrictlyLower_v<MT5> ? j+2UL : j+1UL )
7181 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7182 :( ( IsStrictlyLower_v<MT4> )
7183 ?( SYM || HERM || LOW ?
max( j, 1UL ) : 1UL )
7184 :( SYM || HERM || LOW ? j : 0UL ) ) );
7185 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
7186 ?( ( IsStrictlyUpper_v<MT4> )
7187 ?( ( IsStrictlyUpper_v<MT5> )?( j-1UL ):( j ) )
7188 :( ( IsStrictlyUpper_v<MT5> )?( j ):( j+1UL ) ) )
7189 :( ( IsStrictlyUpper_v<MT4> )
7190 ?( UPP ?
min(j+1UL,M-1UL) : M-1UL )
7191 :( UPP ? j+1UL : M ) ) );
// With any structure flag set, an inverted range (ibegin > iend) is treated as a
// special case that visits every row of the column.
7193 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) ) {
7194 for(
size_t i=0UL; i<M; ++i ) {
// Rows in front of ibegin; for SYM/HERM the loop starts at row j instead of row 0.
7202 for(
size_t i=( SYM || HERM ? j : 0UL ); i<ibegin; ++i ) {
// Rows [ibegin,iend): the elements that are actually computed.
7205 for(
size_t i=ibegin; i<iend; ++i )
// Range [kbegin,kend) of the inner index, restricted by the triangular structure
// of A (depends on i) and of B (depends on j).
7207 const size_t kbegin( ( IsUpper_v<MT4> )
7208 ?( ( IsLower_v<MT5> )
7209 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7210 , ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7211 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7212 :( ( IsLower_v<MT5> )
7213 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7215 const size_t kend( ( IsLower_v<MT4> )
7216 ?( ( IsUpper_v<MT5> )
7217 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL )
7218 , ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7219 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7220 :( ( IsUpper_v<MT5> )
7221 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// The first product initializes C(i,j); the remaining products are accumulated.
7225 C(i,j) = A(i,kbegin) * B(kbegin,j);
7226 for(
size_t k=kbegin+1UL; k<kend; ++k ) {
7227 C(i,j) += A(i,k) * B(k,j);
// Rows from iend to the end of the column.
7231 for(
size_t i=iend; i<M; ++i ) {
// Trailing columns [jend,N) are handled by their own loop over all rows.
7235 for(
size_t j=jend; j<N; ++j ) {
7236 for(
size_t i=0UL; i<M; ++i ) {
// Mirror pass: every element above the diagonal (i < j) is taken from its transposed
// position C(j,i), conjugated in the Hermitian case.
7242 for(
size_t j=1UL; j<N; ++j ) {
7243 for(
size_t i=0UL; i<j; ++i ) {
7244 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel, enabled when the target C is row-major, A is not diagonal
// and B is diagonal. With a diagonal B every computed element reduces to the single
// product C(i,j) = A(i,j) * B(j,j) * scalar.
7265 template<
typename MT3
7269 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7270 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7272 const size_t M( A.rows() );
7273 const size_t N( B.columns() );
// Row-wise traversal of C.
7275 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i that is computed, taken from the upper/lower
// structure of A (a strictly triangular A excludes the diagonal element).
7277 const size_t jbegin( ( IsUpper_v<MT4> )
7278 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7280 const size_t jend( ( IsLower_v<MT4> )
7281 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// For an upper A the columns in front of jbegin get a separate loop.
7285 if( IsUpper_v<MT4> ) {
7286 for(
size_t j=0UL; j<jbegin; ++j ) {
// Computed part of the row: one scaled product per element.
7290 for(
size_t j=jbegin; j<jend; ++j ) {
7291 C(i,j) = A(i,j) * B(j,j) * scalar;
// For a lower A the columns from jend on get a separate loop.
7293 if( IsLower_v<MT4> ) {
7294 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel, enabled when the target C is column-major, A is not
// diagonal and B is diagonal. C is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE
// elements; each computed element is C(i,j) = A(i,j) * B(j,j) * scalar.
7316 template<
typename MT3
7320 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7321 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Edge length of a tile.
7323 constexpr size_t block( BLOCK_SIZE );
7325 const size_t M( A.rows() );
7326 const size_t N( B.columns() );
// Outer loops step over the tiles: columns [jj,jend) and rows [ii,iend), both clamped
// to the matrix dimensions.
7328 for(
size_t jj=0UL; jj<N; jj+=block ) {
7329 const size_t jend(
min( N, jj+block ) );
7330 for(
size_t ii=0UL; ii<M; ii+=block ) {
7331 const size_t iend(
min( M, ii+block ) );
// Columns of the current tile.
7332 for(
size_t j=jj; j<jend; ++j )
// Rows [ibegin,ipos) of column j that are computed inside the tile: a lower A moves
// the start down to the diagonal, an upper A cuts the end off at the diagonal.
7334 const size_t ibegin( ( IsLower_v<MT4> )
7335 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
7337 const size_t ipos( ( IsUpper_v<MT4> )
7338 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// For a lower A the tile rows in front of ibegin get a separate loop.
7341 if( IsLower_v<MT4> ) {
7342 for(
size_t i=ii; i<ibegin; ++i ) {
// Computed part of the column: one scaled product per element.
7346 for(
size_t i=ibegin; i<ipos; ++i ) {
7347 C(i,j) = A(i,j) * B(j,j) * scalar;
// For an upper A the tile rows from ipos on get a separate loop.
7349 if( IsUpper_v<MT4> ) {
7350 for(
size_t i=ipos; i<iend; ++i ) {
// Default assignment kernel, enabled when the target C is row-major, A is diagonal and
// B is not diagonal. C is processed in tiles of at most BLOCK_SIZE x BLOCK_SIZE
// elements; each computed element is C(i,j) = A(i,i) * B(i,j) * scalar.
7374 template<
typename MT3
7378 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7379 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// Edge length of a tile.
7381 constexpr size_t block( BLOCK_SIZE );
7383 const size_t M( A.rows() );
7384 const size_t N( B.columns() );
// Outer loops step over the tiles: rows [ii,iend) and columns [jj,jend), both clamped
// to the matrix dimensions.
7386 for(
size_t ii=0UL; ii<M; ii+=block ) {
7387 const size_t iend(
min( M, ii+block ) );
7388 for(
size_t jj=0UL; jj<N; jj+=block ) {
7389 const size_t jend(
min( N, jj+block ) );
// Rows of the current tile.
7390 for(
size_t i=ii; i<iend; ++i )
// Columns [jbegin,jpos) of row i that are computed inside the tile: an upper B moves
// the start right to the diagonal, a lower B cuts the end off at the diagonal.
7392 const size_t jbegin( ( IsUpper_v<MT5> )
7393 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
7395 const size_t jpos( ( IsLower_v<MT5> )
7396 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// For an upper B the tile columns in front of jbegin get a separate loop.
7399 if( IsUpper_v<MT5> ) {
7400 for(
size_t j=jj; j<jbegin; ++j ) {
// Computed part of the row: one scaled product per element.
7404 for(
size_t j=jbegin; j<jpos; ++j ) {
7405 C(i,j) = A(i,i) * B(i,j) * scalar;
// For a lower B the tile columns from jpos on get a separate loop.
7407 if( IsLower_v<MT5> ) {
7408 for(
size_t j=jpos; j<jend; ++j ) {
// Default assignment kernel, enabled when the target C is column-major, A is diagonal
// and B is not diagonal. With a diagonal A every computed element reduces to the single
// product C(i,j) = A(i,i) * B(i,j) * scalar.
7432 template<
typename MT3
7436 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7437 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7439 const size_t M( A.rows() );
7440 const size_t N( B.columns() );
// Column-wise traversal of C.
7442 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j that is computed, taken from the lower/upper
// structure of B (a strictly triangular B excludes the diagonal element).
7444 const size_t ibegin( ( IsLower_v<MT5> )
7445 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
7447 const size_t iend( ( IsUpper_v<MT5> )
7448 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// For a lower B the rows in front of ibegin get a separate loop.
7452 if( IsLower_v<MT5> ) {
7453 for(
size_t i=0UL; i<ibegin; ++i ) {
// Computed part of the column: one scaled product per element.
7457 for(
size_t i=ibegin; i<iend; ++i ) {
7458 C(i,j) = A(i,i) * B(i,j) * scalar;
// For an upper B the rows from iend on get a separate loop.
7460 if( IsUpper_v<MT5> ) {
7461 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel, enabled when both A and B are diagonal (any storage order
// of C). Only the diagonal of C is written in the loop below.
7483 template<
typename MT3
7487 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7488 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Each diagonal element is the scaled product of the two matching diagonal elements.
7492 for(
size_t i=0UL; i<A.rows(); ++i ) {
7493 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel, fallback overload: it is enabled whenever the
// vectorized default kernel is NOT applicable for the given types (DisableIf_t on
// UseVectorizedDefaultKernel_v) and simply forwards to the default kernel.
7512 template<
typename MT3
7516 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7517 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7519 selectDefaultAssignKernel( C, A, B, scalar );
// Small-matrix assignment kernel, vectorized overload for a row-major target C
// (enabled when UseVectorizedDefaultKernel_v holds). C is computed in register tiles:
// strips of 3 rows, then strips of 2 rows, then single rows, each strip being split
// into column groups of decreasing width. Within a tile every element is the dot
// product of a row of A and a column of B along k, either accumulated in SIMD
// registers (xmm*) and reduced with sum(), or accumulated in scalar temporaries
// (value*); in both cases the result is multiplied by 'scalar'.
7538 template<
typename MT3
7542 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7543 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// 'remainder' is true if at least one operand is not padded; only then do the scalar
// tail loops ('for( ; remainder && k<kend; ++k )') after the SIMD part execute.
7545 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
7547 const size_t M( A.rows() );
7548 const size_t N( B.columns() );
7549 const size_t K( A.columns() );
// ---- Strips of three rows (skipped entirely when both LOW and UPP are set) ----
7555 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
// For a lower result only the columns up to the end of the strip are computed.
7557 const size_t jend( LOW ? i+3UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
7562 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
7563 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
7564 C(i+2UL,j) = HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
7570 reset( C(i+1UL,j) );
7571 reset( C(i+2UL,j) );
// 3x3 tile: rows i..i+2, columns j..j+2.
7575 for( ; (j+3UL) <= jend; j+=3UL )
// k-range of the tile; the upper bound follows from a lower A and/or an upper B.
7577 const size_t kbegin( ( IsUpper_v<MT4> )
7580 const size_t kend( ( IsLower_v<MT4> )
7581 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
7582 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
// SIMD path: three row vectors of A times three column vectors of B give nine
// accumulators xmm1..xmm9.
7591 SIMDType a1( A.load(i ,k) );
7592 SIMDType a2( A.load(i+1UL,k) );
7593 SIMDType a3( A.load(i+2UL,k) );
7594 SIMDType b1( B.load(k,j ) );
7595 SIMDType b2( B.load(k,j+1UL) );
7596 SIMDType b3( B.load(k,j+2UL) );
7597 SIMDType xmm1( a1 * b1 );
7598 SIMDType xmm2( a1 * b2 );
7599 SIMDType xmm3( a1 * b3 );
7600 SIMDType xmm4( a2 * b1 );
7601 SIMDType xmm5( a2 * b2 );
7602 SIMDType xmm6( a2 * b3 );
7603 SIMDType xmm7( a3 * b1 );
7604 SIMDType xmm8( a3 * b2 );
7605 SIMDType xmm9( a3 * b3 );
// Operands are reloaded for the following SIMD steps along k.
7609 a2 = A.load(i+1UL,k);
7610 a3 = A.load(i+2UL,k);
7612 b2 = B.load(k,j+1UL);
7613 b3 = B.load(k,j+2UL);
// Horizontal reduction of each accumulator, scaled, stored into the tile.
7625 C(i ,j ) =
sum( xmm1 ) * scalar;
7626 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
7627 C(i ,j+2UL) =
sum( xmm3 ) * scalar;
7628 C(i+1UL,j ) =
sum( xmm4 ) * scalar;
7629 C(i+1UL,j+1UL) =
sum( xmm5 ) * scalar;
7630 C(i+1UL,j+2UL) =
sum( xmm6 ) * scalar;
7631 C(i+2UL,j ) =
sum( xmm7 ) * scalar;
7632 C(i+2UL,j+1UL) =
sum( xmm8 ) * scalar;
7633 C(i+2UL,j+2UL) =
sum( xmm9 ) * scalar;
// Scalar tail for the k values left over by the SIMD part (unpadded operands only).
7635 for( ; remainder && k<kend; ++k ) {
7636 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
7637 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
7638 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
7639 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
7640 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
7641 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
7642 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
7643 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
7644 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
// Pure scalar path: accumulate into value1..value9 and scale once at the end.
7659 for( ++k; k<kend; ++k ) {
7660 value1 += A(i ,k) * B(k,j );
7661 value2 += A(i ,k) * B(k,j+1UL);
7662 value3 += A(i ,k) * B(k,j+2UL);
7663 value4 += A(i+1UL,k) * B(k,j );
7664 value5 += A(i+1UL,k) * B(k,j+1UL);
7665 value6 += A(i+1UL,k) * B(k,j+2UL);
7666 value7 += A(i+2UL,k) * B(k,j );
7667 value8 += A(i+2UL,k) * B(k,j+1UL);
7668 value9 += A(i+2UL,k) * B(k,j+2UL);
7671 C(i ,j ) = value1 * scalar;
7672 C(i ,j+1UL) = value2 * scalar;
7673 C(i ,j+2UL) = value3 * scalar;
7674 C(i+1UL,j ) = value4 * scalar;
7675 C(i+1UL,j+1UL) = value5 * scalar;
7676 C(i+1UL,j+2UL) = value6 * scalar;
7677 C(i+2UL,j ) = value7 * scalar;
7678 C(i+2UL,j+1UL) = value8 * scalar;
7679 C(i+2UL,j+2UL) = value9 * scalar;
// Branch that resets the elements of the tile instead of computing them.
7684 reset( C(i ,j+1UL) );
7685 reset( C(i ,j+2UL) );
7686 reset( C(i+1UL,j ) );
7687 reset( C(i+1UL,j+1UL) );
7688 reset( C(i+1UL,j+2UL) );
7689 reset( C(i+2UL,j ) );
7690 reset( C(i+2UL,j+1UL) );
7691 reset( C(i+2UL,j+2UL) );
// 3x2 tile: rows i..i+2, columns j..j+1.
7695 for( ; (j+2UL) <= jend; j+=2UL )
7697 const size_t kbegin( ( IsUpper_v<MT4> )
7700 const size_t kend( ( IsLower_v<MT4> )
7701 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
7702 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// SIMD path with six accumulators xmm1..xmm6.
7711 SIMDType a1( A.load(i ,k) );
7712 SIMDType a2( A.load(i+1UL,k) );
7713 SIMDType a3( A.load(i+2UL,k) );
7714 SIMDType b1( B.load(k,j ) );
7715 SIMDType b2( B.load(k,j+1UL) );
7716 SIMDType xmm1( a1 * b1 );
7717 SIMDType xmm2( a1 * b2 );
7718 SIMDType xmm3( a2 * b1 );
7719 SIMDType xmm4( a2 * b2 );
7720 SIMDType xmm5( a3 * b1 );
7721 SIMDType xmm6( a3 * b2 );
7725 a2 = A.load(i+1UL,k);
7726 a3 = A.load(i+2UL,k);
7728 b2 = B.load(k,j+1UL);
7737 C(i ,j ) =
sum( xmm1 ) * scalar;
7738 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
7739 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
7740 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
7741 C(i+2UL,j ) =
sum( xmm5 ) * scalar;
7742 C(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
// Scalar tail (unpadded operands only).
7744 for( ; remainder && k<kend; ++k ) {
7745 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
7746 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
7747 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
7748 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
7749 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
7750 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
// Pure scalar path for the 3x2 tile.
7762 for( ++k; k<kend; ++k ) {
7763 value1 += A(i ,k) * B(k,j );
7764 value2 += A(i ,k) * B(k,j+1UL);
7765 value3 += A(i+1UL,k) * B(k,j );
7766 value4 += A(i+1UL,k) * B(k,j+1UL);
7767 value5 += A(i+2UL,k) * B(k,j );
7768 value6 += A(i+2UL,k) * B(k,j+1UL);
7771 C(i ,j ) = value1 * scalar;
7772 C(i ,j+1UL) = value2 * scalar;
7773 C(i+1UL,j ) = value3 * scalar;
7774 C(i+1UL,j+1UL) = value4 * scalar;
7775 C(i+2UL,j ) = value5 * scalar;
7776 C(i+2UL,j+1UL) = value6 * scalar;
7781 reset( C(i ,j+1UL) );
7782 reset( C(i+1UL,j ) );
7783 reset( C(i+1UL,j+1UL) );
7784 reset( C(i+2UL,j ) );
7785 reset( C(i+2UL,j+1UL) );
// 3x1 tile: rows i..i+2 of a single remaining column j.
7791 const size_t kbegin( ( IsUpper_v<MT4> )
7794 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
// The column vector of B is loaded once per step and shared by the three rows.
7803 SIMDType b1( B.load(k,j) );
7804 SIMDType xmm1( A.load(i ,k) * b1 );
7805 SIMDType xmm2( A.load(i+1UL,k) * b1 );
7806 SIMDType xmm3( A.load(i+2UL,k) * b1 );
7810 xmm1 += A.load(i ,k) * b1;
7811 xmm2 += A.load(i+1UL,k) * b1;
7812 xmm3 += A.load(i+2UL,k) * b1;
7815 C(i ,j) =
sum( xmm1 ) * scalar;
7816 C(i+1UL,j) =
sum( xmm2 ) * scalar;
7817 C(i+2UL,j) =
sum( xmm3 ) * scalar;
// Scalar tail (unpadded operands only).
7819 for( ; remainder && k<kend; ++k ) {
7820 C(i ,j) += A(i ,k) * B(k,j) * scalar;
7821 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
7822 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
// Pure scalar path for the 3x1 tile.
7831 for( ++k; k<kend; ++k ) {
7832 value1 += A(i ,k) * B(k,j);
7833 value2 += A(i+1UL,k) * B(k,j);
7834 value3 += A(i+2UL,k) * B(k,j);
7837 C(i ,j) = value1 * scalar;
7838 C(i+1UL,j) = value2 * scalar;
7839 C(i+2UL,j) = value3 * scalar;
7844 reset( C(i+1UL,j) );
7845 reset( C(i+2UL,j) );
7854 reset( C(i+1UL,j) );
7855 reset( C(i+2UL,j) );
// ---- Strips of two rows (skipped entirely when both LOW and UPP are set) ----
7860 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
// For a lower result only the columns up to the end of the strip are computed.
7862 const size_t jend( LOW ? i+2UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
7867 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
7868 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
7874 reset( C(i+1UL,j) );
// 2x4 tile: rows i..i+1, columns j..j+3.
7878 for( ; (j+4UL) <= jend; j+=4UL )
7880 const size_t kbegin( ( IsUpper_v<MT4> )
7883 const size_t kend( ( IsLower_v<MT4> )
7884 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
7885 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
// SIMD path with eight accumulators xmm1..xmm8.
7894 SIMDType a1( A.load(i ,k) );
7895 SIMDType a2( A.load(i+1UL,k) );
7896 SIMDType b1( B.load(k,j ) );
7897 SIMDType b2( B.load(k,j+1UL) );
7898 SIMDType b3( B.load(k,j+2UL) );
7899 SIMDType b4( B.load(k,j+3UL) );
7900 SIMDType xmm1( a1 * b1 );
7901 SIMDType xmm2( a1 * b2 );
7902 SIMDType xmm3( a1 * b3 );
7903 SIMDType xmm4( a1 * b4 );
7904 SIMDType xmm5( a2 * b1 );
7905 SIMDType xmm6( a2 * b2 );
7906 SIMDType xmm7( a2 * b3 );
7907 SIMDType xmm8( a2 * b4 );
7911 a2 = A.load(i+1UL,k);
7913 b2 = B.load(k,j+1UL);
7914 b3 = B.load(k,j+2UL);
7915 b4 = B.load(k,j+3UL);
7926 C(i ,j ) =
sum( xmm1 ) * scalar;
7927 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
7928 C(i ,j+2UL) =
sum( xmm3 ) * scalar;
7929 C(i ,j+3UL) =
sum( xmm4 ) * scalar;
7930 C(i+1UL,j ) =
sum( xmm5 ) * scalar;
7931 C(i+1UL,j+1UL) =
sum( xmm6 ) * scalar;
7932 C(i+1UL,j+2UL) =
sum( xmm7 ) * scalar;
7933 C(i+1UL,j+3UL) =
sum( xmm8 ) * scalar;
// Scalar tail (unpadded operands only).
7935 for( ; remainder && k<kend; ++k ) {
7936 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
7937 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
7938 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
7939 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
7940 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
7941 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
7942 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
7943 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// Pure scalar path for the 2x4 tile.
7957 for( ++k; k<kend; ++k ) {
7958 value1 += A(i ,k) * B(k,j );
7959 value2 += A(i ,k) * B(k,j+1UL);
7960 value3 += A(i ,k) * B(k,j+2UL);
7961 value4 += A(i ,k) * B(k,j+3UL);
7962 value5 += A(i+1UL,k) * B(k,j );
7963 value6 += A(i+1UL,k) * B(k,j+1UL);
7964 value7 += A(i+1UL,k) * B(k,j+2UL);
7965 value8 += A(i+1UL,k) * B(k,j+3UL);
7968 C(i ,j ) = value1 * scalar;
7969 C(i ,j+1UL) = value2 * scalar;
7970 C(i ,j+2UL) = value3 * scalar;
7971 C(i ,j+3UL) = value4 * scalar;
7972 C(i+1UL,j ) = value5 * scalar;
7973 C(i+1UL,j+1UL) = value6 * scalar;
7974 C(i+1UL,j+2UL) = value7 * scalar;
7975 C(i+1UL,j+3UL) = value8 * scalar;
7980 reset( C(i ,j+1UL) );
7981 reset( C(i ,j+2UL) );
7982 reset( C(i ,j+3UL) );
7983 reset( C(i+1UL,j ) );
7984 reset( C(i+1UL,j+1UL) );
7985 reset( C(i+1UL,j+2UL) );
7986 reset( C(i+1UL,j+3UL) );
// 2x2 tile: rows i..i+1, columns j..j+1.
7990 for( ; (j+2UL) <= jend; j+=2UL )
7992 const size_t kbegin( ( IsUpper_v<MT4> )
7995 const size_t kend( ( IsLower_v<MT4> )
7996 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
7997 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// SIMD path with four accumulators xmm1..xmm4.
8006 SIMDType a1( A.load(i ,k) );
8007 SIMDType a2( A.load(i+1UL,k) );
8008 SIMDType b1( B.load(k,j ) );
8009 SIMDType b2( B.load(k,j+1UL) );
8010 SIMDType xmm1( a1 * b1 );
8011 SIMDType xmm2( a1 * b2 );
8012 SIMDType xmm3( a2 * b1 );
8013 SIMDType xmm4( a2 * b2 );
8017 a2 = A.load(i+1UL,k);
8019 b2 = B.load(k,j+1UL);
8026 C(i ,j ) =
sum( xmm1 ) * scalar;
8027 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
8028 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
8029 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// Scalar tail (unpadded operands only).
8031 for( ; remainder && k<kend; ++k ) {
8032 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8033 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8034 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8035 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// Pure scalar path for the 2x2 tile.
8045 for( ++k; k<kend; ++k ) {
8046 value1 += A(i ,k) * B(k,j );
8047 value2 += A(i ,k) * B(k,j+1UL);
8048 value3 += A(i+1UL,k) * B(k,j );
8049 value4 += A(i+1UL,k) * B(k,j+1UL);
8052 C(i ,j ) = value1 * scalar;
8053 C(i ,j+1UL) = value2 * scalar;
8054 C(i+1UL,j ) = value3 * scalar;
8055 C(i+1UL,j+1UL) = value4 * scalar;
8060 reset( C(i ,j+1UL) );
8061 reset( C(i+1UL,j ) );
8062 reset( C(i+1UL,j+1UL) );
// 2x1 tile: rows i..i+1 of a single remaining column j.
8068 const size_t kbegin( ( IsUpper_v<MT4> )
8071 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
8080 SIMDType b1( B.load(k,j) );
8081 SIMDType xmm1( A.load(i ,k) * b1 );
8082 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8086 xmm1 += A.load(i ,k) * b1;
8087 xmm2 += A.load(i+1UL,k) * b1;
8090 C(i ,j) =
sum( xmm1 ) * scalar;
8091 C(i+1UL,j) =
sum( xmm2 ) * scalar;
// Scalar tail (unpadded operands only).
8093 for( ; remainder && k<kend; ++k ) {
8094 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8095 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Pure scalar path for the 2x1 tile.
8103 for( ++k; k<kend; ++k ) {
8104 value1 += A(i ,k) * B(k,j);
8105 value2 += A(i+1UL,k) * B(k,j);
8108 C(i ,j) = value1 * scalar;
8109 C(i+1UL,j) = value2 * scalar;
8114 reset( C(i+1UL,j) );
8123 reset( C(i+1UL,j) );
// ---- Remaining single row i ----
// For a lower result only the columns up to and including the diagonal are computed.
8130 const size_t jend( LOW ? i+1UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
8135 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// 1x4 tile: row i, columns j..j+3 (skipped when both LOW and UPP are set).
8144 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
8146 const size_t kbegin( ( IsUpper_v<MT4> )
8149 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
// The row vector of A is loaded once per step and shared by the four columns.
8158 SIMDType a1( A.load(i,k) );
8159 SIMDType xmm1( a1 * B.load(k,j ) );
8160 SIMDType xmm2( a1 * B.load(k,j+1UL) );
8161 SIMDType xmm3( a1 * B.load(k,j+2UL) );
8162 SIMDType xmm4( a1 * B.load(k,j+3UL) );
8166 xmm1 += a1 * B.load(k,j );
8167 xmm2 += a1 * B.load(k,j+1UL);
8168 xmm3 += a1 * B.load(k,j+2UL);
8169 xmm4 += a1 * B.load(k,j+3UL);
8172 C(i,j ) =
sum( xmm1 ) * scalar;
8173 C(i,j+1UL) =
sum( xmm2 ) * scalar;
8174 C(i,j+2UL) =
sum( xmm3 ) * scalar;
8175 C(i,j+3UL) =
sum( xmm4 ) * scalar;
// Scalar tail (unpadded operands only).
8177 for( ; remainder && k<kend; ++k ) {
8178 C(i,j ) += A(i,k) * B(k,j ) * scalar;
8179 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
8180 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
8181 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// Pure scalar path for the 1x4 tile.
8191 for( ++k; k<kend; ++k ) {
8192 value1 += A(i,k) * B(k,j );
8193 value2 += A(i,k) * B(k,j+1UL);
8194 value3 += A(i,k) * B(k,j+2UL);
8195 value4 += A(i,k) * B(k,j+3UL);
8198 C(i,j ) = value1 * scalar;
8199 C(i,j+1UL) = value2 * scalar;
8200 C(i,j+2UL) = value3 * scalar;
8201 C(i,j+3UL) = value4 * scalar;
8206 reset( C(i,j+1UL) );
8207 reset( C(i,j+2UL) );
8208 reset( C(i,j+3UL) );
// 1x2 tile: row i, columns j..j+1 (skipped when both LOW and UPP are set).
8212 for( ; !( LOW && UPP ) && (j+2UL) <= jend; j+=2UL )
8214 const size_t kbegin( ( IsUpper_v<MT4> )
8217 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
8226 SIMDType a1( A.load(i,k) );
8227 SIMDType xmm1( a1 * B.load(k,j ) );
8228 SIMDType xmm2( a1 * B.load(k,j+1UL) );
8232 xmm1 += a1 * B.load(k,j );
8233 xmm2 += a1 * B.load(k,j+1UL);
8236 C(i,j ) =
sum( xmm1 ) * scalar;
8237 C(i,j+1UL) =
sum( xmm2 ) * scalar;
// Scalar tail (unpadded operands only).
8239 for( ; remainder && k<kend; ++k ) {
8240 C(i,j ) += A(i,k) * B(k,j ) * scalar;
8241 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Pure scalar path for the 1x2 tile.
8249 for( ++k; k<kend; ++k ) {
8250 value1 += A(i,k) * B(k,j );
8251 value2 += A(i,k) * B(k,j+1UL);
8254 C(i,j ) = value1 * scalar;
8255 C(i,j+1UL) = value2 * scalar;
8260 reset( C(i,j+1UL) );
// Single element C(i,j); here the k loops run up to K.
8266 const size_t kbegin( ( IsUpper_v<MT4> )
8277 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
8280 xmm1 += A.load(i,k) * B.load(k,j);
8283 C(i,j) =
sum( xmm1 ) * scalar;
// Scalar tail (unpadded operands only).
8285 for( ; remainder && k<K; ++k ) {
8286 C(i,j) += A(i,k) * B(k,j) * scalar;
// Pure scalar path for the single element.
8293 for( ++k; k<K; ++k ) {
8294 value += A(i,k) * B(k,j);
8297 C(i,j) = value * scalar;
// Small-matrix assignment kernel, vectorized overload for a column-major target C
// (enabled when UseVectorizedDefaultKernel_v holds). C is computed in register tiles:
// strips of 4 rows, then 3 rows, then 2 rows, then single rows, each strip being split
// into column groups of decreasing width. Within a tile every element is the dot
// product of a row of A and a column of B along k, either accumulated in SIMD
// registers (xmm*) and reduced with sum(), or accumulated in scalar temporaries
// (value*); in both cases the result is multiplied by 'scalar'.
8331 template<
typename MT3
8335 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8336 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// 'remainder' is true if at least one operand is not padded; only then do the scalar
// tail loops ('for( ; remainder && k<kend; ++k )') after the SIMD part execute.
8338 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
8340 const size_t M( A.rows() );
8341 const size_t N( B.columns() );
8342 const size_t K( A.columns() );
// ---- Strips of four rows (skipped entirely when both LOW and UPP are set) ----
8348 for( ; !( LOW && UPP ) && (i+4UL) <= M; i+=4UL )
// For a lower result only the columns up to the end of the strip are computed.
8350 const size_t jend( LOW ? i+4UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
8355 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
8356 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
8357 C(i+2UL,j) = HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
8358 C(i+3UL,j) = HERM ?
conj( C(j,i+3UL) ) : C(j,i+3UL);
8364 reset( C(i+1UL,j) );
8365 reset( C(i+2UL,j) );
8366 reset( C(i+3UL,j) );
// 4x2 tile: rows i..i+3, columns j..j+1.
8370 for( ; (j+2UL) <= jend; j+=2UL )
// k-range of the tile; the upper bound follows from a lower A and/or an upper B.
8372 const size_t kbegin( ( IsUpper_v<MT4> )
8375 const size_t kend( ( IsLower_v<MT4> )
8376 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
8377 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// SIMD path: four row vectors of A times two column vectors of B give eight
// accumulators xmm1..xmm8.
8386 SIMDType a1( A.load(i ,k) );
8387 SIMDType a2( A.load(i+1UL,k) );
8388 SIMDType a3( A.load(i+2UL,k) );
8389 SIMDType a4( A.load(i+3UL,k) );
8390 SIMDType b1( B.load(k,j ) );
8391 SIMDType b2( B.load(k,j+1UL) );
8392 SIMDType xmm1( a1 * b1 );
8393 SIMDType xmm2( a1 * b2 );
8394 SIMDType xmm3( a2 * b1 );
8395 SIMDType xmm4( a2 * b2 );
8396 SIMDType xmm5( a3 * b1 );
8397 SIMDType xmm6( a3 * b2 );
8398 SIMDType xmm7( a4 * b1 );
8399 SIMDType xmm8( a4 * b2 );
// Operands are reloaded for the following SIMD steps along k.
8403 a2 = A.load(i+1UL,k);
8404 a3 = A.load(i+2UL,k);
8405 a4 = A.load(i+3UL,k);
8407 b2 = B.load(k,j+1UL);
// Horizontal reduction of each accumulator, scaled, stored into the tile.
8418 C(i ,j ) =
sum( xmm1 ) * scalar;
8419 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
8420 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
8421 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
8422 C(i+2UL,j ) =
sum( xmm5 ) * scalar;
8423 C(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
8424 C(i+3UL,j ) =
sum( xmm7 ) * scalar;
8425 C(i+3UL,j+1UL) =
sum( xmm8 ) * scalar;
// Scalar tail for the k values left over by the SIMD part (unpadded operands only).
8427 for( ; remainder && k<kend; ++k ) {
8428 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8429 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8430 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8431 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8432 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
8433 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
8434 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
8435 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// Pure scalar path: accumulate into value1..value8 and scale once at the end.
8449 for( ++k; k<kend; ++k ) {
8450 value1 += A(i ,k) * B(k,j );
8451 value2 += A(i ,k) * B(k,j+1UL);
8452 value3 += A(i+1UL,k) * B(k,j );
8453 value4 += A(i+1UL,k) * B(k,j+1UL);
8454 value5 += A(i+2UL,k) * B(k,j );
8455 value6 += A(i+2UL,k) * B(k,j+1UL);
8456 value7 += A(i+3UL,k) * B(k,j );
8457 value8 += A(i+3UL,k) * B(k,j+1UL);
8460 C(i ,j ) = value1 * scalar;
8461 C(i ,j+1UL) = value2 * scalar;
8462 C(i+1UL,j ) = value3 * scalar;
8463 C(i+1UL,j+1UL) = value4 * scalar;
8464 C(i+2UL,j ) = value5 * scalar;
8465 C(i+2UL,j+1UL) = value6 * scalar;
8466 C(i+3UL,j ) = value7 * scalar;
8467 C(i+3UL,j+1UL) = value8 * scalar;
// Branch that resets the elements of the tile instead of computing them.
8472 reset( C(i ,j+1UL) );
8473 reset( C(i+1UL,j ) );
8474 reset( C(i+1UL,j+1UL) );
8475 reset( C(i+2UL,j ) );
8476 reset( C(i+2UL,j+1UL) );
8477 reset( C(i+3UL,j ) );
8478 reset( C(i+3UL,j+1UL) );
// 4x1 tile: rows i..i+3 of a single remaining column j.
8484 const size_t kbegin( ( IsUpper_v<MT4> )
8487 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
// The column vector of B is loaded once per step and shared by the four rows.
8496 SIMDType b1( B.load(k,j) );
8497 SIMDType xmm1( A.load(i ,k) * b1 );
8498 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8499 SIMDType xmm3( A.load(i+2UL,k) * b1 );
8500 SIMDType xmm4( A.load(i+3UL,k) * b1 );
8504 xmm1 += A.load(i ,k) * b1;
8505 xmm2 += A.load(i+1UL,k) * b1;
8506 xmm3 += A.load(i+2UL,k) * b1;
8507 xmm4 += A.load(i+3UL,k) * b1;
8510 C(i ,j) =
sum( xmm1 ) * scalar;
8511 C(i+1UL,j) =
sum( xmm2 ) * scalar;
8512 C(i+2UL,j) =
sum( xmm3 ) * scalar;
8513 C(i+3UL,j) =
sum( xmm4 ) * scalar;
// Scalar tail (unpadded operands only).
8515 for( ; remainder && k<kend; ++k ) {
8516 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8517 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
8518 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
8519 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// Pure scalar path for the 4x1 tile.
8529 for( ++k; k<kend; ++k ) {
8530 value1 += A(i ,k) * B(k,j);
8531 value2 += A(i+1UL,k) * B(k,j);
8532 value3 += A(i+2UL,k) * B(k,j);
8533 value4 += A(i+3UL,k) * B(k,j);
8536 C(i ,j) = value1 * scalar;
8537 C(i+1UL,j) = value2 * scalar;
8538 C(i+2UL,j) = value3 * scalar;
8539 C(i+3UL,j) = value4 * scalar;
8544 reset( C(i+1UL,j) );
8545 reset( C(i+2UL,j) );
8546 reset( C(i+3UL,j) );
8555 reset( C(i+1UL,j) );
8556 reset( C(i+2UL,j) );
8557 reset( C(i+3UL,j) );
// ---- Strips of three rows (skipped entirely when both LOW and UPP are set) ----
8562 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
// For a lower result only the columns up to the end of the strip are computed.
8564 const size_t jend( LOW ? i+3UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
8569 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
8570 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
8571 C(i+2UL,j) = HERM ?
conj( C(j,i+2UL) ) : C(j,i+2UL);
8577 reset( C(i+1UL,j) );
8578 reset( C(i+2UL,j) );
// 3x3 tile: rows i..i+2, columns j..j+2.
8582 for( ; (j+3UL) <= jend; j+=3UL )
8584 const size_t kbegin( ( IsUpper_v<MT4> )
8587 const size_t kend( ( IsLower_v<MT4> )
8588 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
8589 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
// SIMD path with nine accumulators xmm1..xmm9.
8598 SIMDType a1( A.load(i ,k) );
8599 SIMDType a2( A.load(i+1UL,k) );
8600 SIMDType a3( A.load(i+2UL,k) );
8601 SIMDType b1( B.load(k,j ) );
8602 SIMDType b2( B.load(k,j+1UL) );
8603 SIMDType b3( B.load(k,j+2UL) );
8604 SIMDType xmm1( a1 * b1 );
8605 SIMDType xmm2( a1 * b2 );
8606 SIMDType xmm3( a1 * b3 );
8607 SIMDType xmm4( a2 * b1 );
8608 SIMDType xmm5( a2 * b2 );
8609 SIMDType xmm6( a2 * b3 );
8610 SIMDType xmm7( a3 * b1 );
8611 SIMDType xmm8( a3 * b2 );
8612 SIMDType xmm9( a3 * b3 );
8616 a2 = A.load(i+1UL,k);
8617 a3 = A.load(i+2UL,k);
8619 b2 = B.load(k,j+1UL);
8620 b3 = B.load(k,j+2UL);
8632 C(i ,j ) =
sum( xmm1 ) * scalar;
8633 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
8634 C(i ,j+2UL) =
sum( xmm3 ) * scalar;
8635 C(i+1UL,j ) =
sum( xmm4 ) * scalar;
8636 C(i+1UL,j+1UL) =
sum( xmm5 ) * scalar;
8637 C(i+1UL,j+2UL) =
sum( xmm6 ) * scalar;
8638 C(i+2UL,j ) =
sum( xmm7 ) * scalar;
8639 C(i+2UL,j+1UL) =
sum( xmm8 ) * scalar;
8640 C(i+2UL,j+2UL) =
sum( xmm9 ) * scalar;
// Scalar tail (unpadded operands only).
8642 for( ; remainder && k<kend; ++k ) {
8643 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8644 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8645 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
8646 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8647 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8648 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
8649 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
8650 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
8651 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
// Pure scalar path for the 3x3 tile.
8666 for( ++k; k<kend; ++k ) {
8667 value1 += A(i ,k) * B(k,j );
8668 value2 += A(i ,k) * B(k,j+1UL);
8669 value3 += A(i ,k) * B(k,j+2UL);
8670 value4 += A(i+1UL,k) * B(k,j );
8671 value5 += A(i+1UL,k) * B(k,j+1UL);
8672 value6 += A(i+1UL,k) * B(k,j+2UL);
8673 value7 += A(i+2UL,k) * B(k,j );
8674 value8 += A(i+2UL,k) * B(k,j+1UL);
8675 value9 += A(i+2UL,k) * B(k,j+2UL);
8678 C(i ,j ) = value1 * scalar;
8679 C(i ,j+1UL) = value2 * scalar;
8680 C(i ,j+2UL) = value3 * scalar;
8681 C(i+1UL,j ) = value4 * scalar;
8682 C(i+1UL,j+1UL) = value5 * scalar;
8683 C(i+1UL,j+2UL) = value6 * scalar;
8684 C(i+2UL,j ) = value7 * scalar;
8685 C(i+2UL,j+1UL) = value8 * scalar;
8686 C(i+2UL,j+2UL) = value9 * scalar;
8691 reset( C(i ,j+1UL) );
8692 reset( C(i ,j+2UL) );
8693 reset( C(i+1UL,j ) );
8694 reset( C(i+1UL,j+1UL) );
8695 reset( C(i+1UL,j+2UL) );
8696 reset( C(i+2UL,j ) );
8697 reset( C(i+2UL,j+1UL) );
8698 reset( C(i+2UL,j+2UL) );
// 3x2 tile: rows i..i+2, columns j..j+1.
8702 for( ; (j+2UL) <= jend; j+=2UL )
8704 const size_t kbegin( ( IsUpper_v<MT4> )
8707 const size_t kend( ( IsLower_v<MT4> )
8708 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
8709 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// SIMD path with six accumulators xmm1..xmm6.
8718 SIMDType a1( A.load(i ,k) );
8719 SIMDType a2( A.load(i+1UL,k) );
8720 SIMDType a3( A.load(i+2UL,k) );
8721 SIMDType b1( B.load(k,j ) );
8722 SIMDType b2( B.load(k,j+1UL) );
8723 SIMDType xmm1( a1 * b1 );
8724 SIMDType xmm2( a1 * b2 );
8725 SIMDType xmm3( a2 * b1 );
8726 SIMDType xmm4( a2 * b2 );
8727 SIMDType xmm5( a3 * b1 );
8728 SIMDType xmm6( a3 * b2 );
8732 a2 = A.load(i+1UL,k);
8733 a3 = A.load(i+2UL,k);
8735 b2 = B.load(k,j+1UL);
8744 C(i ,j ) =
sum( xmm1 ) * scalar;
8745 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
8746 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
8747 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
8748 C(i+2UL,j ) =
sum( xmm5 ) * scalar;
8749 C(i+2UL,j+1UL) =
sum( xmm6 ) * scalar;
// Scalar tail (unpadded operands only).
8751 for( ; remainder && k<kend; ++k ) {
8752 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8753 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8754 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8755 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
8756 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
8757 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
// Pure scalar path for the 3x2 tile.
8769 for( ++k; k<kend; ++k ) {
8770 value1 += A(i ,k) * B(k,j );
8771 value2 += A(i ,k) * B(k,j+1UL);
8772 value3 += A(i+1UL,k) * B(k,j );
8773 value4 += A(i+1UL,k) * B(k,j+1UL);
8774 value5 += A(i+2UL,k) * B(k,j );
8775 value6 += A(i+2UL,k) * B(k,j+1UL);
8778 C(i ,j ) = value1 * scalar;
8779 C(i ,j+1UL) = value2 * scalar;
8780 C(i+1UL,j ) = value3 * scalar;
8781 C(i+1UL,j+1UL) = value4 * scalar;
8782 C(i+2UL,j ) = value5 * scalar;
8783 C(i+2UL,j+1UL) = value6 * scalar;
8788 reset( C(i ,j+1UL) );
8789 reset( C(i+1UL,j ) );
8790 reset( C(i+1UL,j+1UL) );
8791 reset( C(i+2UL,j ) );
8792 reset( C(i+2UL,j+1UL) );
// 3x1 tile: rows i..i+2 of a single remaining column j.
8798 const size_t kbegin( ( IsUpper_v<MT4> )
8801 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
8810 SIMDType b1( B.load(k,j) );
8811 SIMDType xmm1( A.load(i ,k) * b1 );
8812 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8813 SIMDType xmm3( A.load(i+2UL,k) * b1 );
8817 xmm1 += A.load(i ,k) * b1;
8818 xmm2 += A.load(i+1UL,k) * b1;
8819 xmm3 += A.load(i+2UL,k) * b1;
8822 C(i ,j) =
sum( xmm1 ) * scalar;
8823 C(i+1UL,j) =
sum( xmm2 ) * scalar;
8824 C(i+2UL,j) =
sum( xmm3 ) * scalar;
// Scalar tail (unpadded operands only).
8826 for( ; remainder && k<kend; ++k ) {
8827 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8828 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
8829 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
// Pure scalar path for the 3x1 tile.
8838 for( ++k; k<kend; ++k ) {
8839 value1 += A(i ,k) * B(k,j);
8840 value2 += A(i+1UL,k) * B(k,j);
8841 value3 += A(i+2UL,k) * B(k,j);
8844 C(i ,j) = value1 * scalar;
8845 C(i+1UL,j) = value2 * scalar;
8846 C(i+2UL,j) = value3 * scalar;
8851 reset( C(i+1UL,j) );
8852 reset( C(i+2UL,j) );
8861 reset( C(i+1UL,j) );
8862 reset( C(i+2UL,j) );
// ---- Strips of two rows ----
// NOTE(review): unlike the 4- and 3-row strips above, this loop condition carries no
// '!( LOW && UPP )' guard - confirm that this is intended.
8867 for( ; (i+2UL) <= M; i+=2UL )
// For a lower result only the columns up to the end of the strip are computed.
8869 const size_t jend( LOW ? i+2UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
8874 C(i ,j) = HERM ?
conj( C(j,i ) ) : C(j,i );
8875 C(i+1UL,j) = HERM ?
conj( C(j,i+1UL) ) : C(j,i+1UL);
8881 reset( C(i+1UL,j) );
// 2x2 tile: rows i..i+1, columns j..j+1.
8885 for( ; (j+2UL) <= jend; j+=2UL )
8887 const size_t kbegin( ( IsUpper_v<MT4> )
8890 const size_t kend( ( IsLower_v<MT4> )
8891 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
8892 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// SIMD path with four accumulators xmm1..xmm4.
8901 SIMDType a1( A.load(i ,k) );
8902 SIMDType a2( A.load(i+1UL,k) );
8903 SIMDType b1( B.load(k,j ) );
8904 SIMDType b2( B.load(k,j+1UL) );
8905 SIMDType xmm1( a1 * b1 );
8906 SIMDType xmm2( a1 * b2 );
8907 SIMDType xmm3( a2 * b1 );
8908 SIMDType xmm4( a2 * b2 );
8912 a2 = A.load(i+1UL,k);
8914 b2 = B.load(k,j+1UL);
8921 C(i ,j ) =
sum( xmm1 ) * scalar;
8922 C(i ,j+1UL) =
sum( xmm2 ) * scalar;
8923 C(i+1UL,j ) =
sum( xmm3 ) * scalar;
8924 C(i+1UL,j+1UL) =
sum( xmm4 ) * scalar;
// Scalar tail (unpadded operands only).
8926 for( ; remainder && k<kend; ++k ) {
8927 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
8928 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
8929 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
8930 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// Pure scalar path for the 2x2 tile.
8940 for( ++k; k<kend; ++k ) {
8941 value1 += A(i ,k) * B(k,j );
8942 value2 += A(i ,k) * B(k,j+1UL);
8943 value3 += A(i+1UL,k) * B(k,j );
8944 value4 += A(i+1UL,k) * B(k,j+1UL);
8947 C(i ,j ) = value1 * scalar;
8948 C(i ,j+1UL) = value2 * scalar;
8949 C(i+1UL,j ) = value3 * scalar;
8950 C(i+1UL,j+1UL) = value4 * scalar;
8955 reset( C(i ,j+1UL) );
8956 reset( C(i+1UL,j ) );
8957 reset( C(i+1UL,j+1UL) );
// 2x1 tile: rows i..i+1 of a single remaining column j.
8963 const size_t kbegin( ( IsUpper_v<MT4> )
8966 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
8975 SIMDType b1( B.load(k,j) );
8976 SIMDType xmm1( A.load(i ,k) * b1 );
8977 SIMDType xmm2( A.load(i+1UL,k) * b1 );
8981 xmm1 += A.load(i ,k) * b1;
8982 xmm2 += A.load(i+1UL,k) * b1;
8985 C(i ,j) =
sum( xmm1 ) * scalar;
8986 C(i+1UL,j) =
sum( xmm2 ) * scalar;
// Scalar tail (unpadded operands only).
8988 for( ; remainder && k<kend; ++k ) {
8989 C(i ,j) += A(i ,k) * B(k,j) * scalar;
8990 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Pure scalar path for the 2x1 tile.
8998 for( ++k; k<kend; ++k ) {
8999 value1 += A(i ,k) * B(k,j);
9000 value2 += A(i+1UL,k) * B(k,j);
9003 C(i ,j) = value1 * scalar;
9004 C(i+1UL,j) = value2 * scalar;
9009 reset( C(i+1UL,j) );
9018 reset( C(i+1UL,j) );
// ---- Remaining single row i ----
// For a lower result only the columns up to and including the diagonal are computed.
9025 const size_t jend( LOW ? i+1UL : N );
// Elements taken from the transposed position of C (conjugated if Hermitian).
9030 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// 1x2 tile: row i, columns j..j+1.
9039 for( ; (j+2UL) <= jend; j+=2UL )
9041 const size_t kbegin( ( IsUpper_v<MT4> )
9044 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
// The row vector of A is loaded once per step and shared by the two columns.
9053 SIMDType a1( A.load(i,k) );
9054 SIMDType xmm1( a1 * B.load(k,j ) );
9055 SIMDType xmm2( a1 * B.load(k,j+1UL) );
9059 xmm1 += a1 * B.load(k,j );
9060 xmm2 += a1 * B.load(k,j+1UL);
9063 C(i,j ) =
sum( xmm1 ) * scalar;
9064 C(i,j+1UL) =
sum( xmm2 ) * scalar;
// Scalar tail (unpadded operands only).
9066 for( ; remainder && k<kend; ++k ) {
9067 C(i,j ) += A(i,k) * B(k,j ) * scalar;
9068 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Pure scalar path for the 1x2 tile.
9076 for( ++k; k<kend; ++k ) {
9077 value1 += A(i,k) * B(k,j );
9078 value2 += A(i,k) * B(k,j+1UL);
9081 C(i,j ) = value1 * scalar;
9082 C(i,j+1UL) = value2 * scalar;
9087 reset( C(i,j+1UL) );
// Single element C(i,j); here the k loops run up to K.
9093 const size_t kbegin( ( IsUpper_v<MT4> )
9104 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
9107 xmm1 += A.load(i,k) * B.load(k,j);
9110 C(i,j) =
sum( xmm1 ) * scalar;
// Scalar tail (unpadded operands only).
9112 for( ; remainder && k<K; ++k ) {
9113 C(i,j) += A(i,k) * B(k,j) * scalar;
// Pure scalar path for the single element.
9120 for( ++k; k<K; ++k ) {
9121 value += A(i,k) * B(k,j);
9124 C(i,j) = value * scalar;
// Large-matrix assignment kernel, fallback overload: it is enabled whenever the
// vectorized default kernel is NOT applicable for the given types (DisableIf_t on
// UseVectorizedDefaultKernel_v) and simply forwards to the default kernel.
9157 template<
typename MT3
9161 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9162 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9164 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, vectorized overload.
// Enabled (EnableIf_t) when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// The body calls one of five multiplication routines; the conditions choosing between
// them sit on lines that are not visible in this excerpt.
// NOTE(review): presumably the choice follows the SYM/HERM/LOW/UPP flags - confirm.
9183 template<
typename MT3
9187 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9188 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// smmm and hmmm take only the scaling factor.
9191 smmm( C, A, B, scalar );
9193 hmmm( C, A, B, scalar );
// lmmm, ummm and mmm take an extra trailing ST2(0). The add-assign counterpart in this
// file passes ST2(1) in the same position, so this is presumably the factor applied to
// the previous contents of C - TODO confirm against the routine declarations.
9195 lmmm( C, A, B, scalar, ST2(0) );
9197 ummm( C, A, B, scalar, ST2(0) );
9199 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel, fallback overload.
// Viable only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t); in that case
// the request is forwarded unchanged to the large-matrix assignment kernel.
9217 template<
typename MT3
9221 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9222 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// No BLAS path for these types: fall through to the large kernel.
9224 selectLargeAssignKernel( C, A, B, scalar );
9229#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS assignment kernel, BLAS-backed overload.
// Enabled (EnableIf_t) when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. Three cases:
//   - A (MT4) triangular: trmm with A applied from the left,
//   - B (MT5) triangular: trmm with B applied from the right,
//   - otherwise:          gemm.
// The scaling factor is always converted to the element type of C before the call.
9243 template<
typename MT3
9247 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9248 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// ET: element type of the target matrix; scalar is cast to it below.
9250 using ET = ElementType_t<MT3>;
9252 if( IsTriangular_v<MT4> ) {
// NOTE(review): trmm works on C in place; the statement that first fills C with the
// other operand is on a line not visible here - confirm.
// The triangle selector follows IsLower_v<MT4>.
9254 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
9256 else if( IsTriangular_v<MT5> ) {
// Mirror case: B is the triangular factor, applied from the right; the triangle
// selector follows IsLower_v<MT5>.
9258 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm with factors ET(scalar) and ET(0).
// NOTE(review): ET(0) is presumably the factor on the old contents of C - confirm.
9261 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment through an evaluated temporary.
// NOTE(review): the function signature line is not visible in this excerpt; the body
// ends in assign( *lhs, ... ), so this is presumably an assign() overload - confirm.
9279 template<
typename MT
// Type of the temporary: OppositeType when SO is true, ResultType otherwise
// (assuming If_t behaves like std::conditional_t - TODO confirm).
9285 using TmpType = If_t< SO, OppositeType, ResultType >;
// Functor applied to the temporary before it is assigned.
9297 const ForwardFunctor fwd;
// Evaluate the right-hand side expression into the temporary, wrapped in serial().
9299 const TmpType tmp(
serial( rhs ) );
// Assign the forwarded temporary to the dereferenced left-hand side.
9300 assign( *lhs, fwd( tmp ) );
// Addition assignment of a scaled matrix product to a dense matrix.
// lhs: target dense matrix; rhs: the scalar multiplication expression, read through
//      rhs.matrix_ (the product) and rhs.scalar_ (the factor).
9316 template<
typename MT
9318 friend inline void addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
// Pull the two operands out of the wrapped multiplication expression.
9325 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9326 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: no target rows, no target columns, or an inner dimension of zero.
// NOTE(review): the body of this branch is not visible; presumably an early return.
9328 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// A and B are defined on lines not visible here (presumably evaluated forms of
// left/right - confirm). The kernel receives the target, both operands and the factor.
9342 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the scaled addition assignment C += A * B * scalar.
// The small kernel is used when either operand type is diagonal or when the number of
// elements of C (rows * columns) is below DMATTDMATMULT_THRESHOLD. The other call is
// the BLAS kernel; the `else` joining the two is on a line not visible in this excerpt.
9357 template<
typename MT3
9361 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// Compile-time diagonal check first, then the run-time size check.
9363 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
9364 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
9365 selectSmallAddAssignKernel( C, A, B, scalar );
// Alternative path (presumably the else branch): BLAS kernel.
9367 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel, general case.
// Enabled only when neither operand type is diagonal.
9385 template<
typename MT3
9389 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9390 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// tmp is declared on a line not visible in this excerpt.
// NOTE(review): presumably the evaluated scaled product of A and B - confirm.
9393 addAssign( C, tmp );
// Default addition assignment kernel: row-major C, general A, diagonal B.
// With a diagonal B only B(j,j) contributes, so each update is
//    C(i,j) += A(i,j) * B(j,j) * scalar.
// Rows are walked one by one and the columns of each row are handled in pairs.
9411 template<
typename MT3
9415 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9416 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9418 const size_t M( A.rows() );
9419 const size_t N( B.columns() );
9421 for(
size_t i=0UL; i<M; ++i )
// First column to visit in row i when A is upper: i, or i+1 for a strictly upper A.
// The alternative for a non-upper A is on a line not visible here.
9423 const size_t jbegin( ( IsUpper_v<MT4> )
9424 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
// One past the last column when A is lower: i+1, or i for a strictly lower A.
// The alternative for a non-lower A is on a line not visible here.
9426 const size_t jend( ( IsLower_v<MT4> )
9427 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Split the column range into a part handled two columns at a time and a tail.
// NOTE(review): prevMultiple( jnum, 2UL ) presumably rounds jnum down to an even
// count, which is what makes the j+1 access below safe - confirm.
9431 const size_t jnum( jend - jbegin );
9432 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Two columns per iteration.
9435 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
9436 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
9437 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Tail column at jpos; the condition guarding it is on a line not visible here.
9440 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition assignment kernel: column-major C, general A, diagonal B.
// The update is again C(i,j) += A(i,j) * B(j,j) * scalar, but the index space is
// traversed in tiles of BLOCK_SIZE x BLOCK_SIZE (columns outermost).
9460 template<
typename MT3
9464 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9465 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
9467 constexpr size_t block( BLOCK_SIZE );
9469 const size_t M( A.rows() );
9470 const size_t N( B.columns() );
// Column tiles [jj, jend), clamped to N.
9472 for(
size_t jj=0UL; jj<N; jj+=block ) {
9473 const size_t jend(
min( N, jj+block ) );
// Row tiles [ii, iend), clamped to M.
9474 for(
size_t ii=0UL; ii<M; ii+=block ) {
9475 const size_t iend(
min( M, ii+block ) );
9476 for(
size_t j=jj; j<jend; ++j )
// First row of column j inside the tile when A is lower: the larger of the tile
// start and j (j+1 for a strictly lower A). The non-lower alternative is not visible.
9478 const size_t ibegin( ( IsLower_v<MT4> )
9479 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
// One past the last row when A is upper: the smaller of the tile end and j+1
// (j for a strictly upper A). The non-upper alternative is not visible.
9481 const size_t ipos( ( IsUpper_v<MT4> )
9482 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
// Walk down column j within the tile.
9485 for(
size_t i=ibegin; i<ipos; ++i ) {
9486 C(i,j) += A(i,j) * B(j,j) * scalar;
// Default addition assignment kernel: row-major C, diagonal A, general B.
// With a diagonal A only A(i,i) contributes, so each update is
//    C(i,j) += A(i,i) * B(i,j) * scalar.
// The index space is traversed in tiles of BLOCK_SIZE x BLOCK_SIZE (rows outermost).
9508 template<
typename MT3
9512 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9513 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9515 constexpr size_t block( BLOCK_SIZE );
9517 const size_t M( A.rows() );
9518 const size_t N( B.columns() );
// Row tiles [ii, iend), clamped to M.
9520 for(
size_t ii=0UL; ii<M; ii+=block ) {
9521 const size_t iend(
min( M, ii+block ) );
// Column tiles [jj, jend), clamped to N.
9522 for(
size_t jj=0UL; jj<N; jj+=block ) {
9523 const size_t jend(
min( N, jj+block ) );
9524 for(
size_t i=ii; i<iend; ++i )
// First column of row i inside the tile when B is upper: the larger of the tile
// start and i (i+1 for a strictly upper B). The non-upper alternative is not visible.
9526 const size_t jbegin( ( IsUpper_v<MT5> )
9527 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
// One past the last column when B is lower: the smaller of the tile end and i+1
// (i for a strictly lower B). The non-lower alternative is not visible.
9529 const size_t jpos( ( IsLower_v<MT5> )
9530 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
// Walk along row i within the tile.
9533 for(
size_t j=jbegin; j<jpos; ++j ) {
9534 C(i,j) += A(i,i) * B(i,j) * scalar;
// Default addition assignment kernel: column-major C, diagonal A, general B.
// Each update is C(i,j) += A(i,i) * B(i,j) * scalar. Columns are walked one by one and
// the rows of each column are handled in pairs.
9556 template<
typename MT3
9560 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9561 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
9563 const size_t M( A.rows() );
9564 const size_t N( B.columns() );
9566 for(
size_t j=0UL; j<N; ++j )
// First row to visit in column j when B is lower: j, or j+1 for a strictly lower B.
// The alternative for a non-lower B is on a line not visible here.
9568 const size_t ibegin( ( IsLower_v<MT5> )
9569 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
// One past the last row when B is upper: j+1, or j for a strictly upper B.
// The alternative for a non-upper B is on a line not visible here.
9571 const size_t iend( ( IsUpper_v<MT5> )
9572 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Split the row range into a part handled two rows at a time and a tail.
// NOTE(review): prevMultiple( inum, 2UL ) presumably rounds inum down to an even
// count, which is what makes the i+1 access below safe - confirm.
9576 const size_t inum( iend - ibegin );
9577 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Two rows per iteration.
9580 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
9581 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
9582 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Tail row at ipos; the condition guarding it is on a line not visible here.
9585 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition assignment kernel: both operands diagonal.
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar for every
// row index i of A.
9605 template<
typename MT3
9609 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9610 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// A.rows() is re-read in the loop condition on every iteration.
9612 for(
size_t i=0UL; i<A.rows(); ++i ) {
9613 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel, fallback overload.
// Viable only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t);
// the work is passed on unchanged to the default addition assignment kernel.
9632 template<
typename MT3
9636 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9637 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// No vectorized path for these types: use the default kernel.
9639 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix addition assignment kernel, vectorized, row-major target.
// Enabled when C is row-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//
// Structure visible in this excerpt:
//   - rows of C are consumed in strips of 3, then strips of 2, then single rows;
//   - inside a strip the columns are consumed in groups (3/2/1, 4/2/1 and 4/2/1);
//   - every C element of a tile has its own SIMD accumulator (xmmN) that collects
//     A.load(row,k) * B.load(k,col) along k; sum( xmmN ) * scalar is then added to C;
//   - a scalar loop over the remaining k runs only when `remainder` is true;
//   - an alternative scalar path accumulates into valueN and adds valueN * scalar.
// Several loop headers, the reload of a1/b1, the accumulation statements and the
// initialisation of the valueN variables sit on lines not visible in this excerpt.
9658 template<
typename MT3
9662 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9663 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder: true as soon as one of the operand types is not padded; it gates the
// element-wise clean-up loops below.
9665 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
// M x N is the shape walked in C, K the inner dimension.
9667 const size_t M( A.rows() );
9668 const size_t N( B.columns() );
9669 const size_t K( A.columns() );
// ---- Strips of three rows (i, i+1, i+2); skipped when both LOW and UPP are set ----
9675 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
// Column range of the strip: stops at i+3 when LOW, starts at i when UPP.
9677 const size_t jend( LOW ? i+3UL : N );
9678 size_t j( UPP ? i : 0UL );
// 3x3 tile: three rows by three columns, nine accumulators.
9680 for( ; (j+3UL) <= jend; j+=3UL )
// kbegin depends on IsUpper_v<MT4>; the rest of the expression is not visible here.
9682 const size_t kbegin( ( IsUpper_v<MT4> )
// Upper end of the k range: limited by the last row of the tile when A is lower and
// by the last column of the tile when B is upper; K otherwise.
9685 const size_t kend( ( IsLower_v<MT4> )
9686 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
9687 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
// First SIMD step: load three row chunks of A and three column chunks of B ...
9696 SIMDType a1( A.load(i ,k) );
9697 SIMDType a2( A.load(i+1UL,k) );
9698 SIMDType a3( A.load(i+2UL,k) );
9699 SIMDType b1( B.load(k,j ) );
9700 SIMDType b2( B.load(k,j+1UL) );
9701 SIMDType b3( B.load(k,j+2UL) );
// ... and seed the nine accumulators; xmm(3*r+c+1) belongs to C(i+r,j+c).
9702 SIMDType xmm1( a1 * b1 );
9703 SIMDType xmm2( a1 * b2 );
9704 SIMDType xmm3( a1 * b3 );
9705 SIMDType xmm4( a2 * b1 );
9706 SIMDType xmm5( a2 * b2 );
9707 SIMDType xmm6( a2 * b3 );
9708 SIMDType xmm7( a3 * b1 );
9709 SIMDType xmm8( a3 * b2 );
9710 SIMDType xmm9( a3 * b3 );
// Operand reloads for the following SIMD steps (the enclosing loop header, the a1/b1
// reloads and the accumulation statements are not visible in this excerpt).
9714 a2 = A.load(i+1UL,k);
9715 a3 = A.load(i+2UL,k);
9717 b2 = B.load(k,j+1UL);
9718 b3 = B.load(k,j+2UL);
// Reduce each accumulator, scale it and add it to its element of C.
9730 C(i ,j ) +=
sum( xmm1 ) * scalar;
9731 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
9732 C(i ,j+2UL) +=
sum( xmm3 ) * scalar;
9733 C(i+1UL,j ) +=
sum( xmm4 ) * scalar;
9734 C(i+1UL,j+1UL) +=
sum( xmm5 ) * scalar;
9735 C(i+1UL,j+2UL) +=
sum( xmm6 ) * scalar;
9736 C(i+2UL,j ) +=
sum( xmm7 ) * scalar;
9737 C(i+2UL,j+1UL) +=
sum( xmm8 ) * scalar;
9738 C(i+2UL,j+2UL) +=
sum( xmm9 ) * scalar;
// Element-wise clean-up of the k values left over by the SIMD steps; compiled out
// in effect when remainder is false.
9740 for( ; remainder && k<kend; ++k ) {
9741 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
9742 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
9743 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
9744 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
9745 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
9746 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
9747 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
9748 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
9749 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
// Scalar path: the loop starts at ++k, so value1..value9 were presumably initialised
// with the products for the first k on lines not visible here.
9764 for( ++k; k<kend; ++k ) {
9765 value1 += A(i ,k) * B(k,j );
9766 value2 += A(i ,k) * B(k,j+1UL);
9767 value3 += A(i ,k) * B(k,j+2UL);
9768 value4 += A(i+1UL,k) * B(k,j );
9769 value5 += A(i+1UL,k) * B(k,j+1UL);
9770 value6 += A(i+1UL,k) * B(k,j+2UL);
9771 value7 += A(i+2UL,k) * B(k,j );
9772 value8 += A(i+2UL,k) * B(k,j+1UL);
9773 value9 += A(i+2UL,k) * B(k,j+2UL);
// The scaling is applied once per element, after the accumulation.
9776 C(i ,j ) += value1 * scalar;
9777 C(i ,j+1UL) += value2 * scalar;
9778 C(i ,j+2UL) += value3 * scalar;
9779 C(i+1UL,j ) += value4 * scalar;
9780 C(i+1UL,j+1UL) += value5 * scalar;
9781 C(i+1UL,j+2UL) += value6 * scalar;
9782 C(i+2UL,j ) += value7 * scalar;
9783 C(i+2UL,j+1UL) += value8 * scalar;
9784 C(i+2UL,j+2UL) += value9 * scalar;
// 3x2 tile: the same three rows, two columns, six accumulators.
9788 for( ; (j+2UL) <= jend; j+=2UL )
9790 const size_t kbegin( ( IsUpper_v<MT4> )
// Same k limit as above, with the column bound reduced to j+2.
9793 const size_t kend( ( IsLower_v<MT4> )
9794 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
9795 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
9804 SIMDType a1( A.load(i ,k) );
9805 SIMDType a2( A.load(i+1UL,k) );
9806 SIMDType a3( A.load(i+2UL,k) );
9807 SIMDType b1( B.load(k,j ) );
9808 SIMDType b2( B.load(k,j+1UL) );
// xmm(2*r+c+1) belongs to C(i+r,j+c).
9809 SIMDType xmm1( a1 * b1 );
9810 SIMDType xmm2( a1 * b2 );
9811 SIMDType xmm3( a2 * b1 );
9812 SIMDType xmm4( a2 * b2 );
9813 SIMDType xmm5( a3 * b1 );
9814 SIMDType xmm6( a3 * b2 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
9818 a2 = A.load(i+1UL,k);
9819 a3 = A.load(i+2UL,k);
9821 b2 = B.load(k,j+1UL);
9830 C(i ,j ) +=
sum( xmm1 ) * scalar;
9831 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
9832 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
9833 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
9834 C(i+2UL,j ) +=
sum( xmm5 ) * scalar;
9835 C(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
// Element-wise clean-up, only when remainder is true.
9837 for( ; remainder && k<kend; ++k ) {
9838 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
9839 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
9840 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
9841 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
9842 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
9843 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
// Scalar path for the 3x2 tile (value initialisation not visible).
9855 for( ++k; k<kend; ++k ) {
9856 value1 += A(i ,k) * B(k,j );
9857 value2 += A(i ,k) * B(k,j+1UL);
9858 value3 += A(i+1UL,k) * B(k,j );
9859 value4 += A(i+1UL,k) * B(k,j+1UL);
9860 value5 += A(i+2UL,k) * B(k,j );
9861 value6 += A(i+2UL,k) * B(k,j+1UL);
9864 C(i ,j ) += value1 * scalar;
9865 C(i ,j+1UL) += value2 * scalar;
9866 C(i+1UL,j ) += value3 * scalar;
9867 C(i+1UL,j+1UL) += value4 * scalar;
9868 C(i+2UL,j ) += value5 * scalar;
9869 C(i+2UL,j+1UL) += value6 * scalar;
// 3x1 tile: one leftover column for the three rows (its guard is not visible here).
9875 const size_t kbegin( ( IsUpper_v<MT4> )
// Only the row bound applies: i+3 when A is lower, K otherwise.
9878 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
// The single B chunk is shared by the three row accumulators.
9887 SIMDType b1( B.load(k,j) );
9888 SIMDType xmm1( A.load(i ,k) * b1 );
9889 SIMDType xmm2( A.load(i+1UL,k) * b1 );
9890 SIMDType xmm3( A.load(i+2UL,k) * b1 );
// Accumulation for the following SIMD steps (loop header and b1 reload not visible).
9894 xmm1 += A.load(i ,k) * b1;
9895 xmm2 += A.load(i+1UL,k) * b1;
9896 xmm3 += A.load(i+2UL,k) * b1;
9899 C(i ,j) +=
sum( xmm1 ) * scalar;
9900 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
9901 C(i+2UL,j) +=
sum( xmm3 ) * scalar;
9903 for( ; remainder && k<kend; ++k ) {
9904 C(i ,j) += A(i ,k) * B(k,j) * scalar;
9905 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
9906 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
// Scalar path for the 3x1 tile.
9915 for( ++k; k<kend; ++k ) {
9916 value1 += A(i ,k) * B(k,j);
9917 value2 += A(i+1UL,k) * B(k,j);
9918 value3 += A(i+2UL,k) * B(k,j);
9921 C(i ,j) += value1 * scalar;
9922 C(i+1UL,j) += value2 * scalar;
9923 C(i+2UL,j) += value3 * scalar;
// ---- Strips of two rows (i, i+1); skipped when both LOW and UPP are set ----
9928 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
// Column range of the strip: stops at i+2 when LOW, starts at i when UPP.
9930 const size_t jend( LOW ? i+2UL : N );
9931 size_t j( UPP ? i : 0UL );
// 2x4 tile: two rows by four columns, eight accumulators.
9933 for( ; (j+4UL) <= jend; j+=4UL )
9935 const size_t kbegin( ( IsUpper_v<MT4> )
9938 const size_t kend( ( IsLower_v<MT4> )
9939 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
9940 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
9949 SIMDType a1( A.load(i ,k) );
9950 SIMDType a2( A.load(i+1UL,k) );
9951 SIMDType b1( B.load(k,j ) );
9952 SIMDType b2( B.load(k,j+1UL) );
9953 SIMDType b3( B.load(k,j+2UL) );
9954 SIMDType b4( B.load(k,j+3UL) );
// xmm1..xmm4 belong to row i, xmm5..xmm8 to row i+1 (columns j..j+3 in order).
9955 SIMDType xmm1( a1 * b1 );
9956 SIMDType xmm2( a1 * b2 );
9957 SIMDType xmm3( a1 * b3 );
9958 SIMDType xmm4( a1 * b4 );
9959 SIMDType xmm5( a2 * b1 );
9960 SIMDType xmm6( a2 * b2 );
9961 SIMDType xmm7( a2 * b3 );
9962 SIMDType xmm8( a2 * b4 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
9966 a2 = A.load(i+1UL,k);
9968 b2 = B.load(k,j+1UL);
9969 b3 = B.load(k,j+2UL);
9970 b4 = B.load(k,j+3UL);
9981 C(i ,j ) +=
sum( xmm1 ) * scalar;
9982 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
9983 C(i ,j+2UL) +=
sum( xmm3 ) * scalar;
9984 C(i ,j+3UL) +=
sum( xmm4 ) * scalar;
9985 C(i+1UL,j ) +=
sum( xmm5 ) * scalar;
9986 C(i+1UL,j+1UL) +=
sum( xmm6 ) * scalar;
9987 C(i+1UL,j+2UL) +=
sum( xmm7 ) * scalar;
9988 C(i+1UL,j+3UL) +=
sum( xmm8 ) * scalar;
// Element-wise clean-up, only when remainder is true.
9990 for( ; remainder && k<kend; ++k ) {
9991 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
9992 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
9993 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
9994 C(i ,j+3UL) += A(i ,k) * B(k,j+3UL) * scalar;
9995 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
9996 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
9997 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
9998 C(i+1UL,j+3UL) += A(i+1UL,k) * B(k,j+3UL) * scalar;
// Scalar path: taken when the preceding condition (not visible here) fails and
// there is at least one k left.
10001 else if( k < kend )
10012 for( ++k; k<kend; ++k ) {
10013 value1 += A(i ,k) * B(k,j );
10014 value2 += A(i ,k) * B(k,j+1UL);
10015 value3 += A(i ,k) * B(k,j+2UL);
10016 value4 += A(i ,k) * B(k,j+3UL);
10017 value5 += A(i+1UL,k) * B(k,j );
10018 value6 += A(i+1UL,k) * B(k,j+1UL);
10019 value7 += A(i+1UL,k) * B(k,j+2UL);
10020 value8 += A(i+1UL,k) * B(k,j+3UL);
10023 C(i ,j ) += value1 * scalar;
10024 C(i ,j+1UL) += value2 * scalar;
10025 C(i ,j+2UL) += value3 * scalar;
10026 C(i ,j+3UL) += value4 * scalar;
10027 C(i+1UL,j ) += value5 * scalar;
10028 C(i+1UL,j+1UL) += value6 * scalar;
10029 C(i+1UL,j+2UL) += value7 * scalar;
10030 C(i+1UL,j+3UL) += value8 * scalar;
// 2x2 tile: two rows by two columns, four accumulators.
10034 for( ; (j+2UL) <= jend; j+=2UL )
10036 const size_t kbegin( ( IsUpper_v<MT4> )
10039 const size_t kend( ( IsLower_v<MT4> )
10040 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
10041 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
// k starts at kbegin and is advanced by the SIMD and scalar loops below.
10046 size_t k( kbegin );
10050 SIMDType a1( A.load(i ,k) );
10051 SIMDType a2( A.load(i+1UL,k) );
10052 SIMDType b1( B.load(k,j ) );
10053 SIMDType b2( B.load(k,j+1UL) );
10054 SIMDType xmm1( a1 * b1 );
10055 SIMDType xmm2( a1 * b2 );
10056 SIMDType xmm3( a2 * b1 );
10057 SIMDType xmm4( a2 * b2 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
10061 a2 = A.load(i+1UL,k);
10063 b2 = B.load(k,j+1UL);
10070 C(i ,j ) +=
sum( xmm1 ) * scalar;
10071 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
10072 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
10073 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
10075 for( ; remainder && k<kend; ++k ) {
10076 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10077 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10078 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10079 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// Scalar path for the 2x2 tile.
10082 else if( k < kend )
10089 for( ++k; k<kend; ++k ) {
10090 value1 += A(i ,k) * B(k,j );
10091 value2 += A(i ,k) * B(k,j+1UL);
10092 value3 += A(i+1UL,k) * B(k,j );
10093 value4 += A(i+1UL,k) * B(k,j+1UL);
10096 C(i ,j ) += value1 * scalar;
10097 C(i ,j+1UL) += value2 * scalar;
10098 C(i+1UL,j ) += value3 * scalar;
10099 C(i+1UL,j+1UL) += value4 * scalar;
// 2x1 tile: one leftover column for the two rows (its guard is not visible here).
10105 const size_t kbegin( ( IsUpper_v<MT4> )
10108 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
10113 size_t k( kbegin );
10117 SIMDType b1( B.load(k,j) );
10118 SIMDType xmm1( A.load(i ,k) * b1 );
10119 SIMDType xmm2( A.load(i+1UL,k) * b1 );
// Accumulation for the following SIMD steps (loop header and b1 reload not visible).
10123 xmm1 += A.load(i ,k) * b1;
10124 xmm2 += A.load(i+1UL,k) * b1;
10127 C(i ,j) +=
sum( xmm1 ) * scalar;
10128 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
10130 for( ; remainder && k<kend; ++k ) {
10131 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10132 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Scalar path for the 2x1 tile.
10135 else if( k < kend )
10140 for( ++k; k<kend; ++k ) {
10141 value1 += A(i ,k) * B(k,j);
10142 value2 += A(i+1UL,k) * B(k,j);
10145 C(i ,j) += value1 * scalar;
10146 C(i+1UL,j) += value2 * scalar;
// ---- Single rows: only row i of C is written from here on ----
// (the enclosing row loop header is not visible in this excerpt)
10153 const size_t jend( LOW ? i+1UL : N );
10154 size_t j( UPP ? i : 0UL );
// 1x4 tile; skipped when both LOW and UPP are set.
10156 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
10158 const size_t kbegin( ( IsUpper_v<MT4> )
// Only the column bound applies: j+4 when B is upper, K otherwise.
10161 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
10166 size_t k( kbegin );
// The single A chunk is shared by the four column accumulators.
10170 SIMDType a1( A.load(i,k) );
10171 SIMDType xmm1( a1 * B.load(k,j ) );
10172 SIMDType xmm2( a1 * B.load(k,j+1UL) );
10173 SIMDType xmm3( a1 * B.load(k,j+2UL) );
10174 SIMDType xmm4( a1 * B.load(k,j+3UL) );
// Accumulation for the following SIMD steps (loop header and a1 reload not visible).
10178 xmm1 += a1 * B.load(k,j );
10179 xmm2 += a1 * B.load(k,j+1UL);
10180 xmm3 += a1 * B.load(k,j+2UL);
10181 xmm4 += a1 * B.load(k,j+3UL);
10184 C(i,j ) +=
sum( xmm1 ) * scalar;
10185 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
10186 C(i,j+2UL) +=
sum( xmm3 ) * scalar;
10187 C(i,j+3UL) +=
sum( xmm4 ) * scalar;
10189 for( ; remainder && k<kend; ++k ) {
10190 C(i,j ) += A(i,k) * B(k,j ) * scalar;
10191 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
10192 C(i,j+2UL) += A(i,k) * B(k,j+2UL) * scalar;
10193 C(i,j+3UL) += A(i,k) * B(k,j+3UL) * scalar;
// Scalar path for the 1x4 tile.
10196 else if( k < kend )
10203 for( ++k; k<kend; ++k ) {
10204 value1 += A(i,k) * B(k,j );
10205 value2 += A(i,k) * B(k,j+1UL);
10206 value3 += A(i,k) * B(k,j+2UL);
10207 value4 += A(i,k) * B(k,j+3UL);
10210 C(i,j ) += value1 * scalar;
10211 C(i,j+1UL) += value2 * scalar;
10212 C(i,j+2UL) += value3 * scalar;
10213 C(i,j+3UL) += value4 * scalar;
// 1x2 tile.
10217 for( ; (j+2UL) <= jend; j+=2UL )
10219 const size_t kbegin( ( IsUpper_v<MT4> )
10222 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
10227 size_t k( kbegin );
10231 SIMDType a1( A.load(i,k) );
10232 SIMDType xmm1( a1 * B.load(k,j ) );
10233 SIMDType xmm2( a1 * B.load(k,j+1UL) );
10237 xmm1 += a1 * B.load(k,j );
10238 xmm2 += a1 * B.load(k,j+1UL);
10241 C(i,j ) +=
sum( xmm1 ) * scalar;
10242 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
10244 for( ; remainder && k<kend; ++k ) {
10245 C(i,j ) += A(i,k) * B(k,j ) * scalar;
10246 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Scalar path for the 1x2 tile.
10249 else if( k < kend )
10254 for( ++k; k<kend; ++k ) {
10255 value1 += A(i,k) * B(k,j );
10256 value2 += A(i,k) * B(k,j+1UL);
10259 C(i,j ) += value1 * scalar;
10260 C(i,j+1UL) += value2 * scalar;
// 1x1 tile: a single element of C (its guard is not visible here).
10266 const size_t kbegin( ( IsUpper_v<MT4> )
10273 size_t k( kbegin );
10277 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
10280 xmm1 += A.load(i,k) * B.load(k,j);
10283 C(i,j) +=
sum( xmm1 ) * scalar;
// This tile runs the clean-up and scalar loops up to K directly; no kend is used.
10285 for( ; remainder && k<K; ++k ) {
10286 C(i,j) += A(i,k) * B(k,j) * scalar;
10293 for( ++k; k<K; ++k ) {
10294 value += A(i,k) * B(k,j);
10297 C(i,j) += value * scalar;
// Small-matrix addition assignment kernel, vectorized, column-major target.
// Enabled when C is column-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is
// true.
//
// Structure visible in this excerpt:
//   - rows of C are consumed in strips of 4 and 3 (both only when neither LOW nor UPP
//     is set), then strips of 2, then single rows;
//   - inside a strip the columns are consumed in groups (2/1, 3/2/1, 2/1 and 2/1);
//   - every C element of a tile has its own SIMD accumulator (xmmN) that collects
//     A.load(row,k) * B.load(k,col) along k; sum( xmmN ) * scalar is then added to C;
//   - a scalar loop over the remaining k runs only when `remainder` is true;
//   - an alternative scalar path accumulates into valueN and adds valueN * scalar.
// Several loop headers, the reload of a1/b1, the accumulation statements and the
// initialisation of the valueN variables sit on lines not visible in this excerpt.
10319 template<
typename MT3
10323 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10324 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder: true as soon as one of the operand types is not padded; it gates the
// element-wise clean-up loops below.
10326 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
// M x N is the shape walked in C, K the inner dimension.
10328 const size_t M( A.rows() );
10329 const size_t N( B.columns() );
10330 const size_t K( A.columns() );
// ---- Strips of four rows; only when neither LOW nor UPP is set ----
10336 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile: the column loop runs over the full width N.
10340 for( ; (j+2UL) <= N; j+=2UL )
// kbegin depends on IsUpper_v<MT4>; the rest of the expression is not visible here.
10342 const size_t kbegin( ( IsUpper_v<MT4> )
// Upper end of the k range: limited by the last row of the tile when A is lower and
// by the last column of the tile when B is upper; K otherwise.
10345 const size_t kend( ( IsLower_v<MT4> )
10346 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
10347 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10352 size_t k( kbegin );
// First SIMD step: four row chunks of A, two column chunks of B.
10356 SIMDType a1( A.load(i ,k) );
10357 SIMDType a2( A.load(i+1UL,k) );
10358 SIMDType a3( A.load(i+2UL,k) );
10359 SIMDType a4( A.load(i+3UL,k) );
10360 SIMDType b1( B.load(k,j ) );
10361 SIMDType b2( B.load(k,j+1UL) );
// xmm(2*r+c+1) belongs to C(i+r,j+c).
10362 SIMDType xmm1( a1 * b1 );
10363 SIMDType xmm2( a1 * b2 );
10364 SIMDType xmm3( a2 * b1 );
10365 SIMDType xmm4( a2 * b2 );
10366 SIMDType xmm5( a3 * b1 );
10367 SIMDType xmm6( a3 * b2 );
10368 SIMDType xmm7( a4 * b1 );
10369 SIMDType xmm8( a4 * b2 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
10373 a2 = A.load(i+1UL,k);
10374 a3 = A.load(i+2UL,k);
10375 a4 = A.load(i+3UL,k);
10377 b2 = B.load(k,j+1UL);
// Reduce each accumulator, scale it and add it to its element of C.
10388 C(i ,j ) +=
sum( xmm1 ) * scalar;
10389 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
10390 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
10391 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
10392 C(i+2UL,j ) +=
sum( xmm5 ) * scalar;
10393 C(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
10394 C(i+3UL,j ) +=
sum( xmm7 ) * scalar;
10395 C(i+3UL,j+1UL) +=
sum( xmm8 ) * scalar;
// Element-wise clean-up, only when remainder is true.
10397 for( ; remainder && k<kend; ++k ) {
10398 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10399 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10400 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10401 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10402 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
10403 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
10404 C(i+3UL,j ) += A(i+3UL,k) * B(k,j ) * scalar;
10405 C(i+3UL,j+1UL) += A(i+3UL,k) * B(k,j+1UL) * scalar;
// Scalar path: taken when the preceding condition (not visible here) fails and
// there is at least one k left. The loop starts at ++k, so value1..value8 were
// presumably initialised with the products for the first k.
10408 else if( k < kend )
10419 for( ++k; k<kend; ++k ) {
10420 value1 += A(i ,k) * B(k,j );
10421 value2 += A(i ,k) * B(k,j+1UL);
10422 value3 += A(i+1UL,k) * B(k,j );
10423 value4 += A(i+1UL,k) * B(k,j+1UL);
10424 value5 += A(i+2UL,k) * B(k,j );
10425 value6 += A(i+2UL,k) * B(k,j+1UL);
10426 value7 += A(i+3UL,k) * B(k,j );
10427 value8 += A(i+3UL,k) * B(k,j+1UL);
// The scaling is applied once per element, after the accumulation.
10430 C(i ,j ) += value1 * scalar;
10431 C(i ,j+1UL) += value2 * scalar;
10432 C(i+1UL,j ) += value3 * scalar;
10433 C(i+1UL,j+1UL) += value4 * scalar;
10434 C(i+2UL,j ) += value5 * scalar;
10435 C(i+2UL,j+1UL) += value6 * scalar;
10436 C(i+3UL,j ) += value7 * scalar;
10437 C(i+3UL,j+1UL) += value8 * scalar;
// 4x1 tile: one leftover column for the four rows (its guard is not visible here).
10443 const size_t kbegin( ( IsUpper_v<MT4> )
// Only the row bound applies: i+4 when A is lower, K otherwise.
10446 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
10451 size_t k( kbegin );
// The single B chunk is shared by the four row accumulators.
10455 SIMDType b1( B.load(k,j) );
10456 SIMDType xmm1( A.load(i ,k) * b1 );
10457 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10458 SIMDType xmm3( A.load(i+2UL,k) * b1 );
10459 SIMDType xmm4( A.load(i+3UL,k) * b1 );
// Accumulation for the following SIMD steps (loop header and b1 reload not visible).
10463 xmm1 += A.load(i ,k) * b1;
10464 xmm2 += A.load(i+1UL,k) * b1;
10465 xmm3 += A.load(i+2UL,k) * b1;
10466 xmm4 += A.load(i+3UL,k) * b1;
10469 C(i ,j) +=
sum( xmm1 ) * scalar;
10470 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
10471 C(i+2UL,j) +=
sum( xmm3 ) * scalar;
10472 C(i+3UL,j) +=
sum( xmm4 ) * scalar;
10474 for( ; remainder && k<kend; ++k ) {
10475 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10476 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
10477 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
10478 C(i+3UL,j) += A(i+3UL,k) * B(k,j) * scalar;
// Scalar path for the 4x1 tile.
10481 else if( k < kend )
10488 for( ++k; k<kend; ++k ) {
10489 value1 += A(i ,k) * B(k,j);
10490 value2 += A(i+1UL,k) * B(k,j);
10491 value3 += A(i+2UL,k) * B(k,j);
10492 value4 += A(i+3UL,k) * B(k,j);
10495 C(i ,j) += value1 * scalar;
10496 C(i+1UL,j) += value2 * scalar;
10497 C(i+2UL,j) += value3 * scalar;
10498 C(i+3UL,j) += value4 * scalar;
// ---- Strips of three rows; only when neither LOW nor UPP is set ----
10503 for( ; !LOW && !UPP && (i+3UL) <= M; i+=3UL )
// 3x3 tile: nine accumulators, columns over the full width N.
10507 for( ; (j+3UL) <= N; j+=3UL )
10509 const size_t kbegin( ( IsUpper_v<MT4> )
10512 const size_t kend( ( IsLower_v<MT4> )
10513 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
10514 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
10519 size_t k( kbegin );
10523 SIMDType a1( A.load(i ,k) );
10524 SIMDType a2( A.load(i+1UL,k) );
10525 SIMDType a3( A.load(i+2UL,k) );
10526 SIMDType b1( B.load(k,j ) );
10527 SIMDType b2( B.load(k,j+1UL) );
10528 SIMDType b3( B.load(k,j+2UL) );
// xmm(3*r+c+1) belongs to C(i+r,j+c).
10529 SIMDType xmm1( a1 * b1 );
10530 SIMDType xmm2( a1 * b2 );
10531 SIMDType xmm3( a1 * b3 );
10532 SIMDType xmm4( a2 * b1 );
10533 SIMDType xmm5( a2 * b2 );
10534 SIMDType xmm6( a2 * b3 );
10535 SIMDType xmm7( a3 * b1 );
10536 SIMDType xmm8( a3 * b2 );
10537 SIMDType xmm9( a3 * b3 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
10541 a2 = A.load(i+1UL,k);
10542 a3 = A.load(i+2UL,k);
10544 b2 = B.load(k,j+1UL);
10545 b3 = B.load(k,j+2UL);
10557 C(i ,j ) +=
sum( xmm1 ) * scalar;
10558 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
10559 C(i ,j+2UL) +=
sum( xmm3 ) * scalar;
10560 C(i+1UL,j ) +=
sum( xmm4 ) * scalar;
10561 C(i+1UL,j+1UL) +=
sum( xmm5 ) * scalar;
10562 C(i+1UL,j+2UL) +=
sum( xmm6 ) * scalar;
10563 C(i+2UL,j ) +=
sum( xmm7 ) * scalar;
10564 C(i+2UL,j+1UL) +=
sum( xmm8 ) * scalar;
10565 C(i+2UL,j+2UL) +=
sum( xmm9 ) * scalar;
// Element-wise clean-up, only when remainder is true.
10567 for( ; remainder && k<kend; ++k ) {
10568 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10569 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10570 C(i ,j+2UL) += A(i ,k) * B(k,j+2UL) * scalar;
10571 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10572 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10573 C(i+1UL,j+2UL) += A(i+1UL,k) * B(k,j+2UL) * scalar;
10574 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
10575 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
10576 C(i+2UL,j+2UL) += A(i+2UL,k) * B(k,j+2UL) * scalar;
// Scalar path for the 3x3 tile.
10579 else if( k < kend )
10591 for( ++k; k<kend; ++k ) {
10592 value1 += A(i ,k) * B(k,j );
10593 value2 += A(i ,k) * B(k,j+1UL);
10594 value3 += A(i ,k) * B(k,j+2UL);
10595 value4 += A(i+1UL,k) * B(k,j );
10596 value5 += A(i+1UL,k) * B(k,j+1UL);
10597 value6 += A(i+1UL,k) * B(k,j+2UL);
10598 value7 += A(i+2UL,k) * B(k,j );
10599 value8 += A(i+2UL,k) * B(k,j+1UL);
10600 value9 += A(i+2UL,k) * B(k,j+2UL);
10603 C(i ,j ) += value1 * scalar;
10604 C(i ,j+1UL) += value2 * scalar;
10605 C(i ,j+2UL) += value3 * scalar;
10606 C(i+1UL,j ) += value4 * scalar;
10607 C(i+1UL,j+1UL) += value5 * scalar;
10608 C(i+1UL,j+2UL) += value6 * scalar;
10609 C(i+2UL,j ) += value7 * scalar;
10610 C(i+2UL,j+1UL) += value8 * scalar;
10611 C(i+2UL,j+2UL) += value9 * scalar;
// 3x2 tile: six accumulators.
10615 for( ; (j+2UL) <= N; j+=2UL )
10617 const size_t kbegin( ( IsUpper_v<MT4> )
10620 const size_t kend( ( IsLower_v<MT4> )
10621 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
10622 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10627 size_t k( kbegin );
10631 SIMDType a1( A.load(i ,k) );
10632 SIMDType a2( A.load(i+1UL,k) );
10633 SIMDType a3( A.load(i+2UL,k) );
10634 SIMDType b1( B.load(k,j ) );
10635 SIMDType b2( B.load(k,j+1UL) );
10636 SIMDType xmm1( a1 * b1 );
10637 SIMDType xmm2( a1 * b2 );
10638 SIMDType xmm3( a2 * b1 );
10639 SIMDType xmm4( a2 * b2 );
10640 SIMDType xmm5( a3 * b1 );
10641 SIMDType xmm6( a3 * b2 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
10645 a2 = A.load(i+1UL,k);
10646 a3 = A.load(i+2UL,k);
10648 b2 = B.load(k,j+1UL);
10657 C(i ,j ) +=
sum( xmm1 ) * scalar;
10658 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
10659 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
10660 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
10661 C(i+2UL,j ) +=
sum( xmm5 ) * scalar;
10662 C(i+2UL,j+1UL) +=
sum( xmm6 ) * scalar;
10664 for( ; remainder && k<kend; ++k ) {
10665 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10666 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10667 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10668 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
10669 C(i+2UL,j ) += A(i+2UL,k) * B(k,j ) * scalar;
10670 C(i+2UL,j+1UL) += A(i+2UL,k) * B(k,j+1UL) * scalar;
// Scalar path for the 3x2 tile.
10673 else if( k < kend )
10682 for( ++k; k<kend; ++k ) {
10683 value1 += A(i ,k) * B(k,j );
10684 value2 += A(i ,k) * B(k,j+1UL);
10685 value3 += A(i+1UL,k) * B(k,j );
10686 value4 += A(i+1UL,k) * B(k,j+1UL);
10687 value5 += A(i+2UL,k) * B(k,j );
10688 value6 += A(i+2UL,k) * B(k,j+1UL);
10691 C(i ,j ) += value1 * scalar;
10692 C(i ,j+1UL) += value2 * scalar;
10693 C(i+1UL,j ) += value3 * scalar;
10694 C(i+1UL,j+1UL) += value4 * scalar;
10695 C(i+2UL,j ) += value5 * scalar;
10696 C(i+2UL,j+1UL) += value6 * scalar;
// 3x1 tile: one leftover column for the three rows (its guard is not visible here).
10702 const size_t kbegin( ( IsUpper_v<MT4> )
10705 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
10710 size_t k( kbegin );
10714 SIMDType b1( B.load(k,j) );
10715 SIMDType xmm1( A.load(i ,k) * b1 );
10716 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10717 SIMDType xmm3( A.load(i+2UL,k) * b1 );
10721 xmm1 += A.load(i ,k) * b1;
10722 xmm2 += A.load(i+1UL,k) * b1;
10723 xmm3 += A.load(i+2UL,k) * b1;
10726 C(i ,j) +=
sum( xmm1 ) * scalar;
10727 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
10728 C(i+2UL,j) +=
sum( xmm3 ) * scalar;
10730 for( ; remainder && k<kend; ++k ) {
10731 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10732 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
10733 C(i+2UL,j) += A(i+2UL,k) * B(k,j) * scalar;
// Scalar path for the 3x1 tile.
10736 else if( k < kend )
10742 for( ++k; k<kend; ++k ) {
10743 value1 += A(i ,k) * B(k,j);
10744 value2 += A(i+1UL,k) * B(k,j);
10745 value3 += A(i+2UL,k) * B(k,j);
10748 C(i ,j) += value1 * scalar;
10749 C(i+1UL,j) += value2 * scalar;
10750 C(i+2UL,j) += value3 * scalar;
// ---- Strips of two rows; this loop carries no LOW/UPP condition ----
10755 for( ; (i+2UL) <= M; i+=2UL )
// Column range of the strip: stops at i+2 when LOW, starts at i when UPP.
10757 const size_t jend( LOW ? i+2UL : N );
10758 size_t j( UPP ? i : 0UL );
// 2x2 tile: four accumulators.
10760 for( ; (j+2UL) <= jend; j+=2UL )
10762 const size_t kbegin( ( IsUpper_v<MT4> )
10765 const size_t kend( ( IsLower_v<MT4> )
10766 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
10767 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
10772 size_t k( kbegin );
10776 SIMDType a1( A.load(i ,k) );
10777 SIMDType a2( A.load(i+1UL,k) );
10778 SIMDType b1( B.load(k,j ) );
10779 SIMDType b2( B.load(k,j+1UL) );
10780 SIMDType xmm1( a1 * b1 );
10781 SIMDType xmm2( a1 * b2 );
10782 SIMDType xmm3( a2 * b1 );
10783 SIMDType xmm4( a2 * b2 );
// Operand reloads for the following SIMD steps (remaining statements not visible).
10787 a2 = A.load(i+1UL,k);
10789 b2 = B.load(k,j+1UL);
10796 C(i ,j ) +=
sum( xmm1 ) * scalar;
10797 C(i ,j+1UL) +=
sum( xmm2 ) * scalar;
10798 C(i+1UL,j ) +=
sum( xmm3 ) * scalar;
10799 C(i+1UL,j+1UL) +=
sum( xmm4 ) * scalar;
10801 for( ; remainder && k<kend; ++k ) {
10802 C(i ,j ) += A(i ,k) * B(k,j ) * scalar;
10803 C(i ,j+1UL) += A(i ,k) * B(k,j+1UL) * scalar;
10804 C(i+1UL,j ) += A(i+1UL,k) * B(k,j ) * scalar;
10805 C(i+1UL,j+1UL) += A(i+1UL,k) * B(k,j+1UL) * scalar;
// Scalar path for the 2x2 tile.
10808 else if( k < kend )
10815 for( ++k; k<kend; ++k ) {
10816 value1 += A(i ,k) * B(k,j );
10817 value2 += A(i ,k) * B(k,j+1UL);
10818 value3 += A(i+1UL,k) * B(k,j );
10819 value4 += A(i+1UL,k) * B(k,j+1UL);
10822 C(i ,j ) += value1 * scalar;
10823 C(i ,j+1UL) += value2 * scalar;
10824 C(i+1UL,j ) += value3 * scalar;
10825 C(i+1UL,j+1UL) += value4 * scalar;
// 2x1 tile: one leftover column for the two rows (its guard is not visible here).
10831 const size_t kbegin( ( IsUpper_v<MT4> )
10834 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
10839 size_t k( kbegin );
10843 SIMDType b1( B.load(k,j) );
10844 SIMDType xmm1( A.load(i ,k) * b1 );
10845 SIMDType xmm2( A.load(i+1UL,k) * b1 );
10849 xmm1 += A.load(i ,k) * b1;
10850 xmm2 += A.load(i+1UL,k) * b1;
10853 C(i ,j) +=
sum( xmm1 ) * scalar;
10854 C(i+1UL,j) +=
sum( xmm2 ) * scalar;
10856 for( ; remainder && k<kend; ++k ) {
10857 C(i ,j) += A(i ,k) * B(k,j) * scalar;
10858 C(i+1UL,j) += A(i+1UL,k) * B(k,j) * scalar;
// Scalar path for the 2x1 tile.
10861 else if( k < kend )
10866 for( ++k; k<kend; ++k ) {
10867 value1 += A(i ,k) * B(k,j);
10868 value2 += A(i+1UL,k) * B(k,j);
10871 C(i ,j) += value1 * scalar;
10872 C(i+1UL,j) += value2 * scalar;
// ---- Single rows: only row i of C is written from here on ----
// (the enclosing row loop header/guard is not visible in this excerpt)
10879 const size_t jend( LOW ? i+1UL : N );
10880 size_t j( UPP ? i : 0UL );
// 1x2 tile.
10882 for( ; (j+2UL) <= jend; j+=2UL )
10884 const size_t kbegin( ( IsUpper_v<MT4> )
// Only the column bound applies: j+2 when B is upper, K otherwise.
10887 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
10892 size_t k( kbegin );
// The single A chunk is shared by the two column accumulators.
10896 SIMDType a1( A.load(i,k) );
10897 SIMDType xmm1( a1 * B.load(k,j ) );
10898 SIMDType xmm2( a1 * B.load(k,j+1UL) );
10902 xmm1 += a1 * B.load(k,j );
10903 xmm2 += a1 * B.load(k,j+1UL);
10906 C(i,j ) +=
sum( xmm1 ) * scalar;
10907 C(i,j+1UL) +=
sum( xmm2 ) * scalar;
10909 for( ; remainder && k<kend; ++k ) {
10910 C(i,j ) += A(i,k) * B(k,j ) * scalar;
10911 C(i,j+1UL) += A(i,k) * B(k,j+1UL) * scalar;
// Scalar path for the 1x2 tile.
10914 else if( k < kend )
10919 for( ++k; k<kend; ++k ) {
10920 value1 += A(i,k) * B(k,j );
10921 value2 += A(i,k) * B(k,j+1UL);
10924 C(i,j ) += value1 * scalar;
10925 C(i,j+1UL) += value2 * scalar;
// 1x1 tile: a single element of C (its guard is not visible here).
10931 const size_t kbegin( ( IsUpper_v<MT4> )
10938 size_t k( kbegin );
10942 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
10945 xmm1 += A.load(i,k) * B.load(k,j);
10948 C(i,j) +=
sum( xmm1 ) * scalar;
// This tile runs the clean-up and scalar loops up to K directly; no kend is used.
10950 for( ; remainder && k<K; ++k ) {
10951 C(i,j) += A(i,k) * B(k,j) * scalar;
10958 for( ++k; k<K; ++k ) {
10959 value += A(i,k) * B(k,j);
10962 C(i,j) += value * scalar;
// Large-matrix addition assignment kernel, fallback overload.
// Viable only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t);
// the work is passed on unchanged to the default addition assignment kernel.
10983 template<
typename MT3
10987 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
10988 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// No vectorized path for these types: use the default kernel.
10990 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition-assignment kernel, vectorized overload.
// Selected (via EnableIf_t) when UseVectorizedDefaultKernel_v is true for the
// operand types.
11009 template<
typename MT3
11013 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11014 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// The work is delegated to lmmm, ummm or mmm, each called with the scalar and
// ST2(1) as trailing arguments.
// NOTE(review): the conditions choosing between the three calls are not visible
// in this excerpt; presumably they depend on the LOW/UPP flags - confirm.
11017 lmmm( C, A, B, scalar, ST2(1) );
11019 ummm( C, A, B, scalar, ST2(1) );
11021 mmm( C, A, B, scalar, ST2(1) );
// BLAS-based addition-assignment kernel, fallback overload.
// Selected (via DisableIf_t) when UseBlasKernel_v is false for the operand types;
// it forwards all arguments unchanged to selectLargeAddAssignKernel.
11039 template<
typename MT3
11043 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11044 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
11046 selectLargeAddAssignKernel( C, A, B, scalar );
11051#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel, BLAS overload.
// Selected (via EnableIf_t) when UseBlasKernel_v is true for the operand types.
// Three cases are distinguished on the operand types:
//  - A (MT4) triangular: a temporary is initialized from B, multiplied in place
//    via trmm from the left, and then add-assigned to C;
//  - B (MT5) triangular: a temporary is initialized from A, multiplied in place
//    via trmm from the right, and then add-assigned to C;
//  - otherwise gemm is called directly on C.
11065 template<
typename MT3
11069 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11070 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// Element type of the target; the scalar is converted to it before each BLAS call.
11072 using ET = ElementType_t<MT3>;
11074 if( IsTriangular_v<MT4> ) {
11075 ResultType_t<MT3> tmp(
serial( B ) );
// The lower/upper flag passed to trmm follows IsLower_v of the triangular operand.
11076 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
11077 addAssign( C, tmp );
11079 else if( IsTriangular_v<MT5> ) {
11080 ResultType_t<MT3> tmp(
serial( A ) );
11081 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
11082 addAssign( C, tmp );
// General case: gemm receives ET(scalar) and ET(1) as its two scaling arguments.
// NOTE(review): presumably these are the product factor and the factor applied
// to the existing C, i.e. C = scalar*A*B + 1*C - confirm against the gemm wrapper.
11085 gemm( C, A, B,
ET(scalar),
ET(1) );
// Subtraction assignment of the scaled matrix product (rhs) to a dense matrix (lhs).
// The two operands of the wrapped product are fetched from rhs.matrix_, a guard
// tests for empty dimensions, and the work is handed to selectSubAssignKernel
// together with rhs.scalar_.
11107 template<
typename MT
11109 friend inline void subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
11116 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
11117 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Guard: the target has no rows or no columns, or the left operand has no columns.
// NOTE(review): the body of this branch is not visible here; presumably it
// returns early - confirm.
11119 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and B are defined on lines not shown in this excerpt;
// presumably they are the (possibly evaluated) operands left and right.
11133 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel dispatcher for the subtraction assignment C -= A * B * scalar.
// The small kernel is chosen when either operand type is diagonal or when the
// number of elements of C (rows * columns) is below DMATTDMATMULT_THRESHOLD.
// The BLAS kernel selector is the other visible alternative.
// NOTE(review): the `else` joining the two calls is not visible in this excerpt.
11148 template<
typename MT3
11152 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11154 if( ( IsDiagonal_v<MT4> || IsDiagonal_v<MT5> ) ||
11155 ( C.rows() * C.columns() < DMATTDMATMULT_THRESHOLD ) )
11156 selectSmallSubAssignKernel( C, A, B, scalar );
11158 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction-assignment kernel, general overload.
// Enabled when neither operand type is diagonal. It subtract-assigns a
// temporary named tmp from C.
// NOTE(review): the definition of tmp is not visible in this excerpt; presumably
// it holds the evaluated scaled product A * B * scalar - confirm.
11176 template<
typename MT3
11180 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11181 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11184 subAssign( C, tmp );
// Default subtraction-assignment kernel for a row-major target, a non-diagonal
// left operand A and a diagonal right operand B.
// Because B is diagonal, each update only needs B(j,j):
//    C(i,j) -= A(i,j) * B(j,j) * scalar
11202 template<
typename MT3
11206 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11207 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11209 const size_t M( A.rows() );
11210 const size_t N( B.columns() );
11212 for(
size_t i=0UL; i<M; ++i )
// Column range [jbegin,jend) of row i. It is narrowed when A is upper or lower
// triangular; a strictly triangular A additionally excludes the diagonal element.
// NOTE(review): the fallback branches of both conditionals are not visible here.
11214 const size_t jbegin( ( IsUpper_v<MT4> )
11215 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
11217 const size_t jend( ( IsLower_v<MT4> )
11218 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part of the range that is processed two columns at a time.
11222 const size_t jnum( jend - jbegin );
11223 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Main loop: two columns per iteration.
11226 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
11227 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
11228 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// One column is left over when the range has an odd number of columns.
11230 if( jpos < jend ) {
11231 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction-assignment kernel for a column-major target, a non-diagonal
// left operand A and a diagonal right operand B.
// The index space is traversed in tiles of BLOCK_SIZE columns by BLOCK_SIZE rows;
// each update is C(i,j) -= A(i,j) * B(j,j) * scalar.
11251 template<
typename MT3
11255 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11256 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11258 constexpr size_t block( BLOCK_SIZE );
11260 const size_t M( A.rows() );
11261 const size_t N( B.columns() );
// Outer tiling over columns [jj,jend), inner tiling over rows [ii,iend).
11263 for(
size_t jj=0UL; jj<N; jj+=block ) {
11264 const size_t jend(
min( N, jj+block ) );
11265 for(
size_t ii=0UL; ii<M; ii+=block ) {
11266 const size_t iend(
min( M, ii+block ) );
11267 for(
size_t j=jj; j<jend; ++j )
// Row range [ibegin,ipos) of column j inside the current tile. It is clipped
// when A is lower or upper triangular; a strictly triangular A excludes the
// diagonal element.
// NOTE(review): the fallback branches of both conditionals are not visible here.
11269 const size_t ibegin( ( IsLower_v<MT4> )
11270 ?(
max( ( IsStrictlyLower_v<MT4> ? j+1UL : j ), ii ) )
11272 const size_t ipos( ( IsUpper_v<MT4> )
11273 ?(
min( ( IsStrictlyUpper_v<MT4> ? j : j+1UL ), iend ) )
11276 for(
size_t i=ibegin; i<ipos; ++i ) {
11277 C(i,j) -= A(i,j) * B(j,j) * scalar;
// Default subtraction-assignment kernel for a row-major target, a diagonal left
// operand A and a non-diagonal right operand B.
// The index space is traversed in tiles of BLOCK_SIZE rows by BLOCK_SIZE columns;
// each update is C(i,j) -= A(i,i) * B(i,j) * scalar.
11300 template<
typename MT3
11304 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11305 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11307 constexpr size_t block( BLOCK_SIZE );
11309 const size_t M( A.rows() );
11310 const size_t N( B.columns() );
// Outer tiling over rows [ii,iend), inner tiling over columns [jj,jend).
11312 for(
size_t ii=0UL; ii<M; ii+=block ) {
11313 const size_t iend(
min( M, ii+block ) );
11314 for(
size_t jj=0UL; jj<N; jj+=block ) {
11315 const size_t jend(
min( N, jj+block ) );
11316 for(
size_t i=ii; i<iend; ++i )
// Column range [jbegin,jpos) of row i inside the current tile. It is clipped
// when B is upper or lower triangular; a strictly triangular B excludes the
// diagonal element.
// NOTE(review): the fallback branches of both conditionals are not visible here.
11318 const size_t jbegin( ( IsUpper_v<MT5> )
11319 ?(
max( ( IsStrictlyUpper_v<MT5> ? i+1UL : i ), jj ) )
11321 const size_t jpos( ( IsLower_v<MT5> )
11322 ?(
min( ( IsStrictlyLower_v<MT5> ? i : i+1UL ), jend ) )
11325 for(
size_t j=jbegin; j<jpos; ++j ) {
11326 C(i,j) -= A(i,i) * B(i,j) * scalar;
// Default subtraction-assignment kernel for a column-major target, a diagonal left
// operand A and a non-diagonal right operand B.
// Because A is diagonal, each update only needs A(i,i):
//    C(i,j) -= A(i,i) * B(i,j) * scalar
11349 template<
typename MT3
11353 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11354 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
11356 const size_t M( A.rows() );
11357 const size_t N( B.columns() );
11359 for(
size_t j=0UL; j<N; ++j )
// Row range [ibegin,iend) of column j. It is narrowed when B is lower or upper
// triangular; a strictly triangular B additionally excludes the diagonal element.
// NOTE(review): the fallback branches of both conditionals are not visible here.
11361 const size_t ibegin( ( IsLower_v<MT5> )
11362 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
11364 const size_t iend( ( IsUpper_v<MT5> )
11365 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part of the range that is processed two rows at a time.
11369 const size_t inum( iend - ibegin );
11370 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Main loop: two rows per iteration.
11373 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
11374 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
11375 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// One row is left over when the range has an odd number of rows.
11377 if( ipos < iend ) {
11378 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction-assignment kernel for two diagonal operands.
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) * scalar for
// every row index i of A.
11398 template<
typename MT3
11402 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11403 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
11405 for(
size_t i=0UL; i<A.rows(); ++i ) {
11406 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction-assignment kernel, fallback overload.
// Selected (via DisableIf_t) when UseVectorizedDefaultKernel_v is false for the
// operand types; it forwards all arguments to selectDefaultSubAssignKernel.
11425 template<
typename MT3
11429 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11430 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
11432 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small-matrix subtraction-assignment kernel, vectorized overload for a row-major
// target. Enabled when MT3 is row-major and UseVectorizedDefaultKernel_v holds.
//
// Every update has the form C(i,j) -= <sum over k of A(i,k)*B(k,j)> * scalar.
// C is processed in register tiles: 3 rows (column tiles of 3, 2 and 1), then
// 2 rows (column tiles of 4, 2 and 1), then single rows (column tiles of 4, 2 and 1).
// For each tile there is
//   - a SIMD path that accumulates A.load(i,k) * B.load(k,j) products in SIMD
//     registers, reduces them with sum() and subtracts the scaled result,
//     followed by a scalar tail loop that only runs when `remainder` is set;
//   - an alternative purely scalar path (`else if( k < kend )`).
// With LOW set the column range of a row tile ends at the last row of the tile;
// with UPP set it starts at column i.
// NOTE(review): several lines of this kernel (declaration of i, the kpos bound,
// the SIMD loop headers, some column-loop headers and all braces) are not
// visible in this excerpt.
11451 template<
typename MT3
11455 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
11456 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar tail loop is needed unless both operand types are padded.
11458 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
11460 const size_t M( A.rows() );
11461 const size_t N( B.columns() );
11462 const size_t K( A.columns() );
// ---- Row tiles of height 3 (skipped entirely when both LOW and UPP are set) ----
11468 for( ; !( LOW && UPP ) && (i+3UL) <= M; i+=3UL )
11470 const size_t jend( LOW ? i+3UL : N );
11471 size_t j( UPP ? i : 0UL );
// 3x3 tile: nine accumulators xmm1..xmm9 / value1..value9.
11473 for( ; (j+3UL) <= jend; j+=3UL )
11475 const size_t kbegin( ( IsUpper_v<MT4> )
// Upper bound of k: limited by the tile's last row for a lower A and by the
// tile's last column for an upper B, otherwise K.
11478 const size_t kend( ( IsLower_v<MT4> )
11479 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
11480 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
11485 size_t k( kbegin );
11489 SIMDType a1( A.load(i ,k) );
11490 SIMDType a2( A.load(i+1UL,k) );
11491 SIMDType a3( A.load(i+2UL,k) );
11492 SIMDType b1( B.load(k,j ) );
11493 SIMDType b2( B.load(k,j+1UL) );
11494 SIMDType b3( B.load(k,j+2UL) );
11495 SIMDType xmm1( a1 * b1 );
11496 SIMDType xmm2( a1 * b2 );
11497 SIMDType xmm3( a1 * b3 );
11498 SIMDType xmm4( a2 * b1 );
11499 SIMDType xmm5( a2 * b2 );
11500 SIMDType xmm6( a2 * b3 );
11501 SIMDType xmm7( a3 * b1 );
11502 SIMDType xmm8( a3 * b2 );
11503 SIMDType xmm9( a3 * b3 );
11507 a2 = A.load(i+1UL,k);
11508 a3 = A.load(i+2UL,k);
11510 b2 = B.load(k,j+1UL);
11511 b3 = B.load(k,j+2UL);
// Horizontal reduction of each accumulator, scaled and subtracted from C.
11523 C(i ,j ) -=
sum( xmm1 ) * scalar;
11524 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
11525 C(i ,j+2UL) -=
sum( xmm3 ) * scalar;
11526 C(i+1UL,j ) -=
sum( xmm4 ) * scalar;
11527 C(i+1UL,j+1UL) -=
sum( xmm5 ) * scalar;
11528 C(i+1UL,j+2UL) -=
sum( xmm6 ) * scalar;
11529 C(i+2UL,j ) -=
sum( xmm7 ) * scalar;
11530 C(i+2UL,j+1UL) -=
sum( xmm8 ) * scalar;
11531 C(i+2UL,j+2UL) -=
sum( xmm9 ) * scalar;
// Scalar tail for the k values not covered by the SIMD loop.
11533 for( ; remainder && k<kend; ++k ) {
11534 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11535 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11536 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
11537 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11538 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11539 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
11540 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
11541 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
11542 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL) * scalar;
// Purely scalar path: accumulate first, scale once at the end.
11545 else if( k < kend )
11557 for( ++k; k<kend; ++k ) {
11558 value1 += A(i ,k) * B(k,j );
11559 value2 += A(i ,k) * B(k,j+1UL);
11560 value3 += A(i ,k) * B(k,j+2UL);
11561 value4 += A(i+1UL,k) * B(k,j );
11562 value5 += A(i+1UL,k) * B(k,j+1UL);
11563 value6 += A(i+1UL,k) * B(k,j+2UL);
11564 value7 += A(i+2UL,k) * B(k,j );
11565 value8 += A(i+2UL,k) * B(k,j+1UL);
11566 value9 += A(i+2UL,k) * B(k,j+2UL);
11569 C(i ,j ) -= value1 * scalar;
11570 C(i ,j+1UL) -= value2 * scalar;
11571 C(i ,j+2UL) -= value3 * scalar;
11572 C(i+1UL,j ) -= value4 * scalar;
11573 C(i+1UL,j+1UL) -= value5 * scalar;
11574 C(i+1UL,j+2UL) -= value6 * scalar;
11575 C(i+2UL,j ) -= value7 * scalar;
11576 C(i+2UL,j+1UL) -= value8 * scalar;
11577 C(i+2UL,j+2UL) -= value9 * scalar;
// 3x2 tile for the remaining column pairs.
11581 for( ; (j+2UL) <= jend; j+=2UL )
11583 const size_t kbegin( ( IsUpper_v<MT4> )
11586 const size_t kend( ( IsLower_v<MT4> )
11587 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
11588 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
11593 size_t k( kbegin );
11597 SIMDType a1( A.load(i ,k) );
11598 SIMDType a2( A.load(i+1UL,k) );
11599 SIMDType a3( A.load(i+2UL,k) );
11600 SIMDType b1( B.load(k,j ) );
11601 SIMDType b2( B.load(k,j+1UL) );
11602 SIMDType xmm1( a1 * b1 );
11603 SIMDType xmm2( a1 * b2 );
11604 SIMDType xmm3( a2 * b1 );
11605 SIMDType xmm4( a2 * b2 );
11606 SIMDType xmm5( a3 * b1 );
11607 SIMDType xmm6( a3 * b2 );
11611 a2 = A.load(i+1UL,k);
11612 a3 = A.load(i+2UL,k);
11614 b2 = B.load(k,j+1UL);
11623 C(i ,j ) -=
sum( xmm1 ) * scalar;
11624 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
11625 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
11626 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
11627 C(i+2UL,j ) -=
sum( xmm5 ) * scalar;
11628 C(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
11630 for( ; remainder && k<kend; ++k ) {
11631 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11632 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11633 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11634 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11635 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
11636 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
11639 else if( k < kend )
11648 for( ++k; k<kend; ++k ) {
11649 value1 += A(i ,k) * B(k,j );
11650 value2 += A(i ,k) * B(k,j+1UL);
11651 value3 += A(i+1UL,k) * B(k,j );
11652 value4 += A(i+1UL,k) * B(k,j+1UL);
11653 value5 += A(i+2UL,k) * B(k,j );
11654 value6 += A(i+2UL,k) * B(k,j+1UL);
11657 C(i ,j ) -= value1 * scalar;
11658 C(i ,j+1UL) -= value2 * scalar;
11659 C(i+1UL,j ) -= value3 * scalar;
11660 C(i+1UL,j+1UL) -= value4 * scalar;
11661 C(i+2UL,j ) -= value5 * scalar;
11662 C(i+2UL,j+1UL) -= value6 * scalar;
// 3x1 tile for a single leftover column (its loop/branch header is not visible here).
11668 const size_t kbegin( ( IsUpper_v<MT4> )
11671 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
11676 size_t k( kbegin );
11680 SIMDType b1( B.load(k,j) );
11681 SIMDType xmm1( A.load(i ,k) * b1 );
11682 SIMDType xmm2( A.load(i+1UL,k) * b1 );
11683 SIMDType xmm3( A.load(i+2UL,k) * b1 );
11687 xmm1 += A.load(i ,k) * b1;
11688 xmm2 += A.load(i+1UL,k) * b1;
11689 xmm3 += A.load(i+2UL,k) * b1;
11692 C(i ,j) -=
sum( xmm1 ) * scalar;
11693 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
11694 C(i+2UL,j) -=
sum( xmm3 ) * scalar;
11696 for( ; remainder && k<kend; ++k ) {
11697 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
11698 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
11699 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
11702 else if( k < kend )
11708 for( ++k; k<kend; ++k ) {
11709 value1 += A(i ,k) * B(k,j);
11710 value2 += A(i+1UL,k) * B(k,j);
11711 value3 += A(i+2UL,k) * B(k,j);
11714 C(i ,j) -= value1 * scalar;
11715 C(i+1UL,j) -= value2 * scalar;
11716 C(i+2UL,j) -= value3 * scalar;
// ---- Row tiles of height 2 (skipped entirely when both LOW and UPP are set) ----
11721 for( ; !( LOW && UPP ) && (i+2UL) <= M; i+=2UL )
11723 const size_t jend( LOW ? i+2UL : N );
11724 size_t j( UPP ? i : 0UL );
// 2x4 tile: eight accumulators.
11726 for( ; (j+4UL) <= jend; j+=4UL )
11728 const size_t kbegin( ( IsUpper_v<MT4> )
11731 const size_t kend( ( IsLower_v<MT4> )
11732 ?( IsUpper_v<MT5> ?
min( i+2UL, j+4UL ) : ( i+2UL ) )
11733 :( IsUpper_v<MT5> ? ( j+4UL ) : K ) );
11738 size_t k( kbegin );
11742 SIMDType a1( A.load(i ,k) );
11743 SIMDType a2( A.load(i+1UL,k) );
11744 SIMDType b1( B.load(k,j ) );
11745 SIMDType b2( B.load(k,j+1UL) );
11746 SIMDType b3( B.load(k,j+2UL) );
11747 SIMDType b4( B.load(k,j+3UL) );
11748 SIMDType xmm1( a1 * b1 );
11749 SIMDType xmm2( a1 * b2 );
11750 SIMDType xmm3( a1 * b3 );
11751 SIMDType xmm4( a1 * b4 );
11752 SIMDType xmm5( a2 * b1 );
11753 SIMDType xmm6( a2 * b2 );
11754 SIMDType xmm7( a2 * b3 );
11755 SIMDType xmm8( a2 * b4 );
11759 a2 = A.load(i+1UL,k);
11761 b2 = B.load(k,j+1UL);
11762 b3 = B.load(k,j+2UL);
11763 b4 = B.load(k,j+3UL);
11774 C(i ,j ) -=
sum( xmm1 ) * scalar;
11775 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
11776 C(i ,j+2UL) -=
sum( xmm3 ) * scalar;
11777 C(i ,j+3UL) -=
sum( xmm4 ) * scalar;
11778 C(i+1UL,j ) -=
sum( xmm5 ) * scalar;
11779 C(i+1UL,j+1UL) -=
sum( xmm6 ) * scalar;
11780 C(i+1UL,j+2UL) -=
sum( xmm7 ) * scalar;
11781 C(i+1UL,j+3UL) -=
sum( xmm8 ) * scalar;
11783 for( ; remainder && k<kend; ++k ) {
11784 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11785 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11786 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
11787 C(i ,j+3UL) -= A(i ,k) * B(k,j+3UL) * scalar;
11788 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11789 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11790 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
11791 C(i+1UL,j+3UL) -= A(i+1UL,k) * B(k,j+3UL) * scalar;
11794 else if( k < kend )
11805 for( ++k; k<kend; ++k ) {
11806 value1 += A(i ,k) * B(k,j );
11807 value2 += A(i ,k) * B(k,j+1UL);
11808 value3 += A(i ,k) * B(k,j+2UL);
11809 value4 += A(i ,k) * B(k,j+3UL);
11810 value5 += A(i+1UL,k) * B(k,j );
11811 value6 += A(i+1UL,k) * B(k,j+1UL);
11812 value7 += A(i+1UL,k) * B(k,j+2UL);
11813 value8 += A(i+1UL,k) * B(k,j+3UL);
11816 C(i ,j ) -= value1 * scalar;
11817 C(i ,j+1UL) -= value2 * scalar;
11818 C(i ,j+2UL) -= value3 * scalar;
11819 C(i ,j+3UL) -= value4 * scalar;
11820 C(i+1UL,j ) -= value5 * scalar;
11821 C(i+1UL,j+1UL) -= value6 * scalar;
11822 C(i+1UL,j+2UL) -= value7 * scalar;
11823 C(i+1UL,j+3UL) -= value8 * scalar;
// 2x2 tile for the remaining column pairs.
11827 for( ; (j+2UL) <= jend; j+=2UL )
11829 const size_t kbegin( ( IsUpper_v<MT4> )
11832 const size_t kend( ( IsLower_v<MT4> )
11833 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
11834 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
11839 size_t k( kbegin );
11843 SIMDType a1( A.load(i ,k) );
11844 SIMDType a2( A.load(i+1UL,k) );
11845 SIMDType b1( B.load(k,j ) );
11846 SIMDType b2( B.load(k,j+1UL) );
11847 SIMDType xmm1( a1 * b1 );
11848 SIMDType xmm2( a1 * b2 );
11849 SIMDType xmm3( a2 * b1 );
11850 SIMDType xmm4( a2 * b2 );
11854 a2 = A.load(i+1UL,k);
11856 b2 = B.load(k,j+1UL);
11863 C(i ,j ) -=
sum( xmm1 ) * scalar;
11864 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
11865 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
11866 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
11868 for( ; remainder && k<kend; ++k ) {
11869 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
11870 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
11871 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
11872 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
11875 else if( k < kend )
11882 for( ++k; k<kend; ++k ) {
11883 value1 += A(i ,k) * B(k,j );
11884 value2 += A(i ,k) * B(k,j+1UL);
11885 value3 += A(i+1UL,k) * B(k,j );
11886 value4 += A(i+1UL,k) * B(k,j+1UL);
11889 C(i ,j ) -= value1 * scalar;
11890 C(i ,j+1UL) -= value2 * scalar;
11891 C(i+1UL,j ) -= value3 * scalar;
11892 C(i+1UL,j+1UL) -= value4 * scalar;
// 2x1 tile for a single leftover column (its loop/branch header is not visible here).
11898 const size_t kbegin( ( IsUpper_v<MT4> )
11901 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
11906 size_t k( kbegin );
11910 SIMDType b1( B.load(k,j) );
11911 SIMDType xmm1( A.load(i ,k) * b1 );
11912 SIMDType xmm2( A.load(i+1UL,k) * b1 );
11916 xmm1 += A.load(i ,k) * b1;
11917 xmm2 += A.load(i+1UL,k) * b1;
11920 C(i ,j) -=
sum( xmm1 ) * scalar;
11921 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
11923 for( ; remainder && k<kend; ++k ) {
11924 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
11925 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
11928 else if( k < kend )
11933 for( ++k; k<kend; ++k ) {
11934 value1 += A(i ,k) * B(k,j);
11935 value2 += A(i+1UL,k) * B(k,j);
11938 C(i ,j) -= value1 * scalar;
11939 C(i+1UL,j) -= value2 * scalar;
// ---- Single rows (the enclosing row loop/branch header is not visible here) ----
11946 const size_t jend( LOW ? i+1UL : N );
11947 size_t j( UPP ? i : 0UL );
// 1x4 tile (skipped when both LOW and UPP are set).
11949 for( ; !( LOW && UPP ) && (j+4UL) <= jend; j+=4UL )
11951 const size_t kbegin( ( IsUpper_v<MT4> )
11954 const size_t kend( ( IsUpper_v<MT5> )?( j+4UL ):( K ) );
11959 size_t k( kbegin );
11963 SIMDType a1( A.load(i,k) );
11964 SIMDType xmm1( a1 * B.load(k,j ) );
11965 SIMDType xmm2( a1 * B.load(k,j+1UL) );
11966 SIMDType xmm3( a1 * B.load(k,j+2UL) );
11967 SIMDType xmm4( a1 * B.load(k,j+3UL) );
11971 xmm1 += a1 * B.load(k,j );
11972 xmm2 += a1 * B.load(k,j+1UL);
11973 xmm3 += a1 * B.load(k,j+2UL);
11974 xmm4 += a1 * B.load(k,j+3UL);
11977 C(i,j ) -=
sum( xmm1 ) * scalar;
11978 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
11979 C(i,j+2UL) -=
sum( xmm3 ) * scalar;
11980 C(i,j+3UL) -=
sum( xmm4 ) * scalar;
11982 for( ; remainder && k<kend; ++k ) {
11983 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
11984 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
11985 C(i,j+2UL) -= A(i,k) * B(k,j+2UL) * scalar;
11986 C(i,j+3UL) -= A(i,k) * B(k,j+3UL) * scalar;
11989 else if( k < kend )
11996 for( ++k; k<kend; ++k ) {
11997 value1 += A(i,k) * B(k,j );
11998 value2 += A(i,k) * B(k,j+1UL);
11999 value3 += A(i,k) * B(k,j+2UL);
12000 value4 += A(i,k) * B(k,j+3UL);
12003 C(i,j ) -= value1 * scalar;
12004 C(i,j+1UL) -= value2 * scalar;
12005 C(i,j+2UL) -= value3 * scalar;
12006 C(i,j+3UL) -= value4 * scalar;
// 1x2 tile for the remaining column pairs.
12010 for( ; (j+2UL) <= jend; j+=2UL )
12012 const size_t kbegin( ( IsUpper_v<MT4> )
12015 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
12020 size_t k( kbegin );
12024 SIMDType a1( A.load(i,k) );
12025 SIMDType xmm1( a1 * B.load(k,j ) );
12026 SIMDType xmm2( a1 * B.load(k,j+1UL) );
12030 xmm1 += a1 * B.load(k,j );
12031 xmm2 += a1 * B.load(k,j+1UL);
12034 C(i,j ) -=
sum( xmm1 ) * scalar;
12035 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
12037 for( ; remainder && k<kend; ++k ) {
12038 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
12039 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
12042 else if( k < kend )
12047 for( ++k; k<kend; ++k ) {
12048 value1 += A(i,k) * B(k,j );
12049 value2 += A(i,k) * B(k,j+1UL);
12052 C(i,j ) -= value1 * scalar;
12053 C(i,j+1UL) -= value2 * scalar;
// Single element C(i,j); here the scalar loops run up to K rather than a kend bound.
12059 const size_t kbegin( ( IsUpper_v<MT4> )
12066 size_t k( kbegin );
12070 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
12073 xmm1 += A.load(i,k) * B.load(k,j);
12076 C(i,j) -=
sum( xmm1 ) * scalar;
12078 for( ; remainder && k<K; ++k ) {
12079 C(i,j) -= A(i,k) * B(k,j) * scalar;
12086 for( ++k; k<K; ++k ) {
12087 value += A(i,k) * B(k,j);
12090 C(i,j) -= value * scalar;
// Small-matrix subtraction-assignment kernel, vectorized overload for a
// column-major target. Enabled when MT3 is column-major and
// UseVectorizedDefaultKernel_v holds.
//
// Every update has the form C(i,j) -= <sum over k of A(i,k)*B(k,j)> * scalar.
// C is processed in register tiles: 4 rows (column tiles of 2 and 1) and 3 rows
// (column tiles of 3, 2 and 1) - both only when neither LOW nor UPP is set -
// then 2 rows (column tiles of 2 and 1) and finally single rows (column tiles
// of 2 and 1). For each tile there is
//   - a SIMD path that accumulates A.load(i,k) * B.load(k,j) products in SIMD
//     registers, reduces them with sum() and subtracts the scaled result,
//     followed by a scalar tail loop that only runs when `remainder` is set;
//   - an alternative purely scalar path (`else if( k < kend )`).
// NOTE(review): several lines of this kernel (declarations of i and j, the kpos
// bound, the SIMD loop headers, some column-loop headers and all braces) are not
// visible in this excerpt.
12112 template<
typename MT3
12116 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12117 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar tail loop is needed unless both operand types are padded.
12119 constexpr bool remainder( !IsPadded_v<MT4> || !IsPadded_v<MT5> );
12121 const size_t M( A.rows() );
12122 const size_t N( B.columns() );
12123 const size_t K( A.columns() );
// ---- Row tiles of height 4 (only when neither LOW nor UPP is set) ----
12129 for( ; !LOW && !UPP && (i+4UL) <= M; i+=4UL )
// 4x2 tile: eight accumulators; columns run over the full range up to N.
12133 for( ; (j+2UL) <= N; j+=2UL )
12135 const size_t kbegin( ( IsUpper_v<MT4> )
// Upper bound of k: limited by the tile's last row for a lower A and by the
// tile's last column for an upper B, otherwise K.
12138 const size_t kend( ( IsLower_v<MT4> )
12139 ?( IsUpper_v<MT5> ?
min( i+4UL, j+2UL ) : ( i+4UL ) )
12140 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
12145 size_t k( kbegin );
12149 SIMDType a1( A.load(i ,k) );
12150 SIMDType a2( A.load(i+1UL,k) );
12151 SIMDType a3( A.load(i+2UL,k) );
12152 SIMDType a4( A.load(i+3UL,k) );
12153 SIMDType b1( B.load(k,j ) );
12154 SIMDType b2( B.load(k,j+1UL) );
12155 SIMDType xmm1( a1 * b1 );
12156 SIMDType xmm2( a1 * b2 );
12157 SIMDType xmm3( a2 * b1 );
12158 SIMDType xmm4( a2 * b2 );
12159 SIMDType xmm5( a3 * b1 );
12160 SIMDType xmm6( a3 * b2 );
12161 SIMDType xmm7( a4 * b1 );
12162 SIMDType xmm8( a4 * b2 );
12167 a2 = A.load(i+1UL,k);
12168 a3 = A.load(i+2UL,k);
12169 a4 = A.load(i+3UL,k);
12171 b2 = B.load(k,j+1UL);
// Horizontal reduction of each accumulator, scaled and subtracted from C.
12182 C(i ,j ) -=
sum( xmm1 ) * scalar;
12183 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
12184 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
12185 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
12186 C(i+2UL,j ) -=
sum( xmm5 ) * scalar;
12187 C(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
12188 C(i+3UL,j ) -=
sum( xmm7 ) * scalar;
12189 C(i+3UL,j+1UL) -=
sum( xmm8 ) * scalar;
// Scalar tail for the k values not covered by the SIMD loop.
12191 for( ; remainder && k<kend; ++k ) {
12192 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12193 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12194 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12195 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12196 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
12197 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
12198 C(i+3UL,j ) -= A(i+3UL,k) * B(k,j ) * scalar;
12199 C(i+3UL,j+1UL) -= A(i+3UL,k) * B(k,j+1UL) * scalar;
// Purely scalar path: accumulate first, scale once at the end.
12202 else if( k < kend )
12213 for( ++k; k<kend; ++k ) {
12214 value1 += A(i ,k) * B(k,j );
12215 value2 += A(i ,k) * B(k,j+1UL);
12216 value3 += A(i+1UL,k) * B(k,j );
12217 value4 += A(i+1UL,k) * B(k,j+1UL);
12218 value5 += A(i+2UL,k) * B(k,j );
12219 value6 += A(i+2UL,k) * B(k,j+1UL);
12220 value7 += A(i+3UL,k) * B(k,j );
12221 value8 += A(i+3UL,k) * B(k,j+1UL);
12224 C(i ,j ) -= value1 * scalar;
12225 C(i ,j+1UL) -= value2 * scalar;
12226 C(i+1UL,j ) -= value3 * scalar;
12227 C(i+1UL,j+1UL) -= value4 * scalar;
12228 C(i+2UL,j ) -= value5 * scalar;
12229 C(i+2UL,j+1UL) -= value6 * scalar;
12230 C(i+3UL,j ) -= value7 * scalar;
12231 C(i+3UL,j+1UL) -= value8 * scalar;
// 4x1 tile for a single leftover column (its loop/branch header is not visible here).
12237 const size_t kbegin( ( IsUpper_v<MT4> )
12240 const size_t kend( ( IsLower_v<MT4> )?( i+4UL ):( K ) );
12245 size_t k( kbegin );
12249 SIMDType b1( B.load(k,j) );
12250 SIMDType xmm1( A.load(i ,k) * b1 );
12251 SIMDType xmm2( A.load(i+1UL,k) * b1 );
12252 SIMDType xmm3( A.load(i+2UL,k) * b1 );
12253 SIMDType xmm4( A.load(i+3UL,k) * b1 );
12257 xmm1 += A.load(i ,k) * b1;
12258 xmm2 += A.load(i+1UL,k) * b1;
12259 xmm3 += A.load(i+2UL,k) * b1;
12260 xmm4 += A.load(i+3UL,k) * b1;
12263 C(i ,j) -=
sum( xmm1 ) * scalar;
12264 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
12265 C(i+2UL,j) -=
sum( xmm3 ) * scalar;
12266 C(i+3UL,j) -=
sum( xmm4 ) * scalar;
12268 for( ; remainder && k<kend; ++k ) {
12269 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
12270 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
12271 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
12272 C(i+3UL,j) -= A(i+3UL,k) * B(k,j) * scalar;
12275 else if( k < kend )
12282 for( ++k; k<kend; ++k ) {
12283 value1 += A(i ,k) * B(k,j);
12284 value2 += A(i+1UL,k) * B(k,j);
12285 value3 += A(i+2UL,k) * B(k,j);
12286 value4 += A(i+3UL,k) * B(k,j);
12289 C(i ,j) -= value1 * scalar;
12290 C(i+1UL,j) -= value2 * scalar;
12291 C(i+2UL,j) -= value3 * scalar;
12292 C(i+3UL,j) -= value4 * scalar;
// ---- Row tiles of height 3 (only when neither LOW nor UPP is set) ----
12297 for( ; !LOW && !UPP && (i+3UL) <= M; i+=3UL )
// 3x3 tile: nine accumulators.
12301 for( ; (j+3UL) <= N; j+=3UL )
12303 const size_t kbegin( ( IsUpper_v<MT4> )
12306 const size_t kend( ( IsLower_v<MT4> )
12307 ?( IsUpper_v<MT5> ?
min( i+3UL, j+3UL ) : ( i+3UL ) )
12308 :( IsUpper_v<MT5> ? ( j+3UL ) : K ) );
12313 size_t k( kbegin );
12317 SIMDType a1( A.load(i ,k) );
12318 SIMDType a2( A.load(i+1UL,k) );
12319 SIMDType a3( A.load(i+2UL,k) );
12320 SIMDType b1( B.load(k,j ) );
12321 SIMDType b2( B.load(k,j+1UL) );
12322 SIMDType b3( B.load(k,j+2UL) );
12323 SIMDType xmm1( a1 * b1 );
12324 SIMDType xmm2( a1 * b2 );
12325 SIMDType xmm3( a1 * b3 );
12326 SIMDType xmm4( a2 * b1 );
12327 SIMDType xmm5( a2 * b2 );
12328 SIMDType xmm6( a2 * b3 );
12329 SIMDType xmm7( a3 * b1 );
12330 SIMDType xmm8( a3 * b2 );
12331 SIMDType xmm9( a3 * b3 );
12336 a2 = A.load(i+1UL,k);
12337 a3 = A.load(i+2UL,k);
12339 b2 = B.load(k,j+1UL);
12340 b3 = B.load(k,j+2UL);
12352 C(i ,j ) -=
sum( xmm1 ) * scalar;
12353 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
12354 C(i ,j+2UL) -=
sum( xmm3 ) * scalar;
12355 C(i+1UL,j ) -=
sum( xmm4 ) * scalar;
12356 C(i+1UL,j+1UL) -=
sum( xmm5 ) * scalar;
12357 C(i+1UL,j+2UL) -=
sum( xmm6 ) * scalar;
12358 C(i+2UL,j ) -=
sum( xmm7 ) * scalar;
12359 C(i+2UL,j+1UL) -=
sum( xmm8 ) * scalar;
12360 C(i+2UL,j+2UL) -=
sum( xmm9 ) * scalar;
12362 for( ; remainder && k<kend; ++k ) {
12363 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12364 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12365 C(i ,j+2UL) -= A(i ,k) * B(k,j+2UL) * scalar;
12366 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12367 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12368 C(i+1UL,j+2UL) -= A(i+1UL,k) * B(k,j+2UL) * scalar;
12369 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
12370 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
12371 C(i+2UL,j+2UL) -= A(i+2UL,k) * B(k,j+2UL) * scalar;
12374 else if( k < kend )
12386 for( ++k; k<kend; ++k ) {
12387 value1 += A(i ,k) * B(k,j );
12388 value2 += A(i ,k) * B(k,j+1UL);
12389 value3 += A(i ,k) * B(k,j+2UL);
12390 value4 += A(i+1UL,k) * B(k,j );
12391 value5 += A(i+1UL,k) * B(k,j+1UL);
12392 value6 += A(i+1UL,k) * B(k,j+2UL);
12393 value7 += A(i+2UL,k) * B(k,j );
12394 value8 += A(i+2UL,k) * B(k,j+1UL);
12395 value9 += A(i+2UL,k) * B(k,j+2UL);
12398 C(i ,j ) -= value1 * scalar;
12399 C(i ,j+1UL) -= value2 * scalar;
12400 C(i ,j+2UL) -= value3 * scalar;
12401 C(i+1UL,j ) -= value4 * scalar;
12402 C(i+1UL,j+1UL) -= value5 * scalar;
12403 C(i+1UL,j+2UL) -= value6 * scalar;
12404 C(i+2UL,j ) -= value7 * scalar;
12405 C(i+2UL,j+1UL) -= value8 * scalar;
12406 C(i+2UL,j+2UL) -= value9 * scalar;
// 3x2 tile for the remaining column pairs.
12410 for( ; (j+2UL) <= N; j+=2UL )
12412 const size_t kbegin( ( IsUpper_v<MT4> )
12415 const size_t kend( ( IsLower_v<MT4> )
12416 ?( IsUpper_v<MT5> ?
min( i+3UL, j+2UL ) : ( i+3UL ) )
12417 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
12422 size_t k( kbegin );
12426 SIMDType a1( A.load(i ,k) );
12427 SIMDType a2( A.load(i+1UL,k) );
12428 SIMDType a3( A.load(i+2UL,k) );
12429 SIMDType b1( B.load(k,j ) );
12430 SIMDType b2( B.load(k,j+1UL) );
12431 SIMDType xmm1( a1 * b1 );
12432 SIMDType xmm2( a1 * b2 );
12433 SIMDType xmm3( a2 * b1 );
12434 SIMDType xmm4( a2 * b2 );
12435 SIMDType xmm5( a3 * b1 );
12436 SIMDType xmm6( a3 * b2 );
12441 a2 = A.load(i+1UL,k);
12442 a3 = A.load(i+2UL,k);
12444 b2 = B.load(k,j+1UL);
12453 C(i ,j ) -=
sum( xmm1 ) * scalar;
12454 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
12455 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
12456 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
12457 C(i+2UL,j ) -=
sum( xmm5 ) * scalar;
12458 C(i+2UL,j+1UL) -=
sum( xmm6 ) * scalar;
12460 for( ; remainder && k<kend; ++k ) {
12461 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12462 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12463 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12464 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12465 C(i+2UL,j ) -= A(i+2UL,k) * B(k,j ) * scalar;
12466 C(i+2UL,j+1UL) -= A(i+2UL,k) * B(k,j+1UL) * scalar;
12469 else if( k < kend )
12478 for( ++k; k<kend; ++k ) {
12479 value1 += A(i ,k) * B(k,j );
12480 value2 += A(i ,k) * B(k,j+1UL);
12481 value3 += A(i+1UL,k) * B(k,j );
12482 value4 += A(i+1UL,k) * B(k,j+1UL);
12483 value5 += A(i+2UL,k) * B(k,j );
12484 value6 += A(i+2UL,k) * B(k,j+1UL);
12487 C(i ,j ) -= value1 * scalar;
12488 C(i ,j+1UL) -= value2 * scalar;
12489 C(i+1UL,j ) -= value3 * scalar;
12490 C(i+1UL,j+1UL) -= value4 * scalar;
12491 C(i+2UL,j ) -= value5 * scalar;
12492 C(i+2UL,j+1UL) -= value6 * scalar;
// 3x1 tile for a single leftover column (its loop/branch header is not visible here).
12498 const size_t kbegin( ( IsUpper_v<MT4> )
12501 const size_t kend( ( IsLower_v<MT4> )?( i+3UL ):( K ) );
12506 size_t k( kbegin );
12510 SIMDType b1( B.load(k,j) );
12511 SIMDType xmm1( A.load(i ,k) * b1 );
12512 SIMDType xmm2( A.load(i+1UL,k) * b1 );
12513 SIMDType xmm3( A.load(i+2UL,k) * b1 );
12517 xmm1 += A.load(i ,k) * b1;
12518 xmm2 += A.load(i+1UL,k) * b1;
12519 xmm3 += A.load(i+2UL,k) * b1;
12522 C(i ,j) -=
sum( xmm1 ) * scalar;
12523 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
12524 C(i+2UL,j) -=
sum( xmm3 ) * scalar;
12526 for( ; remainder && k<kend; ++k ) {
12527 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
12528 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
12529 C(i+2UL,j) -= A(i+2UL,k) * B(k,j) * scalar;
12532 else if( k < kend )
12538 for( ++k; k<kend; ++k ) {
12539 value1 += A(i ,k) * B(k,j);
12540 value2 += A(i+1UL,k) * B(k,j);
12541 value3 += A(i+2UL,k) * B(k,j);
12544 C(i ,j) -= value1 * scalar;
12545 C(i+1UL,j) -= value2 * scalar;
12546 C(i+2UL,j) -= value3 * scalar;
// ---- Row tiles of height 2; from here on LOW/UPP restrict the column range ----
12551 for( ; (i+2UL) <= M; i+=2UL )
12553 const size_t jend( LOW ? i+2UL : N );
12554 size_t j( UPP ? i : 0UL );
// 2x2 tile: four accumulators.
12556 for( ; (j+2UL) <= jend; j+=2UL )
12558 const size_t kbegin( ( IsUpper_v<MT4> )
12561 const size_t kend( ( IsLower_v<MT4> )
12562 ?( IsUpper_v<MT5> ?
min( i+2UL, j+2UL ) : ( i+2UL ) )
12563 :( IsUpper_v<MT5> ? ( j+2UL ) : K ) );
12568 size_t k( kbegin );
12572 SIMDType a1( A.load(i ,k) );
12573 SIMDType a2( A.load(i+1UL,k) );
12574 SIMDType b1( B.load(k,j ) );
12575 SIMDType b2( B.load(k,j+1UL) );
12576 SIMDType xmm1( a1 * b1 );
12577 SIMDType xmm2( a1 * b2 );
12578 SIMDType xmm3( a2 * b1 );
12579 SIMDType xmm4( a2 * b2 );
12583 a2 = A.load(i+1UL,k);
12585 b2 = B.load(k,j+1UL);
12592 C(i ,j ) -=
sum( xmm1 ) * scalar;
12593 C(i ,j+1UL) -=
sum( xmm2 ) * scalar;
12594 C(i+1UL,j ) -=
sum( xmm3 ) * scalar;
12595 C(i+1UL,j+1UL) -=
sum( xmm4 ) * scalar;
12597 for( ; remainder && k<kend; ++k ) {
12598 C(i ,j ) -= A(i ,k) * B(k,j ) * scalar;
12599 C(i ,j+1UL) -= A(i ,k) * B(k,j+1UL) * scalar;
12600 C(i+1UL,j ) -= A(i+1UL,k) * B(k,j ) * scalar;
12601 C(i+1UL,j+1UL) -= A(i+1UL,k) * B(k,j+1UL) * scalar;
12604 else if( k < kend )
12611 for( ++k; k<kend; ++k ) {
12612 value1 += A(i ,k) * B(k,j );
12613 value2 += A(i ,k) * B(k,j+1UL);
12614 value3 += A(i+1UL,k) * B(k,j );
12615 value4 += A(i+1UL,k) * B(k,j+1UL);
12618 C(i ,j ) -= value1 * scalar;
12619 C(i ,j+1UL) -= value2 * scalar;
12620 C(i+1UL,j ) -= value3 * scalar;
12621 C(i+1UL,j+1UL) -= value4 * scalar;
// 2x1 tile for a single leftover column (its loop/branch header is not visible here).
12627 const size_t kbegin( ( IsUpper_v<MT4> )
12630 const size_t kend( ( IsLower_v<MT4> )?( i+2UL ):( K ) );
12635 size_t k( kbegin );
12639 SIMDType b1( B.load(k,j) );
12640 SIMDType xmm1( A.load(i ,k) * b1 );
12641 SIMDType xmm2( A.load(i+1UL,k) * b1 );
12645 xmm1 += A.load(i ,k) * b1;
12646 xmm2 += A.load(i+1UL,k) * b1;
12649 C(i ,j) -=
sum( xmm1 ) * scalar;
12650 C(i+1UL,j) -=
sum( xmm2 ) * scalar;
12652 for( ; remainder && k<kend; ++k ) {
12653 C(i ,j) -= A(i ,k) * B(k,j) * scalar;
12654 C(i+1UL,j) -= A(i+1UL,k) * B(k,j) * scalar;
12657 else if( k < kend )
12662 for( ++k; k<kend; ++k ) {
12663 value1 += A(i ,k) * B(k,j);
12664 value2 += A(i+1UL,k) * B(k,j);
12667 C(i ,j) -= value1 * scalar;
12668 C(i+1UL,j) -= value2 * scalar;
// ---- Single rows (the enclosing row loop/branch header is not visible here) ----
12675 const size_t jend( LOW ? i+1UL : N );
12676 size_t j( UPP ? i : 0UL );
// 1x2 tile.
12678 for( ; (j+2UL) <= jend; j+=2UL )
12680 const size_t kbegin( ( IsUpper_v<MT4> )
12683 const size_t kend( ( IsUpper_v<MT5> )?( j+2UL ):( K ) );
12688 size_t k( kbegin );
12692 SIMDType a1( A.load(i,k) );
12693 SIMDType xmm1( a1 * B.load(k,j ) );
12694 SIMDType xmm2( a1 * B.load(k,j+1UL) );
12698 xmm1 += a1 * B.load(k,j );
12699 xmm2 += a1 * B.load(k,j+1UL);
12702 C(i,j ) -=
sum( xmm1 ) * scalar;
12703 C(i,j+1UL) -=
sum( xmm2 ) * scalar;
12705 for( ; remainder && k<kend; ++k ) {
12706 C(i,j ) -= A(i,k) * B(k,j ) * scalar;
12707 C(i,j+1UL) -= A(i,k) * B(k,j+1UL) * scalar;
12710 else if( k < kend )
12715 for( ++k; k<kend; ++k ) {
12716 value1 += A(i,k) * B(k,j );
12717 value2 += A(i,k) * B(k,j+1UL);
12720 C(i,j ) -= value1 * scalar;
12721 C(i,j+1UL) -= value2 * scalar;
// Single element C(i,j); here the scalar loops run up to K rather than a kend bound.
12727 const size_t kbegin( ( IsUpper_v<MT4> )
12734 size_t k( kbegin );
12738 SIMDType xmm1( A.load(i,k) * B.load(k,j) );
12741 xmm1 += A.load(i,k) * B.load(k,j);
12744 C(i,j) -=
sum( xmm1 ) * scalar;
12746 for( ; remainder && k<K; ++k ) {
12747 C(i,j) -= A(i,k) * B(k,j) * scalar;
12754 for( ++k; k<K; ++k ) {
12755 value += A(i,k) * B(k,j);
12758 C(i,j) -= value * scalar;
// Large-matrix subtraction-assignment kernel, fallback overload.
// Selected (via DisableIf_t) when UseVectorizedDefaultKernel_v is false for the
// operand types; it forwards all arguments to selectDefaultSubAssignKernel.
12779 template<
typename MT3
12783 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12784 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
12786 selectDefaultSubAssignKernel( C, A, B, scalar );
// "Large" subtraction-assignment kernel, vectorized overload.
// Selected only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t).
// NOTE(review): the conditions choosing between the three calls below are on lines that
// are not part of this listing; presumably they depend on the LOW/UPP flags -- confirm.
12805 template<
typename MT3
12809 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12810 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// All three kernels receive the negated scalar and ST2(1) as trailing factors; presumably
// these scale A*B and the existing C, respectively, which would yield C -= scalar*A*B --
// TODO confirm against the dense matrix multiplication kernel header.
12813 lmmm( C, A, B, -scalar, ST2(1) );
12815 ummm( C, A, B, -scalar, ST2(1) );
12817 mmm( C, A, B, -scalar, ST2(1) );
// BLAS subtraction-assignment kernel, fallback overload.
// Selected only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t); it then
// forwards C, A, B and scalar unchanged to selectLargeSubAssignKernel().
// NOTE(review): the remaining template parameters (MT4, MT5, ST2) are declared on lines
// that are not part of this listing.
12835 template<
typename MT3
12839 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12840 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// BLAS cannot be used for these types: fall back to the large-matrix kernel.
12842 selectLargeSubAssignKernel( C, A, B, scalar );
12847#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel.
// Selected only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true (EnableIf_t). Three cases are
// distinguished: triangular left operand, triangular right operand, and the general case.
// NOTE(review): the `else` introducing the general gemm case and all closing braces are on
// lines that are not part of this listing.
12861 template<
typename MT3
12865 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
12866 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// Scalar factors are converted to the element type of the target matrix for the BLAS calls.
12868 using ET = ElementType_t<MT3>;
// Triangular left operand: serially evaluate B into a temporary, apply trmm to it with A
// from the left (lower/upper selected via IsLower_v<MT4>) and the factor ET(scalar), then
// subtract the temporary from C.
12870 if( IsTriangular_v<MT4> ) {
12871 ResultType_t<MT3> tmp(
serial( B ) );
12872 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
12873 subAssign( C, tmp );
// Triangular right operand: same scheme with the roles swapped -- the temporary is built
// from A and trmm is applied with B from the right (lower/upper via IsLower_v<MT5>).
12875 else if( IsTriangular_v<MT5> ) {
12876 ResultType_t<MT3> tmp(
serial( A ) );
12877 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
12878 subAssign( C, tmp );
// General case: gemm with factors ET(-scalar) and ET(1); presumably this computes
// C = -scalar*A*B + 1*C directly in C without a temporary -- TODO confirm against the
// gemm wrapper documentation.
12881 gemm( C, A, B,
ET(-scalar),
ET(1) );
// Schur-product assignment of a DMatScalarMultExpr (rhs) to the dense matrix lhs.
// Only the final step is visible in this listing: schurAssign is applied to *lhs and a
// temporary `tmp`.
// NOTE(review): `tmp` is created on a line outside this listing, presumably by evaluating
// rhs into its result type -- confirm.
12903 template<
typename MT
12905 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
12917 schurAssign( *lhs, tmp );
12948 template<
typename MT
12951 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
12958 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
12959 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
12961 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
12964 else if( left.columns() == 0UL ) {
12979 smpAssign( *lhs, A * B * rhs.scalar_ );
12998 template<
typename MT
13001 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13005 using TmpType = If_t< SO, OppositeType, ResultType >;
13017 const ForwardFunctor fwd;
13019 const TmpType tmp( rhs );
13039 template<
typename MT
13042 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13049 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13050 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13052 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
13089 template<
typename MT
13092 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
13099 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
13100 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
13102 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
13136 template<
typename MT
13219template<
typename MT1
13221inline decltype(
auto)
13226 if( (*lhs).columns() != (*rhs).rows() ) {
13231 return ReturnType( *lhs, *rhs );
// declsym() overload for dense matrix / transpose dense matrix products.
// Rebuilds the product from the same left and right operands with the symmetry flag (SF,
// the third template argument) forced to true; HF, LF and UF are carried over unchanged.
// NOTE(review): any argument checks between the signature and the return statement are on
// lines that are not part of this listing.
13269template<
typename MT1
13275inline decltype(
auto)
declsym(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Same expression type as dm, but with SF = true.
13283 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
13284 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declherm() overload for dense matrix / transpose dense matrix products.
// Rebuilds the product from the same left and right operands with the Hermitian flag (HF,
// the fourth template argument) forced to true; SF, LF and UF are carried over unchanged.
// NOTE(review): any argument checks between the signature and the return statement are on
// lines that are not part of this listing.
13315template<
typename MT1
13321inline decltype(
auto)
declherm(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Same expression type as dm, but with HF = true.
13329 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
13330 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decllow() overload for dense matrix / transpose dense matrix products.
// Rebuilds the product from the same left and right operands with the lower flag (LF, the
// fifth template argument) forced to true; SF, HF and UF are carried over unchanged.
// NOTE(review): any argument checks between the signature and the return statement are on
// lines that are not part of this listing.
13361template<
typename MT1
13367inline decltype(
auto)
decllow(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Same expression type as dm, but with LF = true.
13375 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
13376 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declunilow() overload for dense matrix / transpose dense matrix products.
// The parameter type fixes the lower flag (fifth template argument) to false, so this
// overload only matches product expressions that are not already flagged as lower.
// NOTE(review): the function body is on lines that are not part of this listing.
13407template<
typename MT1
13412inline decltype(
auto)
declunilow(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
// declstrlow() overload for dense matrix / transpose dense matrix products.
// The parameter type fixes the lower flag (fifth template argument) to false, so this
// overload only matches product expressions that are not already flagged as lower.
// NOTE(review): the function body is on lines that are not part of this listing.
13451template<
typename MT1
13456inline decltype(
auto)
declstrlow(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
// declupp() overload for dense matrix / transpose dense matrix products.
// Rebuilds the product from the same left and right operands with the upper flag (UF, the
// sixth template argument) forced to true; SF, HF and LF are carried over unchanged.
// NOTE(review): any argument checks between the signature and the return statement are on
// lines that are not part of this listing.
13495template<
typename MT1
13501inline decltype(
auto)
declupp(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Same expression type as dm, but with UF = true.
13509 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
13510 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decluniupp() overload for dense matrix / transpose dense matrix products.
// The parameter type fixes the upper flag (sixth template argument) to false, so this
// overload only matches product expressions that are not already flagged as upper.
// NOTE(review): the function body is on lines that are not part of this listing.
13541template<
typename MT1
13546inline decltype(
auto)
decluniupp(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
// declstrupp() overload for dense matrix / transpose dense matrix products.
// The parameter type fixes the upper flag (sixth template argument) to false, so this
// overload only matches product expressions that are not already flagged as upper.
// NOTE(review): the function body is on lines that are not part of this listing.
13585template<
typename MT1
13590inline decltype(
auto)
declstrupp(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
// decldiag() overload for dense matrix / transpose dense matrix products.
// Rebuilds the product from the same left and right operands with both the lower flag (LF)
// and the upper flag (UF) forced to true; SF and HF are carried over unchanged.
// NOTE(review): any argument checks between the signature and the return statement are on
// lines that are not part of this listing.
13629template<
typename MT1
13635inline decltype(
auto)
decldiag(
const DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
// Same expression type as dm, but flagged lower and upper at once (LF = UF = true).
13643 using ReturnType =
const DMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
13644 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for DMatTDMatMultExpr, dimension index 0UL.
// The value is inherited from Size<MT1,0UL>, i.e. it is taken from the left-hand side
// operand type (index 0 is presumably the row dimension -- confirm against Size.h).
13660template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13661struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
13662 :
public Size<MT1,0UL>
// Size specialization for DMatTDMatMultExpr, dimension index 1UL.
// The value is inherited from Size<MT2,1UL>, i.e. it is taken from the right-hand side
// operand type (index 1 is presumably the column dimension -- confirm against Size.h).
13665template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13666struct Size< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
13667 :
public Size<MT2,1UL>
// IsAligned specialization for DMatTDMatMultExpr.
// The expression counts as aligned only if both operand types are aligned: the base class
// is BoolConstant of IsAligned_v<MT1> && IsAligned_v<MT2>.
13683template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
13684struct IsAligned< DMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
13685 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Expression object for dense matrix-transpose dense matrix multiplications.
Definition: DMatTDMatMultExpr.h:146
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatTDMatMultExpr.h:443
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:159
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: DMatTDMatMultExpr.h:419
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:152
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatTDMatMultExpr.h:270
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:409
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatTDMatMultExpr.h:268
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:282
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatTDMatMultExpr.h:453
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:154
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:151
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:269
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:476
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatTDMatMultExpr.h:431
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatTDMatMultExpr.h:399
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:149
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:266
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:153
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatTDMatMultExpr.h:265
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatTDMatMultExpr.h:324
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatTDMatMultExpr.h:389
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:273
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatTDMatMultExpr.h:373
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatTDMatMultExpr.h:279
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatTDMatMultExpr.h:169
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatTDMatMultExpr.h:267
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:164
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:150
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatTDMatMultExpr.h:170
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatTDMatMultExpr.h:287
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatTDMatMultExpr.h:475
DMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatTDMatMultExpr class.
Definition: DMatTDMatMultExpr.h:309
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatTDMatMultExpr.h:263
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatTDMatMultExpr.h:294
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatTDMatMultExpr.h:168
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatTDMatMultExpr.h:300
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatTDMatMultExpr.h:276
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatTDMatMultExpr.h:171
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatTDMatMultExpr.h:463
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm).
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2156
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.