35#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDMATTDMATMULTEXPR_H_
143template<
typename MT1
150 :
public MatMatMultExpr< DenseMatrix< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, true > >
// Compile-time switch for the left-hand operand: true when either
// IsComputation_v<MT1> or RequiresEvaluation_v<MT1> holds.
// NOTE(review): presumably selects whether the left operand is materialized
// into a temporary before a kernel runs; the consumer of this flag is not
// visible here -- confirm against the assign functions.
165 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Same test as evaluateLeft, applied to the right-hand operand type MT2.
170 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the SF/HF/LF/UF template arguments. They are
// made mutually consistent here so the kernels can test a single flag each.
//
// SYM : SF is set and none of HF, LF, UF is set.
// HERM: HF is set and neither LF nor UF is set.
// LOW : LF is set, or UF is combined with SF or HF.
// UPP : UF is set, or LF is combined with SF or HF.
//
// In the kernels, LOW and UPP clamp the row range (ibegin/iend) that is
// computed per column, and SYM/HERM trigger a mirror step that copies
// C(j,i) into C(i,j) (conjugated when HERM is set).
// NOTE(review): LOW/UPP being raised for "symmetric/Hermitian + opposite
// triangle" presumably reflects that such a result is diagonal and therefore
// both lower and upper -- confirm with the class documentation.
174 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
175 static constexpr bool HERM = ( HF && !( LF || UF ) );
176 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
177 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Helper variable template used as the overload-selection condition of the
// assignment functions (instantiated there as <MT,MT1,MT2>, i.e. target type,
// left operand type, right operand type).
//
// It is true when the target T1 is row-major and at least one of the operand
// types T2, T3 is symmetric. The overloads enabled by this condition rewrite
// the product as A * B with A = transIf< IsSymmetric_v<MT1> >( lhs ) and
// B = transIf< IsSymmetric_v<MT2> >( rhs ) and forward it to the target.
187 template<
typename T1,
typename T2,
typename T3 >
188 static constexpr bool CanExploitSymmetry_v =
189 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
199 template<
typename T1,
typename T2,
typename T3 >
200 static constexpr bool IsEvaluationRequired_v =
210 template<
typename T1,
typename T2,
typename T3 >
211 static constexpr bool UseBlasKernel_v =
214 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
215 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
216 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
217 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
218 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
219 IsBLASCompatible_v< ElementType_t<T1> > &&
220 IsBLASCompatible_v< ElementType_t<T2> > &&
221 IsBLASCompatible_v< ElementType_t<T3> > &&
232 template<
typename T1,
typename T2,
typename T3 >
233 static constexpr bool UseVectorizedDefaultKernel_v =
234 ( useOptimizedKernels &&
235 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
236 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
237 IsSIMDCombinable_v< ElementType_t<T1>
308 ( !IsDiagonal_v<MT1> &&
309 MT1::simdEnabled && MT2::simdEnabled &&
310 HasSIMDAdd_v<ET1,ET2> &&
311 HasSIMDMult_v<ET1,ET2> );
348 if( IsDiagonal_v<MT1> ) {
351 else if( IsDiagonal_v<MT2> ) {
354 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
355 const size_t begin( ( IsUpper_v<MT1> )
356 ?( ( IsLower_v<MT2> )
357 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
358 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
359 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
360 :( ( IsLower_v<MT2> )
361 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
363 const size_t end( ( IsLower_v<MT1> )
364 ?( ( IsUpper_v<MT2> )
365 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
366 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
367 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
368 :( ( IsUpper_v<MT2> )
369 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
370 :(
lhs_.columns() ) ) );
394 if( i >=
lhs_.rows() ) {
397 if( j >=
rhs_.columns() ) {
409 inline size_t rows() const noexcept {
420 return rhs_.columns();
450 template<
typename T >
451 inline bool canAlias(
const T* alias )
const noexcept {
452 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
462 template<
typename T >
463 inline bool isAliased(
const T* alias )
const noexcept {
464 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
474 return lhs_.isAligned() &&
rhs_.isAligned();
485 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
487 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
488 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD ) &&
489 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
512 template<
typename MT
522 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
525 else if( rhs.lhs_.columns() == 0UL ) {
540 TDMatTDMatMultExpr::selectAssignKernel( *lhs, A, B );
556 template<
typename MT3
559 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
561 if( ( IsDiagonal_v<MT4> ) ||
562 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
563 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
564 selectSmallAssignKernel( C, A, B );
566 selectBlasAssignKernel( C, A, B );
585 template<
typename MT3
588 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
589 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
591 const size_t M( A.rows() );
592 const size_t N( B.columns() );
593 const size_t K( A.columns() );
597 for(
size_t j=0UL; j<N; ++j )
599 const size_t kbegin( ( IsLower_v<MT5> )
600 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
602 const size_t kend( ( IsUpper_v<MT5> )
603 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
607 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
608 for(
size_t i=0UL; i<M; ++i ) {
615 const size_t ibegin( ( IsLower_v<MT4> )
616 ?( ( IsStrictlyLower_v<MT4> )
617 ?(
LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
618 :(
LOW ?
max(j,kbegin) : kbegin ) )
619 :(
LOW ? j : 0UL ) );
620 const size_t iend( ( IsUpper_v<MT4> )
621 ?( ( IsStrictlyUpper_v<MT4> )
622 ?(
UPP ?
min(j+1UL,kbegin) : kbegin )
623 :(
UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
624 :(
UPP ? j+1UL : M ) );
626 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
627 for(
size_t i=0UL; i<ibegin; ++i ) {
631 else if( IsStrictlyLower_v<MT4> ) {
634 for(
size_t i=ibegin; i<iend; ++i ) {
635 C(i,j) = A(i,kbegin) * B(kbegin,j);
637 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
638 for(
size_t i=iend; i<M; ++i ) {
642 else if( IsStrictlyUpper_v<MT4> ) {
647 for(
size_t k=kbegin+1UL; k<kend; ++k )
649 const size_t ibegin( ( IsLower_v<MT4> )
650 ?( ( IsStrictlyLower_v<MT4> )
654 const size_t iend( ( IsUpper_v<MT4> )
655 ?( ( IsStrictlyUpper_v<MT4> )
656 ?(
UPP ?
min(j+1UL,k-1UL) : k-1UL )
657 :(
UPP ?
min(j+1UL,k) : k ) )
658 :(
UPP ? j+1UL : M ) );
660 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( ibegin > iend ) )
continue;
663 for(
size_t i=ibegin; i<iend; ++i ) {
664 C(i,j) += A(i,k) * B(k,j);
666 if( IsUpper_v<MT4> ) {
667 C(iend,j) = A(iend,k) * B(k,j);
673 for(
size_t j=1UL; j<N; ++j ) {
674 for(
size_t i=0UL; i<j; ++i ) {
675 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
697 template<
typename MT3
700 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
701 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
705 const size_t M( A.rows() );
706 const size_t N( B.columns() );
708 for(
size_t j=0UL; j<N; ++j )
710 const size_t ibegin( ( IsLower_v<MT4> )
711 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
713 const size_t iend( ( IsUpper_v<MT4> )
714 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
718 if( IsLower_v<MT4> ) {
719 for(
size_t i=0UL; i<ibegin; ++i ) {
723 for(
size_t i=ibegin; i<iend; ++i ) {
724 C(i,j) = A(i,j) * B(j,j);
726 if( IsUpper_v<MT4> ) {
727 for(
size_t i=iend; i<M; ++i ) {
750 template<
typename MT3
753 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
754 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
758 const size_t M( A.rows() );
759 const size_t N( B.columns() );
761 for(
size_t j=0UL; j<N; ++j )
763 const size_t ibegin( ( IsLower_v<MT5> )
764 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
766 const size_t iend( ( IsUpper_v<MT5> )
767 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
771 if( IsLower_v<MT4> ) {
772 for(
size_t i=0UL; i<ibegin; ++i ) {
776 for(
size_t i=ibegin; i<iend; ++i ) {
777 C(i,j) = A(i,i) * B(i,j);
779 if( IsUpper_v<MT4> ) {
780 for(
size_t i=iend; i<M; ++i ) {
803 template<
typename MT3
806 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
807 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
813 for(
size_t i=0UL; i<A.rows(); ++i ) {
814 C(i,i) = A(i,i) * B(i,i);
834 template<
typename MT3
837 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
838 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
840 selectDefaultAssignKernel( C, A, B );
860 template<
typename MT3
863 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
864 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
871 const ForwardFunctor fwd;
873 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
874 const OppositeType_t<MT5> tmp(
serial( B ) );
875 assign( C, fwd( A * tmp ) );
877 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
878 const OppositeType_t<MT4> tmp(
serial( A ) );
879 assign( C, fwd( tmp * B ) );
881 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
882 const OppositeType_t<MT5> tmp(
serial( B ) );
883 assign( C, fwd( A * tmp ) );
886 const OppositeType_t<MT4> tmp(
serial( A ) );
887 assign( C, fwd( tmp * B ) );
908 template<
typename MT3
911 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
912 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
914 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
916 const size_t M( A.rows() );
917 const size_t N( B.columns() );
918 const size_t K( A.columns() );
927 if( IsIntegral_v<ElementType> )
930 for(
size_t j=0UL; j<N; ++j )
932 const size_t kbegin( ( IsLower_v<MT5> )
933 ?( ( IsUpper_v<MT4> )
934 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
935 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
936 :( IsUpper_v<MT4> ? i : 0UL ) );
937 const size_t kend( ( IsUpper_v<MT5> )
938 ?( ( IsLower_v<MT4> )
939 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
940 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
941 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
957 for( ++k; k<kend; ++k ) {
959 xmm1 += A.load(i ,k) * b1;
961 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
962 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
963 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
964 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
965 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
966 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
969 C.store( i , j, xmm1 );
981 C.store( i , j,
zero );
998 for( ; (j+2UL) <= N; j+=2UL )
1000 const size_t kbegin( ( IsLower_v<MT5> )
1001 ?( ( IsUpper_v<MT4> )
1002 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1003 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1004 :( IsUpper_v<MT4> ? i : 0UL ) );
1005 const size_t kend( ( IsUpper_v<MT5> )
1006 ?( ( IsLower_v<MT4> )
1007 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1008 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1009 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
1033 for( ++k; k<kend; ++k ) {
1039 b1 =
set( B(k,j ) );
1040 b2 =
set( B(k,j+1UL) );
1053 C.store( i , j , xmm1 );
1055 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1056 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1057 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
1058 C.store( i , j+1UL, xmm6 );
1059 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
1060 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
1061 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
1062 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
1067 C.store( i , j ,
zero );
1072 C.store( i , j+1UL,
zero );
1082 const size_t kbegin( ( IsLower_v<MT5> )
1083 ?( ( IsUpper_v<MT4> )
1084 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1085 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1086 :( IsUpper_v<MT4> ? i : 0UL ) );
1087 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
1094 SIMDType xmm1( A.load(i ,k) * b1 );
1100 for( ++k; k<kend; ++k ) {
1102 xmm1 += A.load(i ,k) * b1;
1103 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1104 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1105 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1106 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
1109 C.store( i , j, xmm1 );
1111 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1112 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1113 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
1118 C.store( i , j,
zero );
1135 for(
size_t ii=i; ii<iiend; ++ii ) {
1136 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1143 for(
size_t ii=i; ii<iiend; ++ii ) {
1149 for( ; (j+2UL) <= jend; j+=2UL )
1151 const size_t kbegin( ( IsLower_v<MT5> )
1152 ?( ( IsUpper_v<MT4> )
1153 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1154 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1155 :( IsUpper_v<MT4> ? i : 0UL ) );
1156 const size_t kend( ( IsUpper_v<MT5> )
1157 ?( ( IsLower_v<MT4> )
1158 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1159 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1160 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
1181 for( ++k; k<kend; ++k ) {
1186 b1 =
set( B(k,j ) );
1187 b2 =
set( B(k,j+1UL) );
1198 C.store( i , j , xmm1 );
1200 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1201 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
1202 C.store( i , j+1UL, xmm5 );
1203 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
1204 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
1205 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
1210 C.store( i , j ,
zero );
1214 C.store( i , j+1UL,
zero );
1223 const size_t kbegin( ( IsLower_v<MT5> )
1224 ?( ( IsUpper_v<MT4> )
1225 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1226 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1227 :( IsUpper_v<MT4> ? i : 0UL ) );
1228 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
1235 SIMDType xmm1( A.load(i ,k) * b1 );
1240 for( ++k; k<kend; ++k ) {
1242 xmm1 += A.load(i ,k) * b1;
1243 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1244 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1245 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
1248 C.store( i , j, xmm1 );
1250 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1251 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
1256 C.store( i , j,
zero );
1268 for(
size_t ii=i; ii<iiend; ++ii ) {
1283 for(
size_t ii=i; ii<iiend; ++ii ) {
1284 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1291 for(
size_t ii=i; ii<iiend; ++ii ) {
1297 for( ; (j+2UL) <= jend; j+=2UL )
1299 const size_t kbegin( ( IsLower_v<MT5> )
1300 ?( ( IsUpper_v<MT4> )
1301 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1302 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1303 :( IsUpper_v<MT4> ? i : 0UL ) );
1304 const size_t kend( ( IsUpper_v<MT5> )
1305 ?( ( IsLower_v<MT4> )
1306 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1307 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1308 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
1326 for( ++k; k<kend; ++k ) {
1330 b1 =
set( B(k,j ) );
1331 b2 =
set( B(k,j+1UL) );
1340 C.store( i , j , xmm1 );
1342 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
1343 C.store( i , j+1UL, xmm4 );
1344 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
1345 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
1350 C.store( i , j ,
zero );
1353 C.store( i , j+1UL,
zero );
1361 const size_t kbegin( ( IsLower_v<MT5> )
1362 ?( ( IsUpper_v<MT4> )
1363 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1364 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1365 :( IsUpper_v<MT4> ? i : 0UL ) );
1366 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
1373 SIMDType xmm1( A.load(i ,k) * b1 );
1377 for( ++k; k<kend; ++k ) {
1379 xmm1 += A.load(i ,k) * b1;
1380 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
1381 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
1384 C.store( i , j, xmm1 );
1386 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
1391 C.store( i , j,
zero );
1402 for(
size_t ii=i; ii<iiend; ++ii ) {
1417 for(
size_t ii=i; ii<iiend; ++ii ) {
1418 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1425 for(
size_t ii=i; ii<iiend; ++ii ) {
1431 for( ; (j+4UL) <= jend; j+=4UL )
1433 const size_t kbegin( ( IsLower_v<MT5> )
1434 ?( ( IsUpper_v<MT4> )
1435 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1436 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1437 :( IsUpper_v<MT4> ? i : 0UL ) );
1438 const size_t kend( ( IsUpper_v<MT5> )
1439 ?( ( IsLower_v<MT4> )
1440 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
1441 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
1442 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1463 for( ++k; k<kend; ++k ) {
1466 b1 =
set( B(k,j ) );
1467 b2 =
set( B(k,j+1UL) );
1468 b3 =
set( B(k,j+2UL) );
1469 b4 =
set( B(k,j+3UL) );
1480 C.store( i , j , xmm1 );
1482 C.store( i , j+1UL, xmm3 );
1483 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1484 C.store( i , j+2UL, xmm5 );
1485 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
1486 C.store( i , j+3UL, xmm7 );
1487 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
1492 C.store( i , j ,
zero );
1494 C.store( i , j+1UL,
zero );
1496 C.store( i , j+2UL,
zero );
1498 C.store( i , j+3UL,
zero );
1503 for( ; (j+3UL) <= jend; j+=3UL )
1505 const size_t kbegin( ( IsLower_v<MT5> )
1506 ?( ( IsUpper_v<MT4> )
1507 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1508 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1509 :( IsUpper_v<MT4> ? i : 0UL ) );
1510 const size_t kend( ( IsUpper_v<MT5> )
1511 ?( ( IsLower_v<MT4> )
1512 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
1513 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
1514 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1532 for( ++k; k<kend; ++k ) {
1535 b1 =
set( B(k,j ) );
1536 b2 =
set( B(k,j+1UL) );
1537 b3 =
set( B(k,j+2UL) );
1546 C.store( i , j , xmm1 );
1548 C.store( i , j+1UL, xmm3 );
1549 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1550 C.store( i , j+2UL, xmm5 );
1551 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
1556 C.store( i , j ,
zero );
1558 C.store( i , j+1UL,
zero );
1560 C.store( i , j+2UL,
zero );
1565 for( ; (j+2UL) <= jend; j+=2UL )
1567 const size_t kbegin( ( IsLower_v<MT5> )
1568 ?( ( IsUpper_v<MT4> )
1569 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1570 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1571 :( IsUpper_v<MT4> ? i : 0UL ) );
1572 const size_t kend( ( IsUpper_v<MT5> )
1573 ?( ( IsLower_v<MT4> )
1574 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
1575 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
1576 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
1591 for( ++k; k<kend; ++k ) {
1594 b1 =
set( B(k,j ) );
1595 b2 =
set( B(k,j+1UL) );
1602 C.store( i , j , xmm1 );
1604 C.store( i , j+1UL, xmm3 );
1605 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
1610 C.store( i , j ,
zero );
1612 C.store( i , j+1UL,
zero );
1619 const size_t kbegin( ( IsLower_v<MT5> )
1620 ?( ( IsUpper_v<MT4> )
1621 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1622 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1623 :( IsUpper_v<MT4> ? i : 0UL ) );
1624 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
1631 SIMDType xmm1( A.load(i ,k) * b1 );
1634 for( ++k; k<kend; ++k ) {
1636 xmm1 += A.load(i ,k) * b1;
1640 C.store( i , j, xmm1 );
1646 C.store( i , j,
zero );
1656 for(
size_t ii=i; ii<iiend; ++ii ) {
1671 for(
size_t ii=i; ii<iiend; ++ii ) {
1672 C(ii,j) =
HERM ?
conj( C(j,ii) ) : C(j,ii);
1679 for(
size_t ii=i; ii<iiend; ++ii ) {
1685 for( ; (j+4UL) <= jend; j+=4UL )
1687 const size_t kbegin( ( IsLower_v<MT5> )
1688 ?( ( IsUpper_v<MT4> )
1689 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1690 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1691 :( IsUpper_v<MT4> ? i : 0UL ) );
1692 const size_t kend( ( IsUpper_v<MT5> )
1693 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
1706 for( ++k; k<kend; ++k ) {
1708 xmm1 += a1 *
set( B(k,j ) );
1709 xmm2 += a1 *
set( B(k,j+1UL) );
1710 xmm3 += a1 *
set( B(k,j+2UL) );
1711 xmm4 += a1 *
set( B(k,j+3UL) );
1714 C.store( i, j , xmm1 );
1715 C.store( i, j+1UL, xmm2 );
1716 C.store( i, j+2UL, xmm3 );
1717 C.store( i, j+3UL, xmm4 );
1722 C.store( i, j ,
zero );
1723 C.store( i, j+1UL,
zero );
1724 C.store( i, j+2UL,
zero );
1725 C.store( i, j+3UL,
zero );
1729 for( ; (j+3UL) <= jend; j+=3UL )
1731 const size_t kbegin( ( IsLower_v<MT5> )
1732 ?( ( IsUpper_v<MT4> )
1733 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1734 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1735 :( IsUpper_v<MT4> ? i : 0UL ) );
1736 const size_t kend( ( IsUpper_v<MT5> )
1737 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
1749 for( ++k; k<kend; ++k ) {
1751 xmm1 += a1 *
set( B(k,j ) );
1752 xmm2 += a1 *
set( B(k,j+1UL) );
1753 xmm3 += a1 *
set( B(k,j+2UL) );
1756 C.store( i, j , xmm1 );
1757 C.store( i, j+1UL, xmm2 );
1758 C.store( i, j+2UL, xmm3 );
1763 C.store( i, j ,
zero );
1764 C.store( i, j+1UL,
zero );
1765 C.store( i, j+2UL,
zero );
1769 for( ; (j+2UL) <= jend; j+=2UL )
1771 const size_t kbegin( ( IsLower_v<MT5> )
1772 ?( ( IsUpper_v<MT4> )
1773 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1774 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1775 :( IsUpper_v<MT4> ? i : 0UL ) );
1776 const size_t kend( ( IsUpper_v<MT5> )
1777 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1788 for( ++k; k<kend; ++k ) {
1790 xmm1 += a1 *
set( B(k,j ) );
1791 xmm2 += a1 *
set( B(k,j+1UL) );
1794 C.store( i, j , xmm1 );
1795 C.store( i, j+1UL, xmm2 );
1800 C.store( i, j ,
zero );
1801 C.store( i, j+1UL,
zero );
1807 const size_t kbegin( ( IsLower_v<MT5> )
1808 ?( ( IsUpper_v<MT4> )
1809 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1810 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1811 :( IsUpper_v<MT4> ? i : 0UL ) );
1819 for( ++k; k<K; ++k ) {
1820 xmm1 += A.load(i,k) *
set( B(k,j) );
1823 C.store( i, j, xmm1 );
1828 C.store( i, j,
zero );
1837 for(
size_t ii=i; ii<iiend; ++ii ) {
1844 for( ; remainder && i<M; ++i )
1850 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1859 for( ; (j+2UL) <= N; j+=2UL )
1861 const size_t kbegin( ( IsLower_v<MT5> )
1862 ?( ( IsUpper_v<MT4> )
1863 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1864 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1865 :( IsUpper_v<MT4> ? i : 0UL ) );
1866 const size_t kend( ( IsUpper_v<MT5> )
1867 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
1877 for( ++k; k<kend; ++k ) {
1878 value1 += A(i,k) * B(k,j );
1879 value2 += A(i,k) * B(k,j+1UL);
1883 C(i,j+1UL) = value2;
1888 reset( C(i,j+1UL) );
1894 const size_t kbegin( ( IsLower_v<MT5> )
1895 ?( ( IsUpper_v<MT4> )
1896 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
1897 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
1898 :( IsUpper_v<MT4> ? i : 0UL ) );
1906 for( ++k; k<K; ++k ) {
1907 value += A(i,k) * B(k,j);
1936 template<
typename MT3
1939 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1940 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1942 selectDefaultAssignKernel( C, A, B );
1962 template<
typename MT3
1965 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1966 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1996 template<
typename MT3
1999 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2000 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2002 selectLargeAssignKernel( C, A, B );
2008#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
2022 template<
typename MT3
2025 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2026 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2028 using ET = ElementType_t<MT3>;
2030 if( IsTriangular_v<MT4> ) {
2032 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2034 else if( IsTriangular_v<MT5> ) {
2036 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
2039 gemm( C, A, B, ET(1), ET(0) );
2059 template<
typename MT
2062 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2066 using TmpType = If_t< SO, ResultType, OppositeType >;
2078 const ForwardFunctor fwd;
2080 const TmpType tmp(
serial( rhs ) );
2081 assign( *lhs, fwd( tmp ) );
2101 template<
typename MT >
2103 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2112 const ForwardFunctor fwd;
2114 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
2115 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
2117 assign( *lhs, fwd( A * B ) );
2135 template<
typename MT
2137 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
2138 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2145 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2159 TDMatTDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
2175 template<
typename MT3
2178 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2180 if( ( IsDiagonal_v<MT4> ) ||
2181 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
2182 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
2183 selectSmallAddAssignKernel( C, A, B );
2185 selectBlasAddAssignKernel( C, A, B );
2204 template<
typename MT3
2207 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2208 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2210 const size_t M( A.rows() );
2211 const size_t N( B.columns() );
2212 const size_t K( A.columns() );
2216 for(
size_t j=0UL; j<N; ++j )
2218 const size_t kbegin( ( IsLower_v<MT5> )
2219 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2221 const size_t kend( ( IsUpper_v<MT5> )
2222 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2226 for(
size_t k=kbegin; k<kend; ++k )
2228 const size_t ibegin( ( IsLower_v<MT4> )
2229 ?( ( IsStrictlyLower_v<MT4> )
2230 ?(
LOW ?
max(j,k+1UL) : k+1UL )
2231 :(
LOW ?
max(j,k) : k ) )
2232 :(
LOW ? j : 0UL ) );
2233 const size_t iend( ( IsUpper_v<MT4> )
2234 ?( ( IsStrictlyUpper_v<MT4> )
2235 ?(
UPP ?
min(j+1UL,k) : k )
2236 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
2237 :(
UPP ? j+1UL : M ) );
2239 if( (
LOW ||
UPP ) && ibegin >= iend )
continue;
2242 const size_t inum( iend - ibegin );
2243 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
2246 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2247 C(i ,j) += A(i ,k) * B(k,j);
2248 C(i+1UL,j) += A(i+1UL,k) * B(k,j);
2251 C(ipos,j) += A(ipos,k) * B(k,j);
2273 template<
typename MT3
2276 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2277 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2281 const size_t M( A.rows() );
2282 const size_t N( B.columns() );
2284 for(
size_t j=0UL; j<N; ++j )
2286 const size_t ibegin( ( IsLower_v<MT4> )
2287 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
2289 const size_t iend( ( IsUpper_v<MT4> )
2290 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
2294 const size_t inum( iend - ibegin );
2295 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
2298 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2299 C(i ,j) += A(i ,j) * B(j,j);
2300 C(i+1UL,j) += A(i+1UL,j) * B(j,j);
2303 C(ipos,j) += A(ipos,j) * B(j,j);
2324 template<
typename MT3
2327 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2328 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2332 const size_t M( A.rows() );
2333 const size_t N( B.columns() );
2335 for(
size_t j=0UL; j<N; ++j )
2337 const size_t ibegin( ( IsLower_v<MT5> )
2338 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
2340 const size_t iend( ( IsUpper_v<MT5> )
2341 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
2345 const size_t inum( iend - ibegin );
2346 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
2349 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2350 C(i ,j) += A(i ,i ) * B(i ,j);
2351 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j);
2354 C(ipos,j) += A(ipos,ipos) * B(ipos,j);
2375 template<
typename MT3
2378 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2379 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2383 for(
size_t i=0UL; i<A.rows(); ++i ) {
2384 C(i,i) += A(i,i) * B(i,i);
2404 template<
typename MT3
2407 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2408 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2410 selectDefaultAddAssignKernel( C, A, B );
2430 template<
typename MT3
2433 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2434 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2441 const ForwardFunctor fwd;
2443 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2444 const OppositeType_t<MT5> tmp(
serial( B ) );
2445 addAssign( C, fwd( A * tmp ) );
2447 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2448 const OppositeType_t<MT4> tmp(
serial( A ) );
2449 addAssign( C, fwd( tmp * B ) );
2451 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
2452 const OppositeType_t<MT5> tmp(
serial( B ) );
2453 addAssign( C, fwd( A * tmp ) );
2456 const OppositeType_t<MT4> tmp(
serial( A ) );
2457 addAssign( C, fwd( tmp * B ) );
// Vectorized small-matrix addition-assignment kernel for column-major targets
// (C += A * B). The rows of C are processed in blocks of SIMD vectors whose
// width shrinks from 8 to 5, 4, 3, 2 and 1 vectors, followed by a scalar loop
// for leftover rows. Within each block several columns of C are accumulated
// at once. For every (i,j) tile the summation range [kbegin,kend) is narrowed
// according to the triangular structure of A (MT4) and B (MT5).
//   C: target matrix, loaded, accumulated and stored back in place
//   A: left-hand side operand, read via A.load(i,k) / A(i,k)
//   B: right-hand side operand, read element-wise via B(k,j)
2478 template<
typename MT3
2481 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2482 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar clean-up loop is only needed if C or A is not padded.
2484 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
2486 const size_t M( A.rows() );
2487 const size_t N( B.columns() );
2488 const size_t K( A.columns() );
// --- Blocks of 8 SIMD vectors, one column at a time (integral element types only) ---
2497 if( IsIntegral_v<ElementType> )
2500 for(
size_t j=0UL; j<N; ++j )
// First k with a potentially non-zero contribution: a lower B skips k < j
// (k <= j if strictly lower), an upper A skips k < i.
2502 const size_t kbegin( ( IsLower_v<MT5> )
2503 ?( ( IsUpper_v<MT4> )
2504 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2505 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2506 :( IsUpper_v<MT4> ? i : 0UL ) );
// One past the last k: an upper B stops at j (before j if strictly upper),
// a lower A stops at the end of the current row block.
2507 const size_t kend( ( IsUpper_v<MT5> )
2508 ?( ( IsLower_v<MT4> )
2509 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
2510 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
2511 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Accumulate 8 SIMD vectors of column j; b1 is set up on a line not shown here.
2522 for(
size_t k=kbegin; k<kend; ++k ) {
2524 xmm1 += A.load(i ,k) * b1;
2525 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2526 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2527 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2528 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2529 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
2530 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
2531 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the accumulators back to C.
2534 C.store( i , j, xmm1 );
2536 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2537 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2538 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
2539 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
2540 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
2541 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// --- Blocks of 5 SIMD vectors, two columns (j, j+1) at a time ---
2550 for( ; (j+2UL) <= N; j+=2UL )
2552 const size_t kbegin( ( IsLower_v<MT5> )
2553 ?( ( IsUpper_v<MT4> )
2554 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2555 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2556 :( IsUpper_v<MT4> ? i : 0UL ) );
// The upper bound for an upper B refers to the last column of the pair (j+1).
2557 const size_t kend( ( IsUpper_v<MT5> )
2558 ?( ( IsLower_v<MT4> )
2559 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2560 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2561 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// xmm1..xmm5 hold column j, xmm6..xmm10 hold column j+1 (see the stores below).
2568 SIMDType xmm6 ( C.load(i ,j+1UL) );
2574 for(
size_t k=kbegin; k<kend; ++k ) {
2594 C.store( i , j , xmm1 );
2596 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2597 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2598 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
2599 C.store( i , j+1UL, xmm6 );
2600 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
2601 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
2602 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
2603 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Blocks of 5 SIMD vectors, single remaining column j.
2608 const size_t kbegin( ( IsLower_v<MT5> )
2609 ?( ( IsUpper_v<MT4> )
2610 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2611 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2612 :( IsUpper_v<MT4> ? i : 0UL ) );
2613 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
2621 for(
size_t k=kbegin; k<kend; ++k ) {
2623 xmm1 += A.load(i ,k) * b1;
2624 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2625 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2626 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2627 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
2630 C.store( i , j, xmm1 );
2632 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2633 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
2634 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// --- Blocks of 4 SIMD vectors, two columns at a time ---
2642 for( ; (j+2UL) <= N; j+=2UL )
2644 const size_t kbegin( ( IsLower_v<MT5> )
2645 ?( ( IsUpper_v<MT4> )
2646 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2647 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2648 :( IsUpper_v<MT4> ? i : 0UL ) );
2649 const size_t kend( ( IsUpper_v<MT5> )
2650 ?( ( IsLower_v<MT4> )
2651 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2652 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2653 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
2664 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
2681 C.store( i , j , xmm1 );
2683 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2684 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
2685 C.store( i , j+1UL, xmm5 );
2686 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
2687 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
2688 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Blocks of 4 SIMD vectors, single remaining column j.
2693 const size_t kbegin( ( IsLower_v<MT5> )
2694 ?( ( IsUpper_v<MT4> )
2695 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2696 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2697 :( IsUpper_v<MT4> ? i : 0UL ) );
2698 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
2705 for(
size_t k=kbegin; k<kend; ++k ) {
2707 xmm1 += A.load(i ,k) * b1;
2708 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2709 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2710 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
2713 C.store( i , j, xmm1 );
2715 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
2716 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// --- Blocks of 3 SIMD vectors, two columns at a time ---
2724 for( ; (j+2UL) <= N; j+=2UL )
2726 const size_t kbegin( ( IsLower_v<MT5> )
2727 ?( ( IsUpper_v<MT4> )
2728 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2729 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2730 :( IsUpper_v<MT4> ? i : 0UL ) );
2731 const size_t kend( ( IsUpper_v<MT5> )
2732 ?( ( IsLower_v<MT4> )
2733 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2734 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2735 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
2744 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm3 belong to column j, xmm4..xmm6 to column j+1.
2758 C.store( i , j , xmm1 );
2760 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
2761 C.store( i , j+1UL, xmm4 );
2762 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
2763 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Blocks of 3 SIMD vectors, single remaining column j.
2768 const size_t kbegin( ( IsLower_v<MT5> )
2769 ?( ( IsUpper_v<MT4> )
2770 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2771 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2772 :( IsUpper_v<MT4> ? i : 0UL ) );
2773 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
2779 for(
size_t k=kbegin; k<kend; ++k ) {
2781 xmm1 += A.load(i ,k) * b1;
2782 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
2783 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
2786 C.store( i , j, xmm1 );
2788 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// --- Blocks of 2 SIMD vectors; from here on the column range honours the
// result structure: an upper result (UPP) starts at column i. ---
2795 size_t j(
UPP ? i : 0UL );
// Four columns at a time.
2797 for( ; (j+4UL) <= jend; j+=4UL )
2799 const size_t kbegin( ( IsLower_v<MT5> )
2800 ?( ( IsUpper_v<MT4> )
2801 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2802 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2803 :( IsUpper_v<MT4> ? i : 0UL ) );
2804 const size_t kend( ( IsUpper_v<MT5> )
2805 ?( ( IsLower_v<MT4> )
2806 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
2807 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
2808 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2819 for(
size_t k=kbegin; k<kend; ++k ) {
// Accumulator pairs (xmm1,xmm2) .. (xmm7,xmm8) map to columns j .. j+3.
2836 C.store( i , j , xmm1 );
2838 C.store( i , j+1UL, xmm3 );
2839 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2840 C.store( i , j+2UL, xmm5 );
2841 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
2842 C.store( i , j+3UL, xmm7 );
2843 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns at a time.
2846 for( ; (j+3UL) <= jend; j+=3UL )
2848 const size_t kbegin( ( IsLower_v<MT5> )
2849 ?( ( IsUpper_v<MT4> )
2850 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2851 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2852 :( IsUpper_v<MT4> ? i : 0UL ) );
2853 const size_t kend( ( IsUpper_v<MT5> )
2854 ?( ( IsLower_v<MT4> )
2855 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
2856 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
2857 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2866 for(
size_t k=kbegin; k<kend; ++k ) {
2880 C.store( i , j , xmm1 );
2882 C.store( i , j+1UL, xmm3 );
2883 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
2884 C.store( i , j+2UL, xmm5 );
2885 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns at a time.
2888 for( ; (j+2UL) <= jend; j+=2UL )
2890 const size_t kbegin( ( IsLower_v<MT5> )
2891 ?( ( IsUpper_v<MT4> )
2892 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2893 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2894 :( IsUpper_v<MT4> ? i : 0UL ) );
2895 const size_t kend( ( IsUpper_v<MT5> )
2896 ?( ( IsLower_v<MT4> )
2897 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
2898 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
2899 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
2906 for(
size_t k=kbegin; k<kend; ++k ) {
2917 C.store( i , j , xmm1 );
2919 C.store( i , j+1UL, xmm3 );
2920 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
// Single remaining column for the 2-vector row block.
2925 const size_t kbegin( ( IsLower_v<MT5> )
2926 ?( ( IsUpper_v<MT4> )
2927 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2928 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2929 :( IsUpper_v<MT4> ? i : 0UL ) );
2930 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
2935 for(
size_t k=kbegin; k<kend; ++k ) {
2937 xmm1 += A.load(i ,k) * b1;
2941 C.store( i , j, xmm1 );
// --- Blocks of a single SIMD vector ---
2949 size_t j(
UPP ? i : 0UL );
// Four columns at a time: one vector of A (a1) is combined with four
// broadcast elements of B.
2951 for( ; (j+4UL) <= jend; j+=4UL )
2953 const size_t kbegin( ( IsLower_v<MT5> )
2954 ?( ( IsUpper_v<MT4> )
2955 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2956 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2957 :( IsUpper_v<MT4> ? i : 0UL ) );
2958 const size_t kend( ( IsUpper_v<MT5> )
2959 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
2967 for(
size_t k=kbegin; k<kend; ++k ) {
2969 xmm1 += a1 *
set( B(k,j ) );
2970 xmm2 += a1 *
set( B(k,j+1UL) );
2971 xmm3 += a1 *
set( B(k,j+2UL) );
2972 xmm4 += a1 *
set( B(k,j+3UL) );
2975 C.store( i, j , xmm1 );
2976 C.store( i, j+1UL, xmm2 );
2977 C.store( i, j+2UL, xmm3 );
2978 C.store( i, j+3UL, xmm4 );
// Three columns at a time.
2981 for( ; (j+3UL) <= jend; j+=3UL )
2983 const size_t kbegin( ( IsLower_v<MT5> )
2984 ?( ( IsUpper_v<MT4> )
2985 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
2986 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
2987 :( IsUpper_v<MT4> ? i : 0UL ) );
2988 const size_t kend( ( IsUpper_v<MT5> )
2989 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
2996 for(
size_t k=kbegin; k<kend; ++k ) {
2998 xmm1 += a1 *
set( B(k,j ) );
2999 xmm2 += a1 *
set( B(k,j+1UL) );
3000 xmm3 += a1 *
set( B(k,j+2UL) );
3003 C.store( i, j , xmm1 );
3004 C.store( i, j+1UL, xmm2 );
3005 C.store( i, j+2UL, xmm3 );
// Two columns at a time.
3008 for( ; (j+2UL) <= jend; j+=2UL )
3010 const size_t kbegin( ( IsLower_v<MT5> )
3011 ?( ( IsUpper_v<MT4> )
3012 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3013 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3014 :( IsUpper_v<MT4> ? i : 0UL ) );
3015 const size_t kend( ( IsUpper_v<MT5> )
3016 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3022 for(
size_t k=kbegin; k<kend; ++k ) {
3024 xmm1 += a1 *
set( B(k,j ) );
3025 xmm2 += a1 *
set( B(k,j+1UL) );
3028 C.store( i, j , xmm1 );
3029 C.store( i, j+1UL, xmm2 );
// Single remaining column: the summation runs up to K without an upper-B bound.
3034 const size_t kbegin( ( IsLower_v<MT5> )
3035 ?( ( IsUpper_v<MT4> )
3036 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3037 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3038 :( IsUpper_v<MT4> ? i : 0UL ) );
3042 for(
size_t k=kbegin; k<K; ++k ) {
3043 xmm1 += A.load(i,k) *
set( B(k,j) );
3046 C.store( i, j, xmm1 );
// --- Scalar clean-up for the remaining rows; only active if C or A is not padded ---
3050 for( ; remainder && i<M; ++i )
// Column range restricted by the result structure: a lower result (LOW)
// ends at column i, an upper result (UPP) starts at column i.
3052 const size_t jend(
LOW ? i+1UL : N );
3053 size_t j(
UPP ? i : 0UL );
// Two columns at a time.
3055 for( ; (j+2UL) <= jend; j+=2UL )
3057 const size_t kbegin( ( IsLower_v<MT5> )
3058 ?( ( IsUpper_v<MT4> )
3059 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3060 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3061 :( IsUpper_v<MT4> ? i : 0UL ) );
3062 const size_t kend( ( IsUpper_v<MT5> )
3063 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
3069 for(
size_t k=kbegin; k<kend; ++k ) {
3070 value1 += A(i,k) * B(k,j );
3071 value2 += A(i,k) * B(k,j+1UL);
3075 C(i,j+1UL) = value2;
// Single remaining column.
3080 const size_t kbegin( ( IsLower_v<MT5> )
3081 ?( ( IsUpper_v<MT4> )
3082 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3083 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3084 :( IsUpper_v<MT4> ? i : 0UL ) );
3088 for(
size_t k=kbegin; k<K; ++k ) {
3089 value += A(i,k) * B(k,j);
// Large-matrix addition-assignment kernel used when the vectorized default
// kernel is NOT applicable (DisableIf_t on UseVectorizedDefaultKernel_v).
// Forwards to the default (scalar) addition-assignment kernel.
3113 template<
typename MT3
3116 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3117 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3119 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel selected when the vectorized
// default kernel IS applicable (EnableIf_t on UseVectorizedDefaultKernel_v).
// NOTE(review): the function body is not visible here; what it delegates to
// must be confirmed against the complete file.
3139 template<
typename MT3
3142 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3143 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS addition-assignment entry point for operand combinations for which
// the BLAS kernel is NOT usable (DisableIf_t on UseBlasKernel_v).
// Falls back to the large-matrix addition-assignment kernel.
3169 template<
typename MT3
3172 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3173 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3175 selectLargeAddAssignKernel( C, A, B );
3181#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel (C += A * B), enabled when
// UseBlasKernel_v holds for the three matrix types.
//   - A triangular: B is copied into a temporary, multiplied in place from the
//     left by A via trmm(), and the temporary is added to C.
//   - B triangular: A is copied into a temporary, multiplied in place from the
//     right by B via trmm(), and the temporary is added to C.
//   - otherwise: a single gemm() call with both scalar factors set to ET(1).
3195 template<
typename MT3
3198 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3199 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Element type of the target; used to build the scalar factors.
3201 using ET = ElementType_t<MT3>;
3203 if( IsTriangular_v<MT4> ) {
3204 ResultType_t<MT3> tmp(
serial( B ) );
// The lower/upper flag is chosen from the structure of A.
3205 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3206 addAssign( C, tmp );
3208 else if( IsTriangular_v<MT5> ) {
3209 ResultType_t<MT3> tmp(
serial( A ) );
// The lower/upper flag is chosen from the structure of B.
3210 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3211 addAssign( C, tmp );
// General case. NOTE(review): the two factors are presumably alpha (product)
// and beta (existing C); with both equal to ET(1) the order is irrelevant here.
3214 gemm( C, A, B, ET(1), ET(1) );
// Symmetry-exploiting overload, enabled when CanExploitSymmetry_v holds for
// the target and the two operand types.
// NOTE(review): the line naming this function is not visible here; since the
// body ends in an addAssign() call it is presumably the restructuring
// addAssign() overload - confirm.
3236 template<
typename MT >
3238 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3247 const ForwardFunctor fwd;
// Each symmetric operand is passed through transIf<true>, the other through
// transIf<false>. NOTE(review): presumably transIf transposes only when its
// flag is true - confirm.
3249 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
3250 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
// Re-dispatch the addition assignment on the restructured product.
3252 addAssign( *lhs, fwd( A * B ) );
// Subtraction assignment of the matrix product to a dense matrix
// (lhs -= A * B) for the case that symmetry cannot be exploited
// (DisableIf_t on CanExploitSymmetry_v).
//   lhs: target dense matrix
//   rhs: the multiplication expression to be subtracted
3274 template<
typename MT
3276 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
3277 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Guard for empty targets and an empty inner dimension; the guarded
// statement is not visible here (presumably an early return - confirm).
3284 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hand over to kernel selection; A and B are set up on lines not shown here.
3298 TDMatTDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
// Chooses the subtraction-assignment kernel for C -= A * B.
// The small-matrix kernel is used if A is diagonal, if (outside debug mode)
// A has at most SIMDSIZE*10 rows, or if the number of elements of C is below
// TDMATTDMATMULT_THRESHOLD. Otherwise the BLAS kernel path is taken.
3314 template<
typename MT3
3317 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3319 if( ( IsDiagonal_v<MT4> ) ||
3320 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
3321 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
3322 selectSmallSubAssignKernel( C, A, B );
3324 selectBlasSubAssignKernel( C, A, B );
// Default (scalar) subtraction-assignment kernel for the case that neither
// operand is diagonal: C(i,j) -= A(i,k) * B(k,j). The loops run over columns j,
// then over k, then over rows i. The k and i ranges are narrowed according to
// the triangular structure of B and A and the declared structure of the
// result (LOW / UPP).
3343 template<
typename MT3
3346 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3347 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3349 const size_t M( A.rows() );
3350 const size_t N( B.columns() );
3351 const size_t K( A.columns() );
3355 for(
size_t j=0UL; j<N; ++j )
// k range for column j derived from the structure of B; the alternative
// branches of both conditionals are on lines not shown here.
3357 const size_t kbegin( ( IsLower_v<MT5> )
3358 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3360 const size_t kend( ( IsUpper_v<MT5> )
3361 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
3365 for(
size_t k=kbegin; k<kend; ++k )
// First row: a lower A skips rows above k (and row k itself if strictly
// lower); a lower result additionally skips rows above j.
3367 const size_t ibegin( ( IsLower_v<MT4> )
3368 ?( ( IsStrictlyLower_v<MT4> )
3369 ?(
LOW ?
max(j,k+1UL) : k+1UL )
3370 :(
LOW ?
max(j,k) : k ) )
3371 :(
LOW ? j : 0UL ) );
// One past the last row: an upper A stops at k (before k if strictly upper);
// an upper result additionally stops at row j.
3372 const size_t iend( ( IsUpper_v<MT4> )
3373 ?( ( IsStrictlyUpper_v<MT4> )
3374 ?(
UPP ?
min(j+1UL,k) : k )
3375 :(
UPP ?
min(j,k)+1UL : k+1UL ) )
3376 :(
UPP ? j+1UL : M ) );
// With a structured result the row range can be empty; skipping it here also
// keeps the unsigned difference below from wrapping around.
3378 if( (
LOW ||
UPP ) && ( ibegin >= iend ) )
continue;
// Rows are processed in pairs; ipos marks the end of the paired part.
3381 const size_t inum( iend - ibegin );
3382 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
3385 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3386 C(i ,j) -= A(i ,k) * B(k,j);
3387 C(i+1UL,j) -= A(i+1UL,k) * B(k,j);
// Leftover row of an odd-sized range (the guarding condition is not shown here).
3390 C(ipos,j) -= A(ipos,k) * B(k,j);
// Default subtraction-assignment kernel for a general left operand A and a
// diagonal right operand B. Each column j of A is scaled by the diagonal
// element B(j,j): C(i,j) -= A(i,j) * B(j,j).
3412 template<
typename MT3
3415 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3416 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3420 const size_t M( A.rows() );
3421 const size_t N( B.columns() );
3423 for(
size_t j=0UL; j<N; ++j )
// Row range for column j derived from the triangular structure of A; the
// alternative branches of both conditionals are on lines not shown here.
3425 const size_t ibegin( ( IsLower_v<MT4> )
3426 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
3428 const size_t iend( ( IsUpper_v<MT4> )
3429 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows are processed in pairs; ipos marks the end of the paired part.
3433 const size_t inum( iend - ibegin );
3434 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
3437 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3438 C(i ,j) -= A(i ,j) * B(j,j);
3439 C(i+1UL,j) -= A(i+1UL,j) * B(j,j);
// Leftover row of an odd-sized range (the guarding condition is not shown here).
3442 C(ipos,j) -= A(ipos,j) * B(j,j);
// Default subtraction-assignment kernel for a diagonal left operand A and a
// general right operand B. Each row i of B is scaled by the diagonal element
// A(i,i): C(i,j) -= A(i,i) * B(i,j).
3463 template<
typename MT3
3466 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3467 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3471 const size_t M( A.rows() );
3472 const size_t N( B.columns() );
3474 for(
size_t j=0UL; j<N; ++j )
// Row range for column j derived from the triangular structure of B; the
// alternative branches of both conditionals are on lines not shown here.
3476 const size_t ibegin( ( IsLower_v<MT5> )
3477 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
3479 const size_t iend( ( IsUpper_v<MT5> )
3480 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Rows are processed in pairs; ipos marks the end of the paired part.
3484 const size_t inum( iend - ibegin );
3485 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
3488 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
3489 C(i ,j) -= A(i ,i ) * B(i ,j);
3490 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j);
// Leftover row of an odd-sized range (the guarding condition is not shown here).
3493 C(ipos,j) -= A(ipos,ipos) * B(ipos,j);
// Default subtraction-assignment kernel for the case that both operands are
// diagonal matrices. Only the diagonal of the target is updated:
// C(i,i) -= A(i,i) * B(i,i).
3514 template<
typename MT3
3517 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3518 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// One update per diagonal position; the trip count is taken from A.rows().
3522 for(
size_t i=0UL; i<A.rows(); ++i ) {
3523 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel used when the vectorized default
// kernel is NOT applicable (DisableIf_t on UseVectorizedDefaultKernel_v).
// Forwards to the default (scalar) subtraction-assignment kernel.
3543 template<
typename MT3
3546 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3547 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3549 selectDefaultSubAssignKernel( C, A, B );
// Small-matrix subtraction-assignment kernel for row-major targets when the
// vectorized default kernel is applicable. One operand is serially evaluated
// into a temporary of the opposite storage order and the resulting product
// expression is handed back to subAssign().
// Which operand is converted:
//   - only A resizable  -> B is converted
//   - only B resizable  -> A is converted
//   - otherwise         -> B if it has no more elements than A, else A
3569 template<
typename MT3
3572 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3573 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// Functor applied to every product expression before it is passed on.
// NOTE(review): presumably re-applies the declared structure flags - confirm.
3580 const ForwardFunctor fwd;
3582 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3583 const OppositeType_t<MT5> tmp(
serial( B ) );
3584 subAssign( C, fwd( A * tmp ) );
3586 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3587 const OppositeType_t<MT4> tmp(
serial( A ) );
3588 subAssign( C, fwd( tmp * B ) );
// Neither or both resizable: convert the operand with fewer elements.
3590 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
3591 const OppositeType_t<MT5> tmp(
serial( B ) );
3592 subAssign( C, fwd( A * tmp ) );
// Remaining case: A is the smaller operand and gets converted.
3595 const OppositeType_t<MT4> tmp(
serial( A ) );
3596 subAssign( C, fwd( tmp * B ) );
// Vectorized small-matrix subtraction-assignment kernel for column-major
// targets (C -= A * B). The rows of C are processed in blocks of SIMD vectors
// whose width shrinks from 8 to 5, 4, 3, 2 and 1 vectors, followed by a scalar
// loop for leftover rows. Within each block several columns of C are updated
// at once. For every (i,j) tile the summation range [kbegin,kend) is narrowed
// according to the triangular structure of A (MT4) and B (MT5).
//   C: target matrix, loaded, updated and stored back in place
//   A: left-hand side operand, read via A.load(i,k) / A(i,k)
//   B: right-hand side operand, read element-wise via B(k,j)
3617 template<
typename MT3
3620 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3621 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar clean-up loop is only needed if C or A is not padded.
3623 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
3625 const size_t M( A.rows() );
3626 const size_t N( B.columns() );
3627 const size_t K( A.columns() );
// --- Blocks of 8 SIMD vectors, one column at a time (integral element types only) ---
3636 if( IsIntegral_v<ElementType> )
3639 for(
size_t j=0UL; j<N; ++j )
// First k with a potentially non-zero contribution: a lower B skips k < j
// (k <= j if strictly lower), an upper A skips k < i.
3641 const size_t kbegin( ( IsLower_v<MT5> )
3642 ?( ( IsUpper_v<MT4> )
3643 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3644 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3645 :( IsUpper_v<MT4> ? i : 0UL ) );
// One past the last k: an upper B stops at j (before j if strictly upper),
// a lower A stops at the end of the current row block.
3646 const size_t kend( ( IsUpper_v<MT5> )
3647 ?( ( IsLower_v<MT4> )
3648 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
3649 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
3650 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// Subtract from 8 SIMD vectors of column j; b1 is set up on a line not shown here.
3661 for(
size_t k=kbegin; k<kend; ++k ) {
3663 xmm1 -= A.load(i ,k) * b1;
3664 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3665 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3666 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3667 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
3668 xmm6 -= A.load(i+
SIMDSIZE*5UL,k) * b1;
3669 xmm7 -= A.load(i+
SIMDSIZE*6UL,k) * b1;
3670 xmm8 -= A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write the accumulators back to C.
3673 C.store( i , j, xmm1 );
3675 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3676 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3677 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
3678 C.store( i+
SIMDSIZE*5UL, j, xmm6 );
3679 C.store( i+
SIMDSIZE*6UL, j, xmm7 );
3680 C.store( i+
SIMDSIZE*7UL, j, xmm8 );
// --- Blocks of 5 SIMD vectors, two columns (j, j+1) at a time ---
3689 for( ; (j+2UL) <= N; j+=2UL )
3691 const size_t kbegin( ( IsLower_v<MT5> )
3692 ?( ( IsUpper_v<MT4> )
3693 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3694 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3695 :( IsUpper_v<MT4> ? i : 0UL ) );
// The upper bound for an upper B refers to the last column of the pair (j+1).
3696 const size_t kend( ( IsUpper_v<MT5> )
3697 ?( ( IsLower_v<MT4> )
3698 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3699 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3700 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// xmm1..xmm5 hold column j, xmm6..xmm10 hold column j+1 (see the stores below).
3707 SIMDType xmm6 ( C.load(i ,j+1UL) );
3713 for(
size_t k=kbegin; k<kend; ++k ) {
3733 C.store( i , j , xmm1 );
3735 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3736 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3737 C.store( i+
SIMDSIZE*4UL, j , xmm5 );
3738 C.store( i , j+1UL, xmm6 );
3739 C.store( i+
SIMDSIZE , j+1UL, xmm7 );
3740 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 );
3741 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 );
3742 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 );
// Blocks of 5 SIMD vectors, single remaining column j.
3747 const size_t kbegin( ( IsLower_v<MT5> )
3748 ?( ( IsUpper_v<MT4> )
3749 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3750 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3751 :( IsUpper_v<MT4> ? i : 0UL ) );
3752 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
3760 for(
size_t k=kbegin; k<kend; ++k ) {
3762 xmm1 -= A.load(i ,k) * b1;
3763 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3764 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3765 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3766 xmm5 -= A.load(i+
SIMDSIZE*4UL,k) * b1;
3769 C.store( i , j, xmm1 );
3771 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3772 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
3773 C.store( i+
SIMDSIZE*4UL, j, xmm5 );
// --- Blocks of 4 SIMD vectors, two columns at a time ---
3781 for( ; (j+2UL) <= N; j+=2UL )
3783 const size_t kbegin( ( IsLower_v<MT5> )
3784 ?( ( IsUpper_v<MT4> )
3785 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3786 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3787 :( IsUpper_v<MT4> ? i : 0UL ) );
3788 const size_t kend( ( IsUpper_v<MT5> )
3789 ?( ( IsLower_v<MT4> )
3790 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3791 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3792 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
3803 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
3820 C.store( i , j , xmm1 );
3822 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3823 C.store( i+
SIMDSIZE*3UL, j , xmm4 );
3824 C.store( i , j+1UL, xmm5 );
3825 C.store( i+
SIMDSIZE , j+1UL, xmm6 );
3826 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 );
3827 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 );
// Blocks of 4 SIMD vectors, single remaining column j.
3832 const size_t kbegin( ( IsLower_v<MT5> )
3833 ?( ( IsUpper_v<MT4> )
3834 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3835 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3836 :( IsUpper_v<MT4> ? i : 0UL ) );
3837 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
3844 for(
size_t k=kbegin; k<kend; ++k ) {
3846 xmm1 -= A.load(i ,k) * b1;
3847 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3848 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3849 xmm4 -= A.load(i+
SIMDSIZE*3UL,k) * b1;
3852 C.store( i , j, xmm1 );
3854 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
3855 C.store( i+
SIMDSIZE*3UL, j, xmm4 );
// --- Blocks of 3 SIMD vectors, two columns at a time ---
3863 for( ; (j+2UL) <= N; j+=2UL )
3865 const size_t kbegin( ( IsLower_v<MT5> )
3866 ?( ( IsUpper_v<MT4> )
3867 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3868 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3869 :( IsUpper_v<MT4> ? i : 0UL ) );
3870 const size_t kend( ( IsUpper_v<MT5> )
3871 ?( ( IsLower_v<MT4> )
3872 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
3873 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
3874 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
3883 for(
size_t k=kbegin; k<kend; ++k ) {
// xmm1..xmm3 belong to column j, xmm4..xmm6 to column j+1.
3897 C.store( i , j , xmm1 );
3899 C.store( i+
SIMDSIZE*2UL, j , xmm3 );
3900 C.store( i , j+1UL, xmm4 );
3901 C.store( i+
SIMDSIZE , j+1UL, xmm5 );
3902 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 );
// Blocks of 3 SIMD vectors, single remaining column j.
3907 const size_t kbegin( ( IsLower_v<MT5> )
3908 ?( ( IsUpper_v<MT4> )
3909 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3910 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3911 :( IsUpper_v<MT4> ? i : 0UL ) );
3912 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
3918 for(
size_t k=kbegin; k<kend; ++k ) {
3920 xmm1 -= A.load(i ,k) * b1;
3921 xmm2 -= A.load(i+
SIMDSIZE ,k) * b1;
3922 xmm3 -= A.load(i+
SIMDSIZE*2UL,k) * b1;
3925 C.store( i , j, xmm1 );
3927 C.store( i+
SIMDSIZE*2UL, j, xmm3 );
// --- Blocks of 2 SIMD vectors; from here on the column range honours the
// result structure: an upper result (UPP) starts at column i. ---
3934 size_t j(
UPP ? i : 0UL );
// Four columns at a time.
3936 for( ; (j+4UL) <= jend; j+=4UL )
3938 const size_t kbegin( ( IsLower_v<MT5> )
3939 ?( ( IsUpper_v<MT4> )
3940 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3941 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3942 :( IsUpper_v<MT4> ? i : 0UL ) );
3943 const size_t kend( ( IsUpper_v<MT5> )
3944 ?( ( IsLower_v<MT4> )
3945 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
3946 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
3947 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
3958 for(
size_t k=kbegin; k<kend; ++k ) {
// Accumulator pairs (xmm1,xmm2) .. (xmm7,xmm8) map to columns j .. j+3.
3975 C.store( i , j , xmm1 );
3977 C.store( i , j+1UL, xmm3 );
3978 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
3979 C.store( i , j+2UL, xmm5 );
3980 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
3981 C.store( i , j+3UL, xmm7 );
3982 C.store( i+
SIMDSIZE, j+3UL, xmm8 );
// Three columns at a time.
3985 for( ; (j+3UL) <= jend; j+=3UL )
3987 const size_t kbegin( ( IsLower_v<MT5> )
3988 ?( ( IsUpper_v<MT4> )
3989 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
3990 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
3991 :( IsUpper_v<MT4> ? i : 0UL ) );
3992 const size_t kend( ( IsUpper_v<MT5> )
3993 ?( ( IsLower_v<MT4> )
3994 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
3995 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
3996 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4005 for(
size_t k=kbegin; k<kend; ++k ) {
4019 C.store( i , j , xmm1 );
4021 C.store( i , j+1UL, xmm3 );
4022 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
4023 C.store( i , j+2UL, xmm5 );
4024 C.store( i+
SIMDSIZE, j+2UL, xmm6 );
// Two columns at a time.
4027 for( ; (j+2UL) <= jend; j+=2UL )
4029 const size_t kbegin( ( IsLower_v<MT5> )
4030 ?( ( IsUpper_v<MT4> )
4031 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4032 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4033 :( IsUpper_v<MT4> ? i : 0UL ) );
4034 const size_t kend( ( IsUpper_v<MT5> )
4035 ?( ( IsLower_v<MT4> )
4036 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
4037 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
4038 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
4045 for(
size_t k=kbegin; k<kend; ++k ) {
4056 C.store( i , j , xmm1 );
4058 C.store( i , j+1UL, xmm3 );
4059 C.store( i+
SIMDSIZE, j+1UL, xmm4 );
// Single remaining column for the 2-vector row block.
4064 const size_t kbegin( ( IsLower_v<MT5> )
4065 ?( ( IsUpper_v<MT4> )
4066 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4067 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4068 :( IsUpper_v<MT4> ? i : 0UL ) );
4069 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
4074 for(
size_t k=kbegin; k<kend; ++k ) {
4076 xmm1 -= A.load(i ,k) * b1;
4080 C.store( i , j, xmm1 );
// --- Blocks of a single SIMD vector ---
4088 size_t j(
UPP ? i : 0UL );
// Four columns at a time: one vector of A (a1) is combined with four
// broadcast elements of B.
4090 for( ; (j+4UL) <= jend; j+=4UL )
4092 const size_t kbegin( ( IsLower_v<MT5> )
4093 ?( ( IsUpper_v<MT4> )
4094 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4095 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4096 :( IsUpper_v<MT4> ? i : 0UL ) );
4097 const size_t kend( ( IsUpper_v<MT5> )
4098 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
4106 for(
size_t k=kbegin; k<kend; ++k ) {
4108 xmm1 -= a1 *
set( B(k,j ) );
4109 xmm2 -= a1 *
set( B(k,j+1UL) );
4110 xmm3 -= a1 *
set( B(k,j+2UL) );
4111 xmm4 -= a1 *
set( B(k,j+3UL) );
4114 C.store( i, j , xmm1 );
4115 C.store( i, j+1UL, xmm2 );
4116 C.store( i, j+2UL, xmm3 );
4117 C.store( i, j+3UL, xmm4 );
// Three columns at a time.
4120 for( ; (j+3UL) <= jend; j+=3UL )
4122 const size_t kbegin( ( IsLower_v<MT5> )
4123 ?( ( IsUpper_v<MT4> )
4124 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4125 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4126 :( IsUpper_v<MT4> ? i : 0UL ) );
4127 const size_t kend( ( IsUpper_v<MT5> )
4128 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
4135 for(
size_t k=kbegin; k<kend; ++k ) {
4137 xmm1 -= a1 *
set( B(k,j ) );
4138 xmm2 -= a1 *
set( B(k,j+1UL) );
4139 xmm3 -= a1 *
set( B(k,j+2UL) );
4142 C.store( i, j , xmm1 );
4143 C.store( i, j+1UL, xmm2 );
4144 C.store( i, j+2UL, xmm3 );
// Two columns at a time.
4147 for( ; (j+2UL) <= jend; j+=2UL )
4149 const size_t kbegin( ( IsLower_v<MT5> )
4150 ?( ( IsUpper_v<MT4> )
4151 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4152 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4153 :( IsUpper_v<MT4> ? i : 0UL ) );
4154 const size_t kend( ( IsUpper_v<MT5> )
4155 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4161 for(
size_t k=kbegin; k<kend; ++k ) {
4163 xmm1 -= a1 *
set( B(k,j ) );
4164 xmm2 -= a1 *
set( B(k,j+1UL) );
4167 C.store( i, j , xmm1 );
4168 C.store( i, j+1UL, xmm2 );
// Single remaining column: the summation runs up to K without an upper-B bound.
4173 const size_t kbegin( ( IsLower_v<MT5> )
4174 ?( ( IsUpper_v<MT4> )
4175 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4176 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4177 :( IsUpper_v<MT4> ? i : 0UL ) );
4181 for(
size_t k=kbegin; k<K; ++k ) {
4182 xmm1 -= A.load(i,k) *
set( B(k,j) );
4185 C.store( i, j, xmm1 );
// --- Scalar clean-up for the remaining rows; only active if C or A is not padded ---
4189 for( ; remainder && i<M; ++i )
// Column range restricted by the result structure: a lower result (LOW)
// ends at column i, an upper result (UPP) starts at column i.
4191 const size_t jend(
LOW ? i+1UL : N );
4192 size_t j(
UPP ? i : 0UL );
// Two columns at a time.
4194 for( ; (j+2UL) <= jend; j+=2UL )
4196 const size_t kbegin( ( IsLower_v<MT5> )
4197 ?( ( IsUpper_v<MT4> )
4198 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4199 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4200 :( IsUpper_v<MT4> ? i : 0UL ) );
4201 const size_t kend( ( IsUpper_v<MT5> )
4202 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
4208 for(
size_t k=kbegin; k<kend; ++k ) {
4209 value1 -= A(i,k) * B(k,j );
4210 value2 -= A(i,k) * B(k,j+1UL);
4214 C(i,j+1UL) = value2;
// Single remaining column.
4219 const size_t kbegin( ( IsLower_v<MT5> )
4220 ?( ( IsUpper_v<MT4> )
4221 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
4222 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
4223 :( IsUpper_v<MT4> ? i : 0UL ) );
4227 for(
size_t k=kbegin; k<K; ++k ) {
4228 value -= A(i,k) * B(k,j);
// Large-matrix subtraction-assignment kernel used when the vectorized default
// kernel is NOT applicable (DisableIf_t on UseVectorizedDefaultKernel_v).
// Forwards to the default (scalar) subtraction-assignment kernel.
4252 template<
typename MT3
4255 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4256 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4258 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction-assignment kernel selected when the vectorized
// default kernel IS applicable (EnableIf_t on UseVectorizedDefaultKernel_v).
// NOTE(review): the function body is not visible here; what it delegates to
// must be confirmed against the complete file.
4278 template<
typename MT3
4281 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4282 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction-assignment entry point for operand combinations for which
// the BLAS kernel is NOT usable (DisableIf_t on UseBlasKernel_v).
// Falls back to the large-matrix subtraction-assignment kernel.
4308 template<
typename MT3
4311 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4312 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4314 selectLargeSubAssignKernel( C, A, B );
4320#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel (C -= A * B), enabled when
// UseBlasKernel_v holds for the three matrix types.
//   - A triangular: B is copied into a temporary, multiplied in place from the
//     left by A via trmm(), and the temporary is subtracted from C.
//   - B triangular: A is copied into a temporary, multiplied in place from the
//     right by B via trmm(), and the temporary is subtracted from C.
//   - otherwise: a single gemm() call with the factors ET(-1) and ET(1).
4334 template<
typename MT3
4337 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4338 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Element type of the target; used to build the scalar factors.
4340 using ET = ElementType_t<MT3>;
4342 if( IsTriangular_v<MT4> ) {
4343 ResultType_t<MT3> tmp(
serial( B ) );
// The lower/upper flag is chosen from the structure of A.
4344 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4345 subAssign( C, tmp );
4347 else if( IsTriangular_v<MT5> ) {
4348 ResultType_t<MT3> tmp(
serial( A ) );
// The lower/upper flag is chosen from the structure of B.
4349 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4350 subAssign( C, tmp );
// General case. NOTE(review): ET(-1) is presumably the factor of the product
// (alpha) and ET(1) the factor of the existing C (beta), giving C - A*B -
// confirm against the gemm() wrapper's parameter order.
4353 gemm( C, A, B, ET(-1), ET(1) );
// Symmetry-exploiting overload, enabled when CanExploitSymmetry_v holds for
// the target and the two operand types.
// NOTE(review): the line naming this function is not visible here; since the
// body ends in a subAssign() call it is presumably the restructuring
// subAssign() overload - confirm.
4376 template<
typename MT >
4378 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4387 const ForwardFunctor fwd;
// Each symmetric operand is passed through transIf<true>, the other through
// transIf<false>. NOTE(review): presumably transIf transposes only when its
// flag is true - confirm.
4389 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4390 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
// Re-dispatch the subtraction assignment on the restructured product.
4392 subAssign( *lhs, fwd( A * B ) );
// Schur-product assignment of the matrix product to a dense matrix.
//   lhs: target dense matrix
//   rhs: the multiplication expression
// The call below forwards to schurAssign() on the underlying matrix with
// `tmp`, which is set up on a line not shown here.
// NOTE(review): tmp is presumably the evaluated product - confirm.
4414 template<
typename MT
4416 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const TDMatTDMatMultExpr& rhs )
4428 schurAssign( *lhs, tmp );
4457 template<
typename MT
4460 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4467 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
4470 else if( rhs.lhs_.columns() == 0UL ) {
4506 template<
typename MT
4509 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4513 using TmpType = If_t< SO, ResultType, OppositeType >;
4525 const ForwardFunctor fwd;
4527 const TmpType tmp( rhs );
4548 template<
typename MT >
4550 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4559 const ForwardFunctor fwd;
4561 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4562 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4585 template<
typename MT
4588 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4595 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4630 template<
typename MT >
4632 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4641 const ForwardFunctor fwd;
4643 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4644 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4671 template<
typename MT
4674 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4681 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4716 template<
typename MT >
4718 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4727 const ForwardFunctor fwd;
4729 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4730 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4755 template<
typename MT
4815template<
typename MT1
4822class DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >
4823 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, true >, true > >
4824 ,
private Computation
4829 using MMM = TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4831 using RES = ResultType_t<MMM>;
4832 using RT1 = ResultType_t<MT1>;
4833 using RT2 = ResultType_t<MT2>;
4834 using ET1 = ElementType_t<RT1>;
4835 using ET2 = ElementType_t<RT2>;
4836 using CT1 = CompositeType_t<MT1>;
4837 using CT2 = CompositeType_t<MT2>;
// True when the left operand type is itself a computation or reports that it
// requires an intermediate evaluation.
4842 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Same test for the right operand type.
4847 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the template flags SF/HF/LF/UF
// (presumably symmetric/Hermitian/lower/upper - confirm against the class
// documentation).
// SYM: only SF is set, none of HF/LF/UF.
4851 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF.
4852 static constexpr bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or SF/HF is combined with UF.
4853 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or SF/HF is combined with LF (mirror image of LOW).
4854 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Compile-time switch: true when the target type T1 is a row-major matrix and
// at least one of the two operand types T2/T3 is symmetric.
4863 template<
typename T1,
typename T2,
typename T3 >
4864 static constexpr bool CanExploitSymmetry_v =
4865 ( IsRowMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
// Compile-time switch: true when at least one operand has to be evaluated
// (evaluateLeft or evaluateRight) and the symmetry shortcut described by
// CanExploitSymmetry_v<T1,T2,T3> is not available.
4873 template<
typename T1,
typename T2,
typename T3 >
4874 static constexpr bool IsEvaluationRequired_v =
4875 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
// Compile-time switch deciding whether the BLAS kernels may be used for
// target type T1, operand types T2/T3 and scalar type T4.
// The first line of the condition is elided in this view; the visible terms
// all have to hold.
4882 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4883 static constexpr bool UseBlasKernel_v =
// None of the structure flags may be active.
4885 !SYM && !HERM && !LOW && !UPP &&
// All three matrices must be contiguous with direct data access
// (mutable for the target, const for the operands).
4886 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4887 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4888 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
// Diagonal operands are excluded.
4889 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4890 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// All element types must be BLAS compatible and identical to each other.
4891 IsBLASCompatible_v< ElementType_t<T1> > &&
4892 IsBLASCompatible_v< ElementType_t<T2> > &&
4893 IsBLASCompatible_v< ElementType_t<T3> > &&
4894 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4895 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
// A complex scalar combined with a builtin element type is rejected.
4896 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch deciding whether the vectorized default kernels may be
// used for target type T1, operand types T2/T3 and scalar type T4.
// Part of the IsSIMDCombinable_v argument list is elided in this view.
4903 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4904 static constexpr bool UseVectorizedDefaultKernel_v =
4905 ( useOptimizedKernels &&
// Only the left operand type is checked for being diagonal here.
4906 !IsDiagonal_v<T2> &&
4907 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4908 IsSIMDCombinable_v< ElementType_t<T1>
// NOTE(review): the add trait pairs T2's element type with itself and the mult
// trait pairs T3's element type with itself, rather than T2 with T3. This may
// be equivalent given the IsSIMDCombinable_v requirement above - confirm.
4912 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T2> > &&
4913 HasSIMDMult_v< ElementType_t<T3>, ElementType_t<T3> > );
4920 using ForwardFunctor =
If_t< HERM
4936 using This = DMatScalarMultExpr<MMM,ST,true>;
4939 using BaseType = MatScalarMultExpr< DenseMatrix<This,true> >;
4943 , DeclHermTrait< MultTrait_t<RES,ST> >
4945 , DeclSymTrait< MultTrait_t<RES,ST> >
4948 , DeclDiagTrait< MultTrait_t<RES,ST> >
4949 , DeclLowTrait< MultTrait_t<RES,ST> > >
4951 , DeclUppTrait< MultTrait_t<RES,ST> >
4952 , MultTrait<RES,ST> > > > >::Type;
4957 using SIMDType = SIMDTrait_t<ElementType>;
4962 using LeftOperand =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4968 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4971 using RT = If_t< evaluateRight, const RT2, CT2 >;
4977 ( !IsDiagonal_v<MT1> &&
4978 MT1::simdEnabled && MT2::simdEnabled &&
4979 IsSIMDCombinable_v<ET1,ET2,ST> &&
4980 HasSIMDAdd_v<ET1,ET2> &&
4981 HasSIMDMult_v<ET1,ET2> );
4985 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
5031 if( j >=
matrix_.columns() ) {
5034 return (*
this)(i,j);
5043 inline size_t rows()
const {
5053 inline size_t columns()
const {
// Reports whether this expression can alias with the given address.
//   alias  address to check
// The answer is taken directly from the wrapped multiplication expression.
5084 template<
typename T >
5085 inline bool canAlias(
const T* alias )
const {
5086 return matrix_.canAlias( alias );
// Reports whether this expression is aliased with the given address.
//   alias  address to check
// The answer is taken directly from the wrapped multiplication expression.
5096 template<
typename T >
5097 inline bool isAliased(
const T* alias )
const {
5098 return matrix_.isAliased( alias );
5119 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5121 (
rows() *
columns() < TDMATTDMATMULT_THRESHOLD ) ) &&
5122 (
rows() *
columns() >= SMP_TDMATTDMATMULT_THRESHOLD );
5144 template<
typename MT
5147 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5154 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5155 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5157 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
5160 else if( left.columns() == 0UL ) {
5175 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
// Entry point of the kernel selection for the scaled assignment.
//   C       target matrix
//   A       left-hand side operand
//   B       right-hand side operand
//   scalar  scaling factor
5190 template<
typename MT3
5194 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
// The small kernel is chosen when A is diagonal, when (outside debug mode) A has
// at most SIMDSIZE*10 rows, or when C has fewer elements than
// TDMATTDMATMULT_THRESHOLD.
5196 if( ( IsDiagonal_v<MT4> ) ||
5197 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
5198 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
5199 selectSmallAssignKernel( C, A, B, scalar );
// Presumably the `else` branch (the keyword is on an elided line): everything
// else goes through the BLAS kernel selection.
5201 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-vectorized) assignment kernel for the case that neither
// operand is diagonal.
//   C       target matrix
//   A       left-hand side operand
//   B       right-hand side operand
//   scalar  scaling factor
// Several lines of this function (braces, the else-branches of some ternaries,
// reset calls and the final scaling statement) are elided in this view.
5219 template<
typename MT3
5223 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5224 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5226 const size_t M( A.rows() );
5227 const size_t N( B.columns() );
5228 const size_t K( A.columns() );
// The result is produced one column j at a time.
5232 for(
size_t j=0UL; j<N; ++j )
// Range of the inner dimension k for column j, narrowed according to whether B
// is (strictly) lower or upper.
5234 const size_t kbegin( ( IsLower_v<MT5> )
5235 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5237 const size_t kend( ( IsUpper_v<MT5> )
5238 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Strictly triangular B with an empty k range: the loop over all rows of the
// column follows (its body is elided; presumably it resets C(i,j)).
5242 if( IsStrictlyTriangular_v<MT5> && kbegin == kend ) {
5243 for(
size_t i=0UL; i<M; ++i ) {
// First pass (k == kbegin): row range [ibegin,iend) derived from the structure
// of A and from the LOW/UPP flags.
5250 const size_t ibegin( ( IsLower_v<MT4> )
5251 ?( ( IsStrictlyLower_v<MT4> )
5252 ?( LOW ?
max(j,kbegin+1UL) : kbegin+1UL )
5253 :( LOW ?
max(j,kbegin) : kbegin ) )
5254 :( LOW ? j : 0UL ) );
5255 const size_t iend( ( IsUpper_v<MT4> )
5256 ?( ( IsStrictlyUpper_v<MT4> )
5257 ?( UPP ?
min(j+1UL,kbegin) : kbegin )
5258 :( UPP ?
min(j,kbegin)+1UL : kbegin+1UL ) )
5259 :( UPP ? j+1UL : M ) );
// Rows in front of ibegin are visited separately (loop body elided).
5261 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5262 for(
size_t i=0UL; i<ibegin; ++i ) {
5266 else if( IsStrictlyLower_v<MT4> ) {
// Plain assignment (not +=): this pass initializes C(i,j) with the first product.
5269 for(
size_t i=ibegin; i<iend; ++i ) {
5270 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Rows from iend on are visited separately (loop body elided).
5272 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5273 for(
size_t i=iend; i<M; ++i ) {
5277 else if( IsStrictlyUpper_v<MT4> ) {
5278 reset( C(M-1UL,j) );
// Remaining values of k: accumulate into the already initialized elements.
5282 for(
size_t k=kbegin+1UL; k<kend; ++k )
5284 const size_t ibegin( ( IsLower_v<MT4> )
5285 ?( ( IsStrictlyLower_v<MT4> )
5286 ?( SYM || HERM || LOW ?
max( j, k+1UL ) : k+1UL )
5287 :( SYM || HERM || LOW ?
max( j, k ) : k ) )
5288 :( SYM || HERM || LOW ? j : 0UL ) );
5289 const size_t iend( ( IsUpper_v<MT4> )
5290 ?( ( IsStrictlyUpper_v<MT4> )
5291 ?( UPP ?
min(j+1UL,k-1UL) : k-1UL )
5292 :( UPP ?
min(j+1UL,k) : k ) )
5293 :( UPP ? j+1UL : M ) );
// With an active structure flag the row range can be empty; skip this k then.
5295 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5298 for(
size_t i=ibegin; i<iend; ++i ) {
5299 C(i,j) += A(i,k) * B(k,j);
// For upper A the element in row iend receives its first contribution here,
// hence assignment instead of accumulation.
5301 if( IsUpper_v<MT4> ) {
5302 C(iend,j) = A(iend,k) * B(k,j);
// Final pass over the rows of column j that can be non-zero.
// NOTE(review): the loop body is elided in this view; presumably it applies
// `scalar` to C(i,j) - confirm against the full source.
5307 const size_t ibegin( ( IsLower_v<MT4> && IsLower_v<MT5> )
5308 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? j+1UL : j )
5309 :( ( SYM || HERM || LOW )?( j ):( 0UL ) ) );
5310 const size_t iend( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5311 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? j : j+1UL )
5312 :( UPP ? j+1UL : M ) );
5314 if( ( SYM || HERM || LOW || UPP ) && ( ibegin > iend ) )
continue;
5317 for(
size_t i=ibegin; i<iend; ++i ) {
// Mirror step: every element above the diagonal (i<j) is filled from its
// transposed position, conjugated in the Hermitian case.
// NOTE(review): the guarding condition is elided; presumably SYM || HERM.
5324 for(
size_t j=1UL; j<N; ++j ) {
5325 for(
size_t i=0UL; i<j; ++i ) {
5326 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel for a general left operand and a diagonal right
// operand.
//   C       target matrix
//   A       left-hand side operand (not diagonal)
//   B       right-hand side operand (diagonal)
//   scalar  scaling factor
5347 template<
typename MT3
5351 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5352 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5356 const size_t M( A.rows() );
5357 const size_t N( B.columns() );
5359 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of A
// (the else-branches of both ternaries are elided in this view).
5361 const size_t ibegin( ( IsLower_v<MT4> )
5362 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
5364 const size_t iend( ( IsUpper_v<MT4> )
5365 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// Rows in front of ibegin for lower A (loop body elided).
5369 if( IsLower_v<MT4> ) {
5370 for(
size_t i=0UL; i<ibegin; ++i ) {
// Only the diagonal element B(j,j) contributes to column j.
5374 for(
size_t i=ibegin; i<iend; ++i ) {
5375 C(i,j) = A(i,j) * B(j,j) * scalar;
// Rows from iend on for upper A (loop body elided).
5377 if( IsUpper_v<MT4> ) {
5378 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for a diagonal left operand and a general right
// operand.
//   C       target matrix
//   A       left-hand side operand (diagonal)
//   B       right-hand side operand (not diagonal)
//   scalar  scaling factor
5400 template<
typename MT3
5404 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5405 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5409 const size_t M( A.rows() );
5410 const size_t N( B.columns() );
5412 for(
size_t j=0UL; j<N; ++j )
// Row range of column j, narrowed by the triangular structure of B
// (the else-branches of both ternaries are elided in this view).
5414 const size_t ibegin( ( IsLower_v<MT5> )
5415 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
5417 const size_t iend( ( IsUpper_v<MT5> )
5418 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// Note that the guards below test MT4 (the diagonal operand) while the index
// range is derived from MT5.
5422 if( IsLower_v<MT4> ) {
5423 for(
size_t i=0UL; i<ibegin; ++i ) {
// Only the diagonal element A(i,i) contributes to row i.
5427 for(
size_t i=ibegin; i<iend; ++i ) {
5428 C(i,j) = A(i,i) * B(i,j) * scalar;
5430 if( IsUpper_v<MT4> ) {
5431 for(
size_t i=iend; i<M; ++i ) {
// Default assignment kernel for two diagonal operands.
//   C       target matrix
//   A       left-hand side operand (diagonal)
//   B       right-hand side operand (diagonal)
//   scalar  scaling factor
// Statements between the signature and the loop are elided in this view.
5453 template<
typename MT3
5457 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5458 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
// Only the diagonal of C is written: product of the two diagonal elements
// times the scalar.
5464 for(
size_t i=0UL; i<A.rows(); ++i ) {
5465 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix assignment kernel, non-vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
//   C       target matrix
//   A       left-hand side operand
//   B       right-hand side operand
//   scalar  scaling factor
5484 template<
typename MT3
5488 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5489 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward unchanged to the default kernel.
5491 selectDefaultAssignKernel( C, A, B, scalar );
// Small-matrix assignment kernel, vectorized overload for row-major targets.
// Enabled only when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//   C       target matrix (row-major)
//   A       left-hand side operand
//   B       right-hand side operand
//   scalar  scaling factor
// Instead of running its own loops, this overload converts one operand to its
// opposite type and re-issues the scaled product as a new expression.
5510 template<
typename MT3
5514 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5515 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Functor that is wrapped around the re-issued product.
5522 const ForwardFunctor fwd;
// A is resizable and B is not: convert B.
5524 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
5525 const OppositeType_t<MT5> tmp(
serial( B ) );
5526 assign( C, fwd( A * tmp ) * scalar );
// B is resizable and A is not: convert A.
5528 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
5529 const OppositeType_t<MT4> tmp(
serial( A ) );
5530 assign( C, fwd( tmp * B ) * scalar );
// Otherwise decide by element count: convert B when it is not larger than A ...
5532 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
5533 const OppositeType_t<MT5> tmp(
serial( B ) );
5534 assign( C, fwd( A * tmp ) * scalar );
// ... and convert A in the remaining case.
5537 const OppositeType_t<MT4> tmp(
serial( A ) );
5538 assign( C, fwd( tmp * B ) * scalar );
5558 template<
typename MT3
5562 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5563 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5565 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
5567 const size_t M( A.rows() );
5568 const size_t N( B.columns() );
5569 const size_t K( A.columns() );
5576 const SIMDType factor(
set( scalar ) );
5580 if( IsIntegral_v<ElementType> )
5582 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*7UL) < ipos; i+=
SIMDSIZE*8UL ) {
5583 for(
size_t j=0UL; j<N; ++j )
5585 const size_t kbegin( ( IsLower_v<MT5> )
5586 ?( ( IsUpper_v<MT4> )
5587 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5588 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5589 :( IsUpper_v<MT4> ? i : 0UL ) );
5590 const size_t kend( ( IsUpper_v<MT5> )
5591 ?( ( IsLower_v<MT4> )
5592 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
5593 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
5594 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
5600 SIMDType b1(
set( B(k,j) ) );
5601 SIMDType xmm1( A.load(i ,k) * b1 );
5602 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
5603 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
5604 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
5605 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
5606 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,k) * b1 );
5607 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,k) * b1 );
5608 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,k) * b1 );
5610 for( ++k; k<kend; ++k ) {
5612 xmm1 += A.load(i ,k) * b1;
5613 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5614 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5615 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5616 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
5617 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
5618 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
5619 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
5622 C.store( i , j, xmm1 * factor );
5623 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5624 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5625 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5626 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
5627 C.store( i+
SIMDSIZE*5UL, j, xmm6 * factor );
5628 C.store( i+
SIMDSIZE*6UL, j, xmm7 * factor );
5629 C.store( i+
SIMDSIZE*7UL, j, xmm8 * factor );
5633 const SIMDType
zero;
5634 C.store( i , j,
zero );
5647 for( ; !SYM && !HERM && !LOW && !UPP && (i+
SIMDSIZE*4UL) < ipos; i+=
SIMDSIZE*5UL )
5651 for( ; (j+2UL) <= N; j+=2UL )
5653 const size_t kbegin( ( IsLower_v<MT5> )
5654 ?( ( IsUpper_v<MT4> )
5655 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5656 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5657 :( IsUpper_v<MT4> ? i : 0UL ) );
5658 const size_t kend( ( IsUpper_v<MT5> )
5659 ?( ( IsLower_v<MT4> )
5660 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5661 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5662 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
5668 SIMDType a1( A.load(i ,k) );
5669 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5670 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5671 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
5672 SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
5673 SIMDType b1(
set( B(k,j ) ) );
5674 SIMDType b2(
set( B(k,j+1UL) ) );
5675 SIMDType xmm1 ( a1 * b1 );
5676 SIMDType xmm2 ( a2 * b1 );
5677 SIMDType xmm3 ( a3 * b1 );
5678 SIMDType xmm4 ( a4 * b1 );
5679 SIMDType xmm5 ( a5 * b1 );
5680 SIMDType xmm6 ( a1 * b2 );
5681 SIMDType xmm7 ( a2 * b2 );
5682 SIMDType xmm8 ( a3 * b2 );
5683 SIMDType xmm9 ( a4 * b2 );
5684 SIMDType xmm10( a5 * b2 );
5686 for( ++k; k<kend; ++k ) {
5692 b1 =
set( B(k,j ) );
5693 b2 =
set( B(k,j+1UL) );
5706 C.store( i , j , xmm1 * factor );
5707 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5708 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5709 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
5710 C.store( i+
SIMDSIZE*4UL, j , xmm5 * factor );
5711 C.store( i , j+1UL, xmm6 * factor );
5712 C.store( i+
SIMDSIZE , j+1UL, xmm7 * factor );
5713 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm8 * factor );
5714 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm9 * factor );
5715 C.store( i+
SIMDSIZE*4UL, j+1UL, xmm10 * factor );
5719 const SIMDType
zero;
5720 C.store( i , j ,
zero );
5725 C.store( i , j+1UL,
zero );
5735 const size_t kbegin( ( IsLower_v<MT5> )
5736 ?( ( IsUpper_v<MT4> )
5737 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5738 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5739 :( IsUpper_v<MT4> ? i : 0UL ) );
5740 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
5746 SIMDType b1(
set( B(k,j) ) );
5747 SIMDType xmm1( A.load(i ,k) * b1 );
5748 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
5749 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
5750 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
5751 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
5753 for( ++k; k<kend; ++k ) {
5755 xmm1 += A.load(i ,k) * b1;
5756 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5757 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5758 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5759 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
5762 C.store( i , j, xmm1 * factor );
5763 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5764 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5765 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5766 C.store( i+
SIMDSIZE*4UL, j, xmm5 * factor );
5770 const SIMDType
zero;
5771 C.store( i , j,
zero );
5782 const size_t jend( LOW ?
min(i+
SIMDSIZE*4UL,N) : N );
5788 for(
size_t ii=i; ii<iiend; ++ii ) {
5789 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
5796 for(
size_t ii=i; ii<iiend; ++ii ) {
5802 for( ; (j+2UL) <= jend; j+=2UL )
5804 const size_t kbegin( ( IsLower_v<MT5> )
5805 ?( ( IsUpper_v<MT4> )
5806 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5807 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5808 :( IsUpper_v<MT4> ? i : 0UL ) );
5809 const size_t kend( ( IsUpper_v<MT5> )
5810 ?( ( IsLower_v<MT4> )
5811 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5812 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5813 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
5819 SIMDType a1( A.load(i ,k) );
5820 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5821 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5822 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
5823 SIMDType b1(
set( B(k,j ) ) );
5824 SIMDType b2(
set( B(k,j+1UL) ) );
5825 SIMDType xmm1( a1 * b1 );
5826 SIMDType xmm2( a2 * b1 );
5827 SIMDType xmm3( a3 * b1 );
5828 SIMDType xmm4( a4 * b1 );
5829 SIMDType xmm5( a1 * b2 );
5830 SIMDType xmm6( a2 * b2 );
5831 SIMDType xmm7( a3 * b2 );
5832 SIMDType xmm8( a4 * b2 );
5834 for( ++k; k<kend; ++k ) {
5839 b1 =
set( B(k,j ) );
5840 b2 =
set( B(k,j+1UL) );
5851 C.store( i , j , xmm1 * factor );
5852 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5853 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5854 C.store( i+
SIMDSIZE*3UL, j , xmm4 * factor );
5855 C.store( i , j+1UL, xmm5 * factor );
5856 C.store( i+
SIMDSIZE , j+1UL, xmm6 * factor );
5857 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm7 * factor );
5858 C.store( i+
SIMDSIZE*3UL, j+1UL, xmm8 * factor );
5862 const SIMDType
zero;
5863 C.store( i , j ,
zero );
5867 C.store( i , j+1UL,
zero );
5876 const size_t kbegin( ( IsLower_v<MT5> )
5877 ?( ( IsUpper_v<MT4> )
5878 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5879 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5880 :( IsUpper_v<MT4> ? i : 0UL ) );
5881 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
5887 SIMDType b1(
set( B(k,j) ) );
5888 SIMDType xmm1( A.load(i ,k) * b1 );
5889 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
5890 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
5891 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
5893 for( ++k; k<kend; ++k ) {
5895 xmm1 += A.load(i ,k) * b1;
5896 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
5897 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
5898 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
5901 C.store( i , j, xmm1 * factor );
5902 C.store( i+
SIMDSIZE , j, xmm2 * factor );
5903 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
5904 C.store( i+
SIMDSIZE*3UL, j, xmm4 * factor );
5908 const SIMDType
zero;
5909 C.store( i , j,
zero );
5921 for(
size_t ii=i; ii<iiend; ++ii ) {
5930 const size_t jend( LOW ?
min(i+
SIMDSIZE*3UL,N) : N );
5936 for(
size_t ii=i; ii<iiend; ++ii ) {
5937 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
5944 for(
size_t ii=i; ii<iiend; ++ii ) {
5950 for( ; (j+2UL) <= jend; j+=2UL )
5952 const size_t kbegin( ( IsLower_v<MT5> )
5953 ?( ( IsUpper_v<MT4> )
5954 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
5955 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
5956 :( IsUpper_v<MT4> ? i : 0UL ) );
5957 const size_t kend( ( IsUpper_v<MT5> )
5958 ?( ( IsLower_v<MT4> )
5959 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
5960 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
5961 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
5967 SIMDType a1( A.load(i ,k) );
5968 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
5969 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
5970 SIMDType b1(
set( B(k,j ) ) );
5971 SIMDType b2(
set( B(k,j+1UL) ) );
5972 SIMDType xmm1( a1 * b1 );
5973 SIMDType xmm2( a2 * b1 );
5974 SIMDType xmm3( a3 * b1 );
5975 SIMDType xmm4( a1 * b2 );
5976 SIMDType xmm5( a2 * b2 );
5977 SIMDType xmm6( a3 * b2 );
5979 for( ++k; k<kend; ++k ) {
5983 b1 =
set( B(k,j ) );
5984 b2 =
set( B(k,j+1UL) );
5993 C.store( i , j , xmm1 * factor );
5994 C.store( i+
SIMDSIZE , j , xmm2 * factor );
5995 C.store( i+
SIMDSIZE*2UL, j , xmm3 * factor );
5996 C.store( i , j+1UL, xmm4 * factor );
5997 C.store( i+
SIMDSIZE , j+1UL, xmm5 * factor );
5998 C.store( i+
SIMDSIZE*2UL, j+1UL, xmm6 * factor );
6002 const SIMDType
zero;
6003 C.store( i , j ,
zero );
6006 C.store( i , j+1UL,
zero );
6014 const size_t kbegin( ( IsLower_v<MT5> )
6015 ?( ( IsUpper_v<MT4> )
6016 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6017 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6018 :( IsUpper_v<MT4> ? i : 0UL ) );
6019 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
6025 SIMDType b1(
set( B(k,j) ) );
6026 SIMDType xmm1( A.load(i ,k) * b1 );
6027 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
6028 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
6030 for( ++k; k<kend; ++k ) {
6032 xmm1 += A.load(i ,k) * b1;
6033 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
6034 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
6037 C.store( i , j, xmm1 * factor );
6038 C.store( i+
SIMDSIZE , j, xmm2 * factor );
6039 C.store( i+
SIMDSIZE*2UL, j, xmm3 * factor );
6043 const SIMDType
zero;
6044 C.store( i , j,
zero );
6055 for(
size_t ii=i; ii<iiend; ++ii ) {
6064 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
6070 for(
size_t ii=i; ii<iiend; ++ii ) {
6071 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
6078 for(
size_t ii=i; ii<iiend; ++ii ) {
6084 for( ; (j+4UL) <= jend; j+=4UL )
6086 const size_t kbegin( ( IsLower_v<MT5> )
6087 ?( ( IsUpper_v<MT4> )
6088 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6089 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6090 :( IsUpper_v<MT4> ? i : 0UL ) );
6091 const size_t kend( ( IsUpper_v<MT5> )
6092 ?( ( IsLower_v<MT4> )
6093 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
6094 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
6095 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6101 SIMDType a1( A.load(i ,k) );
6102 SIMDType a2( A.load(i+
SIMDSIZE,k) );
6103 SIMDType b1(
set( B(k,j ) ) );
6104 SIMDType b2(
set( B(k,j+1UL) ) );
6105 SIMDType b3(
set( B(k,j+2UL) ) );
6106 SIMDType b4(
set( B(k,j+3UL) ) );
6107 SIMDType xmm1( a1 * b1 );
6108 SIMDType xmm2( a2 * b1 );
6109 SIMDType xmm3( a1 * b2 );
6110 SIMDType xmm4( a2 * b2 );
6111 SIMDType xmm5( a1 * b3 );
6112 SIMDType xmm6( a2 * b3 );
6113 SIMDType xmm7( a1 * b4 );
6114 SIMDType xmm8( a2 * b4 );
6116 for( ++k; k<kend; ++k ) {
6119 b1 =
set( B(k,j ) );
6120 b2 =
set( B(k,j+1UL) );
6121 b3 =
set( B(k,j+2UL) );
6122 b4 =
set( B(k,j+3UL) );
6133 C.store( i , j , xmm1 * factor );
6134 C.store( i+
SIMDSIZE, j , xmm2 * factor );
6135 C.store( i , j+1UL, xmm3 * factor );
6136 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
6137 C.store( i , j+2UL, xmm5 * factor );
6138 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
6139 C.store( i , j+3UL, xmm7 * factor );
6140 C.store( i+
SIMDSIZE, j+3UL, xmm8 * factor );
6144 const SIMDType
zero;
6145 C.store( i , j ,
zero );
6147 C.store( i , j+1UL,
zero );
6149 C.store( i , j+2UL,
zero );
6151 C.store( i , j+3UL,
zero );
6156 for( ; (j+3UL) <= jend; j+=3UL )
6158 const size_t kbegin( ( IsLower_v<MT5> )
6159 ?( ( IsUpper_v<MT4> )
6160 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6161 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6162 :( IsUpper_v<MT4> ? i : 0UL ) );
6163 const size_t kend( ( IsUpper_v<MT5> )
6164 ?( ( IsLower_v<MT4> )
6165 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
6166 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
6167 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6173 SIMDType a1( A.load(i ,k) );
6174 SIMDType a2( A.load(i+
SIMDSIZE,k) );
6175 SIMDType b1(
set( B(k,j ) ) );
6176 SIMDType b2(
set( B(k,j+1UL) ) );
6177 SIMDType b3(
set( B(k,j+2UL) ) );
6178 SIMDType xmm1( a1 * b1 );
6179 SIMDType xmm2( a2 * b1 );
6180 SIMDType xmm3( a1 * b2 );
6181 SIMDType xmm4( a2 * b2 );
6182 SIMDType xmm5( a1 * b3 );
6183 SIMDType xmm6( a2 * b3 );
6185 for( ++k; k<kend; ++k ) {
6188 b1 =
set( B(k,j ) );
6189 b2 =
set( B(k,j+1UL) );
6190 b3 =
set( B(k,j+2UL) );
6199 C.store( i , j , xmm1 * factor );
6200 C.store( i+
SIMDSIZE, j , xmm2 * factor );
6201 C.store( i , j+1UL, xmm3 * factor );
6202 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
6203 C.store( i , j+2UL, xmm5 * factor );
6204 C.store( i+
SIMDSIZE, j+2UL, xmm6 * factor );
6208 const SIMDType
zero;
6209 C.store( i , j ,
zero );
6211 C.store( i , j+1UL,
zero );
6213 C.store( i , j+2UL,
zero );
6218 for( ; (j+2UL) <= jend; j+=2UL )
6220 const size_t kbegin( ( IsLower_v<MT5> )
6221 ?( ( IsUpper_v<MT4> )
6222 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6223 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6224 :( IsUpper_v<MT4> ? i : 0UL ) );
6225 const size_t kend( ( IsUpper_v<MT5> )
6226 ?( ( IsLower_v<MT4> )
6227 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
6228 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
6229 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
6235 SIMDType a1( A.load(i ,k) );
6236 SIMDType a2( A.load(i+
SIMDSIZE,k) );
6237 SIMDType b1(
set( B(k,j ) ) );
6238 SIMDType b2(
set( B(k,j+1UL) ) );
6239 SIMDType xmm1( a1 * b1 );
6240 SIMDType xmm2( a2 * b1 );
6241 SIMDType xmm3( a1 * b2 );
6242 SIMDType xmm4( a2 * b2 );
6244 for( ++k; k<kend; ++k ) {
6247 b1 =
set( B(k,j ) );
6248 b2 =
set( B(k,j+1UL) );
6255 C.store( i , j , xmm1 * factor );
6256 C.store( i+
SIMDSIZE, j , xmm2 * factor );
6257 C.store( i , j+1UL, xmm3 * factor );
6258 C.store( i+
SIMDSIZE, j+1UL, xmm4 * factor );
6262 const SIMDType
zero;
6263 C.store( i , j ,
zero );
6265 C.store( i , j+1UL,
zero );
6272 const size_t kbegin( ( IsLower_v<MT5> )
6273 ?( ( IsUpper_v<MT4> )
6274 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6275 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6276 :( IsUpper_v<MT4> ? i : 0UL ) );
6277 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
6283 SIMDType b1(
set( B(k,j) ) );
6284 SIMDType xmm1( A.load(i ,k) * b1 );
6285 SIMDType xmm2( A.load(i+
SIMDSIZE,k) * b1 );
6287 for( ++k; k<kend; ++k ) {
6289 xmm1 += A.load(i ,k) * b1;
6293 C.store( i , j, xmm1 * factor );
6294 C.store( i+
SIMDSIZE, j, xmm2 * factor );
6298 const SIMDType
zero;
6299 C.store( i , j,
zero );
6309 for(
size_t ii=i; ii<iiend; ++ii ) {
6324 for(
size_t ii=i; ii<iiend; ++ii ) {
6325 C(ii,j) = HERM ?
conj( C(j,ii) ) : C(j,ii);
6332 for(
size_t ii=i; ii<iiend; ++ii ) {
6338 for( ; (j+4UL) <= jend; j+=4UL )
6340 const size_t kbegin( ( IsLower_v<MT5> )
6341 ?( ( IsUpper_v<MT4> )
6342 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6343 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6344 :( IsUpper_v<MT4> ? i : 0UL ) );
6345 const size_t kend( ( IsUpper_v<MT5> )
6346 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
6353 SIMDType a1( A.load(i,k) );
6354 SIMDType xmm1( a1 *
set( B(k,j ) ) );
6355 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
6356 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
6357 SIMDType xmm4( a1 *
set( B(k,j+3UL) ) );
6359 for( ++k; k<kend; ++k ) {
6361 xmm1 += a1 *
set( B(k,j ) );
6362 xmm2 += a1 *
set( B(k,j+1UL) );
6363 xmm3 += a1 *
set( B(k,j+2UL) );
6364 xmm4 += a1 *
set( B(k,j+3UL) );
6367 C.store( i, j , xmm1 * factor );
6368 C.store( i, j+1UL, xmm2 * factor );
6369 C.store( i, j+2UL, xmm3 * factor );
6370 C.store( i, j+3UL, xmm4 * factor );
6374 const SIMDType
zero;
6375 C.store( i, j ,
zero );
6376 C.store( i, j+1UL,
zero );
6377 C.store( i, j+2UL,
zero );
6378 C.store( i, j+3UL,
zero );
6382 for( ; (j+3UL) <= jend; j+=3UL )
6384 const size_t kbegin( ( IsLower_v<MT5> )
6385 ?( ( IsUpper_v<MT4> )
6386 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6387 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6388 :( IsUpper_v<MT4> ? i : 0UL ) );
6389 const size_t kend( ( IsUpper_v<MT5> )
6390 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
6397 SIMDType a1( A.load(i,k) );
6398 SIMDType xmm1( a1 *
set( B(k,j ) ) );
6399 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
6400 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
6402 for( ++k; k<kend; ++k ) {
6404 xmm1 += a1 *
set( B(k,j ) );
6405 xmm2 += a1 *
set( B(k,j+1UL) );
6406 xmm3 += a1 *
set( B(k,j+2UL) );
6409 C.store( i, j , xmm1 * factor );
6410 C.store( i, j+1UL, xmm2 * factor );
6411 C.store( i, j+2UL, xmm3 * factor );
6415 const SIMDType
zero;
6416 C.store( i, j ,
zero );
6417 C.store( i, j+1UL,
zero );
6418 C.store( i, j+2UL,
zero );
6422 for( ; (j+2UL) <= jend; j+=2UL )
6424 const size_t kbegin( ( IsLower_v<MT5> )
6425 ?( ( IsUpper_v<MT4> )
6426 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6427 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6428 :( IsUpper_v<MT4> ? i : 0UL ) );
6429 const size_t kend( ( IsUpper_v<MT5> )
6430 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6437 SIMDType a1( A.load(i,k) );
6438 SIMDType xmm1( a1 *
set( B(k,j ) ) );
6439 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
6441 for( ++k; k<kend; ++k ) {
6443 xmm1 += a1 *
set( B(k,j ) );
6444 xmm2 += a1 *
set( B(k,j+1UL) );
6447 C.store( i, j , xmm1 * factor );
6448 C.store( i, j+1UL, xmm2 * factor );
6452 const SIMDType
zero;
6453 C.store( i, j ,
zero );
6454 C.store( i, j+1UL,
zero );
6460 const size_t kbegin( ( IsLower_v<MT5> )
6461 ?( ( IsUpper_v<MT4> )
6462 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6463 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6464 :( IsUpper_v<MT4> ? i : 0UL ) );
6470 SIMDType xmm1( A.load(i,k) *
set( B(k,j) ) );
6472 for( ++k; k<K; ++k ) {
6473 xmm1 += A.load(i,k) *
set( B(k,j) );
6476 C.store( i, j, xmm1 * factor );
6480 const SIMDType
zero;
6481 C.store( i, j,
zero );
6490 for(
size_t ii=i; ii<iiend; ++ii ) {
6497 for( ; remainder && i<M; ++i )
6503 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
6512 for( ; (j+2UL) <= N; j+=2UL )
6514 const size_t kbegin( ( IsLower_v<MT5> )
6515 ?( ( IsUpper_v<MT4> )
6516 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6517 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6518 :( IsUpper_v<MT4> ? i : 0UL ) );
6519 const size_t kend( ( IsUpper_v<MT5> )
6520 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
6530 for( ++k; k<kend; ++k ) {
6531 value1 += A(i,k) * B(k,j );
6532 value2 += A(i,k) * B(k,j+1UL);
6535 C(i,j ) = value1 * scalar;
6536 C(i,j+1UL) = value2 * scalar;
6541 reset( C(i,j+1UL) );
6547 const size_t kbegin( ( IsLower_v<MT5> )
6548 ?( ( IsUpper_v<MT4> )
6549 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
6550 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
6551 :( IsUpper_v<MT4> ? i : 0UL ) );
6559 for( ++k; k<K; ++k ) {
6560 value += A(i,k) * B(k,j);
6563 C(i,j) = value * scalar;
// Large-matrix assignment kernel, non-vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t).
//   C       target matrix
//   A       left-hand side operand
//   B       right-hand side operand
//   scalar  scaling factor
6588 template<
typename MT3
6592 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6593 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Forward unchanged to the default kernel.
6595 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, vectorized overload.
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
//   C       target matrix
//   A       left-hand side operand
//   B       right-hand side operand
//   scalar  scaling factor
6614 template<
typename MT3
6618 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6619 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// One of five multiplication routines is called. The conditions selecting
// between them are on elided lines.
// NOTE(review): presumably the dispatch is on SYM / HERM / LOW / UPP / none,
// in that order - confirm against the full source.
6622 smmm( C, A, B, scalar );
6624 hmmm( C, A, B, scalar );
// The last three routines additionally receive a zero of the scalar type.
6626 lmmm( C, A, B, scalar, ST2(0) );
6628 ummm( C, A, B, scalar, ST2(0) );
6630 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel: fallback overload.
// Chosen when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t); in that
// case no BLAS routine is called and the request is forwarded unchanged to
// selectLargeAssignKernel().
6648 template<
typename MT3
6652 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6653 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6655 selectLargeAssignKernel( C, A, B, scalar );
6660#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS assignment kernel: BLAS-backed overload.
// Enabled only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. The scaling
// factor is converted to the element type of the target (ET) before it is
// handed to the BLAS wrappers.
//   C      - target matrix
//   A, B   - left and right operands
//   scalar - scaling factor
// NOTE(review): several lines of the body (closing braces, the final else and
// possibly statements preceding each trmm call) are missing from this excerpt.
6674 template<
typename MT3
6678 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6679 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6681 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm with A applied from the left; the
// lower/upper flag follows IsLower_v<MT4>.
6683 if( IsTriangular_v<MT4> ) {
6685 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular right operand: trmm with B applied from the right; the
// lower/upper flag follows IsLower_v<MT5>.
6687 else if( IsTriangular_v<MT5> ) {
6689 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Remaining case: gemm with factors ET(scalar) and ET(0).
6692 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment through an intermediate temporary.
// Only available when CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
// NOTE(review): the line carrying the function name and parameters is missing
// from this excerpt; the body reads `lhs`, `rhs` and `SO`, so this is presumably
// an assign() overload for a target with storage-order flag SO - confirm.
6710 template<
typename MT
6713 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Temporary type: ResultType when SO is true, OppositeType otherwise.
6717 using TmpType = If_t< SO, ResultType, OppositeType >;
6729 const ForwardFunctor fwd;
// Evaluate the whole expression into the temporary via serial(), then assign
// the temporary (passed through the forward functor) to the target.
6731 const TmpType tmp(
serial( rhs ) );
6732 assign( *lhs, fwd( tmp ) );
// Restructuring assignment for the symmetry-exploiting case.
// Only available when CanExploitSymmetry_v<MT,MT1,MT2> is true (EnableIf_t).
// NOTE(review): the line carrying the function name and parameters is missing
// from this excerpt; the final statement calls assign(), so this is presumably
// the matching assign() overload - confirm.
6750 template<
typename MT >
6752 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6761 const ForwardFunctor fwd;
// Each operand is passed through transIf<>, keyed on whether its type is
// symmetric (IsSymmetric_v<MT1> / IsSymmetric_v<MT2>).
6763 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
6764 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
// Re-form the scaled product from the adjusted operands and assign it.
6766 assign( *lhs, fwd( A * B ) * rhs.scalar_ );
// Addition assignment of a scaled matrix product to a dense matrix.
// Only available when CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
//   lhs - dense target matrix that is added to
//   rhs - scaled multiplication expression
// NOTE(review): the body of the early-exit branch and the definitions of the
// evaluated operands A and B are on lines missing from this excerpt.
6782 template<
typename MT
6784 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6785 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped matrix product.
6792 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6793 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or columns, or the inner dimension
// (left.columns()) is zero.
6795 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the scaling factor.
6809 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment of a scaled matrix product.
//   C      - target matrix
//   A, B   - left and right operands
//   scalar - scaling factor
// The small kernel is used when any of the following holds:
//   - the left operand type is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the target has fewer than TDMATTDMATMULT_THRESHOLD elements.
// NOTE(review): the `else` in front of the BLAS call is on a line missing from
// this excerpt; presumably the BLAS kernel is the alternative branch.
6824 template<
typename MT3
6828 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6830 if( ( IsDiagonal_v<MT4> ) ||
6831 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
6832 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
6833 selectSmallAddAssignKernel( C, A, B, scalar );
6835 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel: neither operand is diagonal.
// Enabled when !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5>.
// Adds the temporary `tmp` to C.
// NOTE(review): the definition of `tmp` is on a line missing from this
// excerpt; presumably it holds the evaluated scaled product - confirm.
6853 template<
typename MT3
6857 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6858 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6861 addAssign( C, tmp );
// Default addition assignment kernel: general left operand, diagonal right
// operand (enabled when !IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
// For every column j the visible statements compute
//    C(i,j) += A(i,j) * B(j,j) * scalar
// i.e. column j of A is scaled by the j-th diagonal element of B.
// NOTE(review): the non-triangular alternatives of ibegin/iend, the braces and
// the guard around the final single-row update are missing from this excerpt.
6879 template<
typename MT3
6883 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6884 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6888 const size_t M( A.rows() );
6889 const size_t N( B.columns() );
6891 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when A is (strictly) lower or upper.
6893 const size_t ibegin( ( IsLower_v<MT4> )
6894 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
6896 const size_t iend( ( IsUpper_v<MT4> )
6897 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of the range handled two rows at a time.
6901 const size_t inum( iend - ibegin );
6902 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Two rows per iteration.
6905 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6906 C(i ,j) += A(i ,j) * B(j,j) * scalar;
6907 C(i+1UL,j) += A(i+1UL,j) * B(j,j) * scalar;
// Update of the row at index ipos (presumably the leftover row of an odd range).
6910 C(ipos,j) += A(ipos,j) * B(j,j) * scalar;
// Default addition assignment kernel: diagonal left operand, general right
// operand (enabled when IsDiagonal_v<MT4> && !IsDiagonal_v<MT5>).
// For every column j the visible statements compute
//    C(i,j) += A(i,i) * B(i,j) * scalar
// i.e. row i of B is scaled by the i-th diagonal element of A.
// NOTE(review): the non-triangular alternatives of ibegin/iend, the braces and
// the guard around the final single-row update are missing from this excerpt.
6930 template<
typename MT3
6934 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6935 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6939 const size_t M( A.rows() );
6940 const size_t N( B.columns() );
6942 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when B is (strictly) lower or upper.
6944 const size_t ibegin( ( IsLower_v<MT5> )
6945 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
6947 const size_t iend( ( IsUpper_v<MT5> )
6948 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part of the range handled two rows at a time.
6952 const size_t inum( iend - ibegin );
6953 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Two rows per iteration.
6956 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
6957 C(i ,j) += A(i ,i ) * B(i ,j) * scalar;
6958 C(i+1UL,j) += A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the row at index ipos (presumably the leftover row of an odd range).
6961 C(ipos,j) += A(ipos,ipos) * B(ipos,j) * scalar;
// Default addition assignment kernel: both operands are diagonal
// (enabled when IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
// Only the diagonal of C is touched: C(i,i) += A(i,i) * B(i,i) * scalar
// for every row index i of A.
6981 template<
typename MT3
6985 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6986 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6990 for(
size_t i=0UL; i<A.rows(); ++i ) {
6991 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel: fallback overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t); forwards unchanged to selectDefaultAddAssignKernel().
7010 template<
typename MT3
7014 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7015 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7017 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix addition assignment kernel: vectorized case with a row-major
// target (enabled when IsRowMajorMatrix_v<MT3> and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> both hold).
// One operand is copied into a temporary of its opposite storage type and the
// scaled product is then re-dispatched through addAssign().
// Which operand is converted:
//   - A resizable, B not      -> convert B
//   - A not resizable, B is   -> convert A
//   - otherwise               -> convert B if it has no more elements than A,
//                                else convert A
// NOTE(review): the closing braces and the final `else` are on lines missing
// from this excerpt.
7036 template<
typename MT3
7040 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7041 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7048 const ForwardFunctor fwd;
7050 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7051 const OppositeType_t<MT5> tmp(
serial( B ) );
7052 addAssign( C, fwd( A * tmp ) * scalar );
7054 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7055 const OppositeType_t<MT4> tmp(
serial( A ) );
7056 addAssign( C, fwd( tmp * B ) * scalar );
// Neither rule above applied: compare element counts.
7058 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
7059 const OppositeType_t<MT5> tmp(
serial( B ) );
7060 addAssign( C, fwd( A * tmp ) * scalar );
7063 const OppositeType_t<MT4> tmp(
serial( A ) );
7064 addAssign( C, fwd( tmp * B ) * scalar );
// Small-matrix addition assignment kernel: vectorized case with a column-major
// target (enabled when IsColumnMajorMatrix_v<MT3> and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> both hold).
//
// Every visible update has the form
//    C(i..,j) = C(i..,j) + ( sum over k of A(i..,k) * B(k,j) ) * scalar
// where A is read with SIMD loads along the row index, each B(k,j) is turned
// into a SIMD value with set(), and the accumulated sum is multiplied by
// `factor` before it is added to the loaded content of C.
//
// The body is a sequence of unrolled tiers. Judging from the A.load offsets
// they handle 8, 5, 4, 3, 2 and 1 SIMD vectors of rows per step, followed by a
// scalar loop for leftover rows. Within a tier several columns of B are
// processed together where possible (4, 3, 2, then 1).
//
// In each tier kbegin/kend restrict the k range according to the lower/upper
// structure of A (MT4) and B (MT5).
//
// NOTE(review): this excerpt is missing many lines of the original body: the
// row-loop headers of the SIMD tiers, the definitions of `i` and `k`, most
// accumulation and store statements, assertions and all closing braces. The
// comments below describe only what the remaining lines show.
7084 template<
typename MT3
7088 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7089 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder pass is needed unless both C and A are padded.
7091 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
7093 const size_t M( A.rows() );
7094 const size_t N( B.columns() );
7095 const size_t K( A.columns() );
// SIMD value built from the scaling factor.
7102 const SIMDType factor(
set( scalar ) );
// ---- Tier: 8 SIMD vectors of rows, one column at a time -------------------
// Entered only for integral element types.
7106 if( IsIntegral_v<ElementType> )
7109 for(
size_t j=0UL; j<N; ++j )
7111 const size_t kbegin( ( IsLower_v<MT5> )
7112 ?( ( IsUpper_v<MT4> )
7113 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7114 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7115 :( IsUpper_v<MT4> ? i : 0UL ) );
7116 const size_t kend( ( IsUpper_v<MT5> )
7117 ?( ( IsLower_v<MT4> )
7118 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
7119 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
7120 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// The first k iteration initialises the eight accumulators ...
7126 SIMDType b1(
set( B(k,j) ) );
7127 SIMDType xmm1( A.load(i ,k) * b1 );
7128 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
7129 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
7130 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
7131 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
7132 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,k) * b1 );
7133 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,k) * b1 );
7134 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,k) * b1 );
// ... and the remaining iterations accumulate into them.
7136 for( ++k; k<kend; ++k ) {
7138 xmm1 += A.load(i ,k) * b1;
7139 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7140 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7141 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7142 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
7143 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
7144 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
7145 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Add the scaled sum to the current content of C.
7148 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// ---- Tier: 5 SIMD vectors of rows, two columns at a time ------------------
7165 for( ; (j+2UL) <= N; j+=2UL )
7167 const size_t kbegin( ( IsLower_v<MT5> )
7168 ?( ( IsUpper_v<MT4> )
7169 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7170 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7171 :( IsUpper_v<MT4> ? i : 0UL ) );
7172 const size_t kend( ( IsUpper_v<MT5> )
7173 ?( ( IsLower_v<MT4> )
7174 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7175 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7176 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
7182 SIMDType a1( A.load(i ,k) );
7183 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7184 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7185 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
7186 SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
7187 SIMDType b1(
set( B(k,j ) ) );
7188 SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 belong to column j, xmm6..xmm10 to column j+1.
7189 SIMDType xmm1 ( a1 * b1 );
7190 SIMDType xmm2 ( a2 * b1 );
7191 SIMDType xmm3 ( a3 * b1 );
7192 SIMDType xmm4 ( a4 * b1 );
7193 SIMDType xmm5 ( a5 * b1 );
7194 SIMDType xmm6 ( a1 * b2 );
7195 SIMDType xmm7 ( a2 * b2 );
7196 SIMDType xmm8 ( a3 * b2 );
7197 SIMDType xmm9 ( a4 * b2 );
7198 SIMDType xmm10( a5 * b2 );
7200 for( ++k; k<kend; ++k ) {
7206 b1 =
set( B(k,j ) );
7207 b2 =
set( B(k,j+1UL) );
7220 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7225 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm6 * factor );
7227 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm8 * factor );
7228 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm9 * factor );
7229 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) + xmm10 * factor );
// 5-vector tier, single column (presumably the odd last column - confirm).
7235 const size_t kbegin( ( IsLower_v<MT5> )
7236 ?( ( IsUpper_v<MT4> )
7237 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7238 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7239 :( IsUpper_v<MT4> ? i : 0UL ) );
7240 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
7246 SIMDType b1(
set( B(k,j) ) );
7247 SIMDType xmm1( A.load(i ,k) * b1 );
7248 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
7249 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
7250 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
7251 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
7253 for( ++k; k<kend; ++k ) {
7255 xmm1 += A.load(i ,k) * b1;
7256 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7257 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7258 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7259 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
7262 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// ---- Tier: 4 SIMD vectors of rows, two columns at a time ------------------
7275 for( ; (j+2UL) <= N; j+=2UL )
7277 const size_t kbegin( ( IsLower_v<MT5> )
7278 ?( ( IsUpper_v<MT4> )
7279 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7280 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7281 :( IsUpper_v<MT4> ? i : 0UL ) );
7282 const size_t kend( ( IsUpper_v<MT5> )
7283 ?( ( IsLower_v<MT4> )
7284 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7285 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7286 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
7292 SIMDType a1( A.load(i ,k) );
7293 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7294 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7295 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
7296 SIMDType b1(
set( B(k,j ) ) );
7297 SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm4 belong to column j, xmm5..xmm8 to column j+1.
7298 SIMDType xmm1( a1 * b1 );
7299 SIMDType xmm2( a2 * b1 );
7300 SIMDType xmm3( a3 * b1 );
7301 SIMDType xmm4( a4 * b1 );
7302 SIMDType xmm5( a1 * b2 );
7303 SIMDType xmm6( a2 * b2 );
7304 SIMDType xmm7( a3 * b2 );
7305 SIMDType xmm8( a4 * b2 );
7307 for( ++k; k<kend; ++k ) {
7312 b1 =
set( B(k,j ) );
7313 b2 =
set( B(k,j+1UL) );
7324 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7328 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm5 * factor );
7330 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm7 * factor );
7331 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) + xmm8 * factor );
// 4-vector tier, single column.
7337 const size_t kbegin( ( IsLower_v<MT5> )
7338 ?( ( IsUpper_v<MT4> )
7339 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7340 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7341 :( IsUpper_v<MT4> ? i : 0UL ) );
7342 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
7348 SIMDType b1(
set( B(k,j) ) );
7349 SIMDType xmm1( A.load(i ,k) * b1 );
7350 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
7351 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
7352 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
7354 for( ++k; k<kend; ++k ) {
7356 xmm1 += A.load(i ,k) * b1;
7357 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7358 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7359 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
7362 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// ---- Tier: 3 SIMD vectors of rows, two columns at a time ------------------
7374 for( ; (j+2UL) <= N; j+=2UL )
7376 const size_t kbegin( ( IsLower_v<MT5> )
7377 ?( ( IsUpper_v<MT4> )
7378 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7379 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7380 :( IsUpper_v<MT4> ? i : 0UL ) );
7381 const size_t kend( ( IsUpper_v<MT5> )
7382 ?( ( IsLower_v<MT4> )
7383 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7384 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7385 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
7391 SIMDType a1( A.load(i ,k) );
7392 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
7393 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
7394 SIMDType b1(
set( B(k,j ) ) );
7395 SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm3 belong to column j, xmm4..xmm6 to column j+1.
7396 SIMDType xmm1( a1 * b1 );
7397 SIMDType xmm2( a2 * b1 );
7398 SIMDType xmm3( a3 * b1 );
7399 SIMDType xmm4( a1 * b2 );
7400 SIMDType xmm5( a2 * b2 );
7401 SIMDType xmm6( a3 * b2 );
7403 for( ++k; k<kend; ++k ) {
7407 b1 =
set( B(k,j ) );
7408 b2 =
set( B(k,j+1UL) );
7417 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7420 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm4 * factor );
7422 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) + xmm6 * factor );
// 3-vector tier, single column.
7428 const size_t kbegin( ( IsLower_v<MT5> )
7429 ?( ( IsUpper_v<MT4> )
7430 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7431 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7432 :( IsUpper_v<MT4> ? i : 0UL ) );
7433 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
7439 SIMDType b1(
set( B(k,j) ) );
7440 SIMDType xmm1( A.load(i ,k) * b1 );
7441 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
7442 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
7444 for( ++k; k<kend; ++k ) {
7446 xmm1 += A.load(i ,k) * b1;
7447 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
7448 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
7451 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// ---- Tier: 2 SIMD vectors of rows ----------------------------------------
// From this tier on the column range depends on the LOW/UPP flags: it starts
// at column i when UPP is set and is capped at i+SIMDSIZE*2 when LOW is set.
7460 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
7461 size_t j( UPP ? i : 0UL );
// Four columns at a time.
7463 for( ; (j+4UL) <= jend; j+=4UL )
7465 const size_t kbegin( ( IsLower_v<MT5> )
7466 ?( ( IsUpper_v<MT4> )
7467 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7468 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7469 :( IsUpper_v<MT4> ? i : 0UL ) );
7470 const size_t kend( ( IsUpper_v<MT5> )
7471 ?( ( IsLower_v<MT4> )
7472 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
7473 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
7474 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
7480 SIMDType a1( A.load(i ,k) );
7481 SIMDType a2( A.load(i+
SIMDSIZE,k) );
7482 SIMDType b1(
set( B(k,j ) ) );
7483 SIMDType b2(
set( B(k,j+1UL) ) );
7484 SIMDType b3(
set( B(k,j+2UL) ) );
7485 SIMDType b4(
set( B(k,j+3UL) ) );
// Accumulator pairs (xmm1,xmm2) .. (xmm7,xmm8) belong to columns j .. j+3.
7486 SIMDType xmm1( a1 * b1 );
7487 SIMDType xmm2( a2 * b1 );
7488 SIMDType xmm3( a1 * b2 );
7489 SIMDType xmm4( a2 * b2 );
7490 SIMDType xmm5( a1 * b3 );
7491 SIMDType xmm6( a2 * b3 );
7492 SIMDType xmm7( a1 * b4 );
7493 SIMDType xmm8( a2 * b4 );
7495 for( ++k; k<kend; ++k ) {
7498 b1 =
set( B(k,j ) );
7499 b2 =
set( B(k,j+1UL) );
7500 b3 =
set( B(k,j+2UL) );
7501 b4 =
set( B(k,j+3UL) );
7512 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7514 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7516 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
7518 C.store( i , j+3UL, C.load(i ,j+3UL) + xmm7 * factor );
// Three columns at a time.
7523 for( ; (j+3UL) <= jend; j+=3UL )
7525 const size_t kbegin( ( IsLower_v<MT5> )
7526 ?( ( IsUpper_v<MT4> )
7527 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7528 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7529 :( IsUpper_v<MT4> ? i : 0UL ) );
7530 const size_t kend( ( IsUpper_v<MT5> )
7531 ?( ( IsLower_v<MT4> )
7532 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
7533 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
7534 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
7540 SIMDType a1( A.load(i ,k) );
7541 SIMDType a2( A.load(i+
SIMDSIZE,k) );
7542 SIMDType b1(
set( B(k,j ) ) );
7543 SIMDType b2(
set( B(k,j+1UL) ) );
7544 SIMDType b3(
set( B(k,j+2UL) ) );
7545 SIMDType xmm1( a1 * b1 );
7546 SIMDType xmm2( a2 * b1 );
7547 SIMDType xmm3( a1 * b2 );
7548 SIMDType xmm4( a2 * b2 );
7549 SIMDType xmm5( a1 * b3 );
7550 SIMDType xmm6( a2 * b3 );
7552 for( ++k; k<kend; ++k ) {
7555 b1 =
set( B(k,j ) );
7556 b2 =
set( B(k,j+1UL) );
7557 b3 =
set( B(k,j+2UL) );
7566 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7568 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
7570 C.store( i , j+2UL, C.load(i ,j+2UL) + xmm5 * factor );
// Two columns at a time.
7575 for( ; (j+2UL) <= jend; j+=2UL )
7577 const size_t kbegin( ( IsLower_v<MT5> )
7578 ?( ( IsUpper_v<MT4> )
7579 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7580 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7581 :( IsUpper_v<MT4> ? i : 0UL ) );
7582 const size_t kend( ( IsUpper_v<MT5> )
7583 ?( ( IsLower_v<MT4> )
7584 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
7585 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
7586 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
7592 SIMDType a1( A.load(i ,k) );
7593 SIMDType a2( A.load(i+
SIMDSIZE,k) );
7594 SIMDType b1(
set( B(k,j ) ) );
7595 SIMDType b2(
set( B(k,j+1UL) ) );
7596 SIMDType xmm1( a1 * b1 );
7597 SIMDType xmm2( a2 * b1 );
7598 SIMDType xmm3( a1 * b2 );
7599 SIMDType xmm4( a2 * b2 );
7601 for( ++k; k<kend; ++k ) {
7604 b1 =
set( B(k,j ) );
7605 b2 =
set( B(k,j+1UL) );
7612 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7614 C.store( i , j+1UL, C.load(i ,j+1UL) + xmm3 * factor );
// 2-vector tier, single column.
7621 const size_t kbegin( ( IsLower_v<MT5> )
7622 ?( ( IsUpper_v<MT4> )
7623 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7624 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7625 :( IsUpper_v<MT4> ? i : 0UL ) );
7626 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
7632 SIMDType b1(
set( B(k,j) ) );
7633 SIMDType xmm1( A.load(i ,k) * b1 );
7634 SIMDType xmm2( A.load(i+
SIMDSIZE,k) * b1 );
7636 for( ++k; k<kend; ++k ) {
7638 xmm1 += A.load(i ,k) * b1;
7642 C.store( i , j, C.load(i ,j) + xmm1 * factor );
// ---- Tier: 1 SIMD vector of rows -------------------------------------------
// The column range is capped at i+SIMDSIZE only when both LOW and UPP are set.
7650 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
7651 size_t j( UPP ? i : 0UL );
// Four columns at a time; a1 is shared by all four accumulators.
7653 for( ; (j+4UL) <= jend; j+=4UL )
7655 const size_t kbegin( ( IsLower_v<MT5> )
7656 ?( ( IsUpper_v<MT4> )
7657 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7658 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7659 :( IsUpper_v<MT4> ? i : 0UL ) );
7660 const size_t kend( ( IsUpper_v<MT5> )
7661 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
7668 SIMDType a1( A.load(i,k) );
7669 SIMDType xmm1( a1 *
set( B(k,j ) ) );
7670 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
7671 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
7672 SIMDType xmm4( a1 *
set( B(k,j+3UL) ) );
7674 for( ++k; k<kend; ++k ) {
7676 xmm1 += a1 *
set( B(k,j ) );
7677 xmm2 += a1 *
set( B(k,j+1UL) );
7678 xmm3 += a1 *
set( B(k,j+2UL) );
7679 xmm4 += a1 *
set( B(k,j+3UL) );
7682 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7683 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
7684 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
7685 C.store( i, j+3UL, C.load(i,j+3UL) + xmm4 * factor );
// Three columns at a time.
7689 for( ; (j+3UL) <= jend; j+=3UL )
7691 const size_t kbegin( ( IsLower_v<MT5> )
7692 ?( ( IsUpper_v<MT4> )
7693 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7694 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7695 :( IsUpper_v<MT4> ? i : 0UL ) );
7696 const size_t kend( ( IsUpper_v<MT5> )
7697 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
7704 SIMDType a1( A.load(i,k) );
7705 SIMDType xmm1( a1 *
set( B(k,j ) ) );
7706 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
7707 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
7709 for( ++k; k<kend; ++k ) {
7711 xmm1 += a1 *
set( B(k,j ) );
7712 xmm2 += a1 *
set( B(k,j+1UL) );
7713 xmm3 += a1 *
set( B(k,j+2UL) );
7716 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7717 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
7718 C.store( i, j+2UL, C.load(i,j+2UL) + xmm3 * factor );
// Two columns at a time.
7722 for( ; (j+2UL) <= jend; j+=2UL )
7724 const size_t kbegin( ( IsLower_v<MT5> )
7725 ?( ( IsUpper_v<MT4> )
7726 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7727 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7728 :( IsUpper_v<MT4> ? i : 0UL ) );
7729 const size_t kend( ( IsUpper_v<MT5> )
7730 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7737 SIMDType a1( A.load(i,k) );
7738 SIMDType xmm1( a1 *
set( B(k,j ) ) );
7739 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
7741 for( ++k; k<kend; ++k ) {
7743 xmm1 += a1 *
set( B(k,j ) );
7744 xmm2 += a1 *
set( B(k,j+1UL) );
7747 C.store( i, j , C.load(i,j ) + xmm1 * factor );
7748 C.store( i, j+1UL, C.load(i,j+1UL) + xmm2 * factor );
// 1-vector tier, single column; here k runs up to K.
7754 const size_t kbegin( ( IsLower_v<MT5> )
7755 ?( ( IsUpper_v<MT4> )
7756 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7757 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7758 :( IsUpper_v<MT4> ? i : 0UL ) );
7764 SIMDType xmm1( A.load(i,k) *
set( B(k,j) ) );
7766 for( ++k; k<K; ++k ) {
7767 xmm1 += A.load(i,k) *
set( B(k,j) );
7770 C.store( i, j, C.load(i,j) + xmm1 * factor );
// ---- Scalar remainder ------------------------------------------------------
// Executed only when `remainder` is true: the rows from the current i up to M
// are handled element-wise, without SIMD loads or stores.
7775 for( ; remainder && i<M; ++i )
7777 const size_t jend( LOW ? i+1UL : N );
7778 size_t j( UPP ? i : 0UL );
// Two columns at a time.
7780 for( ; (j+2UL) <= jend; j+=2UL )
7782 const size_t kbegin( ( IsLower_v<MT5> )
7783 ?( ( IsUpper_v<MT4> )
7784 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7785 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7786 :( IsUpper_v<MT4> ? i : 0UL ) );
7787 const size_t kend( ( IsUpper_v<MT5> )
7788 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
7798 for( ++k; k<kend; ++k ) {
7799 value1 += A(i,k) * B(k,j );
7800 value2 += A(i,k) * B(k,j+1UL);
7803 C(i,j ) += value1 * scalar;
7804 C(i,j+1UL) += value2 * scalar;
// Single column.
7810 const size_t kbegin( ( IsLower_v<MT5> )
7811 ?( ( IsUpper_v<MT4> )
7812 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
7813 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
7814 :( IsUpper_v<MT4> ? i : 0UL ) );
7822 for( ++k; k<K; ++k ) {
7823 value += A(i,k) * B(k,j);
7826 C(i,j) += value * scalar;
// Large-matrix addition assignment kernel: fallback overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t); forwards unchanged to selectDefaultAddAssignKernel().
7847 template<
typename MT3
7851 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7852 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7854 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel: vectorized overload.
// Enabled only when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true.
// Calls one of lmmm / ummm / mmm with ST2(1) as the last argument. This
// differs from the plain assignment kernel in this file, which passes ST2(0).
// NOTE(review): the conditions choosing between the three calls are on lines
// missing from this excerpt; presumably they test LOW and UPP - confirm.
7873 template<
typename MT3
7877 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7878 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7881 lmmm( C, A, B, scalar, ST2(1) );
7883 ummm( C, A, B, scalar, ST2(1) );
7885 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition assignment kernel: fallback overload.
// Chosen when UseBlasKernel_v<MT3,MT4,MT5,ST2> is false (DisableIf_t);
// forwards unchanged to selectLargeAddAssignKernel().
7904 template<
typename MT3
7908 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7909 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7911 selectLargeAddAssignKernel( C, A, B, scalar );
7916#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS addition assignment kernel: BLAS-backed overload.
// Enabled only when UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. The scaling
// factor is converted to the element type of the target (ET).
// For the two triangular cases the product is formed in a temporary and then
// added to C; in the remaining case gemm is called on C directly.
// NOTE(review): the closing braces and the final `else` are on lines missing
// from this excerpt.
7930 template<
typename MT3
7934 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7935 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7937 using ET = ElementType_t<MT3>;
// Triangular left operand: start from a copy of B, apply A from the left.
7939 if( IsTriangular_v<MT4> ) {
7940 ResultType_t<MT3> tmp(
serial( B ) );
7941 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7942 addAssign( C, tmp );
// Triangular right operand: start from a copy of A, apply B from the right.
7944 else if( IsTriangular_v<MT5> ) {
7945 ResultType_t<MT3> tmp(
serial( A ) );
7946 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7947 addAssign( C, tmp );
// Remaining case: gemm with factors ET(scalar) and ET(1).
7950 gemm( C, A, B,
ET(scalar),
ET(1) );
// Restructuring addition assignment for the symmetry-exploiting case.
// Only available when CanExploitSymmetry_v<MT,MT1,MT2> is true (EnableIf_t).
// NOTE(review): the line carrying the function name and parameters is missing
// from this excerpt; the final statement calls addAssign(), so this is
// presumably the matching addAssign() overload - confirm.
7971 template<
typename MT >
7973 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7982 const ForwardFunctor fwd;
// Each operand is passed through transIf<>, keyed on whether its type is
// symmetric (IsSymmetric_v<MT1> / IsSymmetric_v<MT2>).
7984 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
7985 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
// Re-form the scaled product from the adjusted operands and add it to lhs.
7987 addAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
// Subtraction assignment of a scaled matrix product to a dense matrix.
// Only available when CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
//   lhs - dense target matrix that is subtracted from
//   rhs - scaled multiplication expression
// NOTE(review): the body of the early-exit branch and the definitions of the
// evaluated operands A and B are on lines missing from this excerpt.
8007 template<
typename MT
8009 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8010 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Fetch both operands of the wrapped matrix product.
8017 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8018 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Special case: the target has no rows or columns, or the inner dimension
// (left.columns()) is zero.
8020 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// General case: dispatch to the kernel selection with the scaling factor.
8034 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment of a scaled matrix product.
//   C      - target matrix
//   A, B   - left and right operands
//   scalar - scaling factor
// The small kernel is used when any of the following holds:
//   - the left operand type is diagonal,
//   - BLAZE_DEBUG_MODE is off and A has at most SIMDSIZE*10 rows,
//   - the target has fewer than TDMATTDMATMULT_THRESHOLD elements.
// NOTE(review): the `else` in front of the BLAS call is on a line missing from
// this excerpt; presumably the BLAS kernel is the alternative branch.
8049 template<
typename MT3
8053 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8055 if( ( IsDiagonal_v<MT4> ) ||
8056 ( !BLAZE_DEBUG_MODE && A.rows() <=
SIMDSIZE*10UL ) ||
8057 ( C.rows() * C.columns() < TDMATTDMATMULT_THRESHOLD ) )
8058 selectSmallSubAssignKernel( C, A, B, scalar );
8060 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel: neither operand is diagonal.
// Enabled when !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5>.
// Subtracts the temporary `tmp` from C.
// NOTE(review): the definition of `tmp` is on a line missing from this
// excerpt; presumably it holds the evaluated scaled product - confirm.
8078 template<
typename MT3
8082 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8083 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8086 subAssign( C, tmp );
// Default subtraction assignment kernel: general left operand, diagonal right
// operand (enabled when !IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
// For every column j the visible statements compute
//    C(i,j) -= A(i,j) * B(j,j) * scalar
// i.e. column j of A is scaled by the j-th diagonal element of B.
// NOTE(review): the non-triangular alternatives of ibegin/iend, the braces and
// the guard around the final single-row update are missing from this excerpt.
8104 template<
typename MT3
8108 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8109 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8113 const size_t M( A.rows() );
8114 const size_t N( B.columns() );
8116 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when A is (strictly) lower or upper.
8118 const size_t ibegin( ( IsLower_v<MT4> )
8119 ?( IsStrictlyLower_v<MT4> ? j+1UL : j )
8121 const size_t iend( ( IsUpper_v<MT4> )
8122 ?( IsStrictlyUpper_v<MT4> ? j : j+1UL )
// ipos marks the end of the part of the range handled two rows at a time.
8126 const size_t inum( iend - ibegin );
8127 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Two rows per iteration.
8130 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
8131 C(i ,j) -= A(i ,j) * B(j,j) * scalar;
8132 C(i+1UL,j) -= A(i+1UL,j) * B(j,j) * scalar;
// Update of the row at index ipos (presumably the leftover row of an odd range).
8135 C(ipos,j) -= A(ipos,j) * B(j,j) * scalar;
// Default subtraction assignment kernel: diagonal left operand, general right
// operand (enabled when IsDiagonal_v<MT4> && !IsDiagonal_v<MT5>).
// For every column j the visible statements compute
//    C(i,j) -= A(i,i) * B(i,j) * scalar
// i.e. row i of B is scaled by the i-th diagonal element of A.
// NOTE(review): the non-triangular alternatives of ibegin/iend, the braces and
// the guard around the final single-row update are missing from this excerpt.
8155 template<
typename MT3
8159 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8160 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8164 const size_t M( A.rows() );
8165 const size_t N( B.columns() );
8167 for(
size_t j=0UL; j<N; ++j )
// Row range for column j, narrowed when B is (strictly) lower or upper.
8169 const size_t ibegin( ( IsLower_v<MT5> )
8170 ?( IsStrictlyLower_v<MT5> ? j+1UL : j )
8172 const size_t iend( ( IsUpper_v<MT5> )
8173 ?( IsStrictlyUpper_v<MT5> ? j : j+1UL )
// ipos marks the end of the part of the range handled two rows at a time.
8177 const size_t inum( iend - ibegin );
8178 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Two rows per iteration.
8181 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
8182 C(i ,j) -= A(i ,i ) * B(i ,j) * scalar;
8183 C(i+1UL,j) -= A(i+1UL,i+1UL) * B(i+1UL,j) * scalar;
// Update of the row at index ipos (presumably the leftover row of an odd range).
8186 C(ipos,j) -= A(ipos,ipos) * B(ipos,j) * scalar;
// Default subtraction assignment kernel: both operands are diagonal
// (enabled when IsDiagonal_v<MT4> && IsDiagonal_v<MT5>).
// Only the diagonal of C is touched: C(i,i) -= A(i,i) * B(i,i) * scalar
// for every row index i of A.
8206 template<
typename MT3
8210 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8211 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8215 for(
size_t i=0UL; i<A.rows(); ++i ) {
8216 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel: fallback overload.
// Chosen when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false
// (DisableIf_t); forwards unchanged to selectDefaultSubAssignKernel().
8235 template<
typename MT3
8239 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8240 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8242 selectDefaultSubAssignKernel( C, A, B, scalar );
// Small-matrix subtraction assignment kernel: vectorized case with a row-major
// target (enabled when IsRowMajorMatrix_v<MT3> and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> both hold).
// One operand is copied into a temporary of its opposite storage type and the
// scaled product is then re-dispatched through subAssign().
// Which operand is converted:
//   - A resizable, B not      -> convert B
//   - A not resizable, B is   -> convert A
//   - otherwise               -> convert B if it has no more elements than A,
//                                else convert A
// NOTE(review): the closing braces and the final `else` are on lines missing
// from this excerpt.
8261 template<
typename MT3
8265 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8266 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8273 const ForwardFunctor fwd;
8275 if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
8276 const OppositeType_t<MT5> tmp(
serial( B ) );
8277 subAssign( C, fwd( A * tmp ) * scalar );
8279 else if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
8280 const OppositeType_t<MT4> tmp(
serial( A ) );
8281 subAssign( C, fwd( tmp * B ) * scalar );
// Neither rule above applied: compare element counts.
8283 else if( B.rows() * B.columns() <= A.rows() * A.columns() ) {
8284 const OppositeType_t<MT5> tmp(
serial( B ) );
8285 subAssign( C, fwd( A * tmp ) * scalar );
8288 const OppositeType_t<MT4> tmp(
serial( A ) );
8289 subAssign( C, fwd( tmp * B ) * scalar );
// Small-matrix, vectorized subtraction-assignment kernel for column-major
// targets (MT3): performs C -= (A * B) * scalar using SIMD loads from A and C.
//
// Structure visible in this excerpt (several loop headers over the row index i
// and most closing braces are not part of the excerpt):
//   - blocks of rows are processed with 8, 5, 4, 3, 2 and finally 1 SIMD
//     register(s) per column, as seen from the A.load() row offsets;
//   - inside each block the columns j are handled in groups (4/3/2/1 or 2/1);
//   - every (row block, column group) accumulates over k in [kbegin,kend),
//     then updates C via C.store( ..., C.load(...) - acc * factor );
//   - a final scalar loop handles rows that are left when `remainder` is true.
// kbegin/kend narrow the k-range using the lower/upper (strictly) triangular
// properties of the operand types MT4 (A) and MT5 (B).
8309 template<
typename MT3
8313 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8314 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless BOTH C's and A's types are padded.
8316 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT4> );
// Problem dimensions: C is M x N, the shared (inner) dimension is K.
8318 const size_t M( A.rows() );
8319 const size_t N( B.columns() );
8320 const size_t K( A.columns() );
// The scalar broadcast into a SIMD register; multiplied into every accumulator
// right before it is subtracted from C.
8327 const SIMDType factor(
set( scalar ) );
// ---- 8 SIMD registers per column (row offsets i .. i+SIMDSIZE*7UL) ----------
// NOTE(review): this section directly follows the IsIntegral_v<ElementType>
// check, so it appears to run only for integral element types - confirm
// against the full source, the braces are not visible here.
8331 if( IsIntegral_v<ElementType> )
8334 for(
size_t j=0UL; j<N; ++j )
8336 const size_t kbegin( ( IsLower_v<MT5> )
8337 ?( ( IsUpper_v<MT4> )
8338 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8339 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8340 :( IsUpper_v<MT4> ? i : 0UL ) );
8341 const size_t kend( ( IsUpper_v<MT5> )
8342 ?( ( IsLower_v<MT4> )
8343 ?(
min( i+
SIMDSIZE*8UL, K, ( IsStrictlyUpper_v<MT5> ? j : j+1UL ) ) )
8344 :( IsStrictlyUpper_v<MT5> ? j : j+1UL ) )
8345 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*8UL, K ) : K ) );
// First k iteration initializes the eight accumulators ...
8351 SIMDType b1(
set( B(k,j) ) );
8352 SIMDType xmm1( A.load(i ,k) * b1 );
8353 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
8354 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
8355 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
8356 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
8357 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,k) * b1 );
8358 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,k) * b1 );
8359 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,k) * b1 );
// ... the remaining k iterations add onto them.
8361 for( ++k; k<kend; ++k ) {
8363 xmm1 += A.load(i ,k) * b1;
8364 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8365 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8366 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8367 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
8368 xmm6 += A.load(i+
SIMDSIZE*5UL,k) * b1;
8369 xmm7 += A.load(i+
SIMDSIZE*6UL,k) * b1;
8370 xmm8 += A.load(i+
SIMDSIZE*7UL,k) * b1;
// Write-back: C(i..,j) -= accumulator * scalar.
8373 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// ---- 5 SIMD registers per column, two columns (j, j+1) at a time ------------
8390 for( ; (j+2UL) <= N; j+=2UL )
8392 const size_t kbegin( ( IsLower_v<MT5> )
8393 ?( ( IsUpper_v<MT4> )
8394 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8395 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8396 :( IsUpper_v<MT4> ? i : 0UL ) );
8397 const size_t kend( ( IsUpper_v<MT5> )
8398 ?( ( IsLower_v<MT4> )
8399 ?(
min( i+
SIMDSIZE*5UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8400 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8401 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*5UL, K ) : K ) );
// a1..a5: five row blocks of column k of A; b1/b2: B(k,j) and B(k,j+1) broadcast.
8407 SIMDType a1( A.load(i ,k) );
8408 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8409 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8410 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8411 SIMDType a5( A.load(i+
SIMDSIZE*4UL,k) );
8412 SIMDType b1(
set( B(k,j ) ) );
8413 SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm5 accumulate column j, xmm6..xmm10 accumulate column j+1.
8414 SIMDType xmm1 ( a1 * b1 );
8415 SIMDType xmm2 ( a2 * b1 );
8416 SIMDType xmm3 ( a3 * b1 );
8417 SIMDType xmm4 ( a4 * b1 );
8418 SIMDType xmm5 ( a5 * b1 );
8419 SIMDType xmm6 ( a1 * b2 );
8420 SIMDType xmm7 ( a2 * b2 );
8421 SIMDType xmm8 ( a3 * b2 );
8422 SIMDType xmm9 ( a4 * b2 );
8423 SIMDType xmm10( a5 * b2 );
8425 for( ++k; k<kend; ++k ) {
8431 b1 =
set( B(k,j ) );
8432 b2 =
set( B(k,j+1UL) );
8445 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8450 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm6 * factor );
8452 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm8 * factor );
8453 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm9 * factor );
8454 C.store( i+
SIMDSIZE*4UL, j+1UL, C.load(i+
SIMDSIZE*4UL,j+1UL) - xmm10 * factor );
// Single leftover column of the 5-register block (its guarding condition is
// not visible in this excerpt).
8460 const size_t kbegin( ( IsLower_v<MT5> )
8461 ?( ( IsUpper_v<MT4> )
8462 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8463 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8464 :( IsUpper_v<MT4> ? i : 0UL ) );
8465 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*5UL, K ) ):( K ) );
8471 SIMDType b1(
set( B(k,j) ) );
8472 SIMDType xmm1( A.load(i ,k) * b1 );
8473 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
8474 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
8475 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
8476 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,k) * b1 );
8478 for( ++k; k<kend; ++k ) {
8480 xmm1 += A.load(i ,k) * b1;
8481 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8482 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8483 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8484 xmm5 += A.load(i+
SIMDSIZE*4UL,k) * b1;
8487 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// ---- 4 SIMD registers per column, two columns (j, j+1) at a time ------------
8500 for( ; (j+2UL) <= N; j+=2UL )
8502 const size_t kbegin( ( IsLower_v<MT5> )
8503 ?( ( IsUpper_v<MT4> )
8504 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8505 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8506 :( IsUpper_v<MT4> ? i : 0UL ) );
8507 const size_t kend( ( IsUpper_v<MT5> )
8508 ?( ( IsLower_v<MT4> )
8509 ?(
min( i+
SIMDSIZE*4UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8510 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8511 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*4UL, K ) : K ) );
8517 SIMDType a1( A.load(i ,k) );
8518 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8519 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8520 SIMDType a4( A.load(i+
SIMDSIZE*3UL,k) );
8521 SIMDType b1(
set( B(k,j ) ) );
8522 SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm4 accumulate column j, xmm5..xmm8 accumulate column j+1.
8523 SIMDType xmm1( a1 * b1 );
8524 SIMDType xmm2( a2 * b1 );
8525 SIMDType xmm3( a3 * b1 );
8526 SIMDType xmm4( a4 * b1 );
8527 SIMDType xmm5( a1 * b2 );
8528 SIMDType xmm6( a2 * b2 );
8529 SIMDType xmm7( a3 * b2 );
8530 SIMDType xmm8( a4 * b2 );
8532 for( ++k; k<kend; ++k ) {
8537 b1 =
set( B(k,j ) );
8538 b2 =
set( B(k,j+1UL) );
8549 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8553 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm5 * factor );
8555 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm7 * factor );
8556 C.store( i+
SIMDSIZE*3UL, j+1UL, C.load(i+
SIMDSIZE*3UL,j+1UL) - xmm8 * factor );
// Single leftover column of the 4-register block.
8562 const size_t kbegin( ( IsLower_v<MT5> )
8563 ?( ( IsUpper_v<MT4> )
8564 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8565 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8566 :( IsUpper_v<MT4> ? i : 0UL ) );
8567 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*4UL, K ) ):( K ) );
8573 SIMDType b1(
set( B(k,j) ) );
8574 SIMDType xmm1( A.load(i ,k) * b1 );
8575 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
8576 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
8577 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,k) * b1 );
8579 for( ++k; k<kend; ++k ) {
8581 xmm1 += A.load(i ,k) * b1;
8582 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8583 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8584 xmm4 += A.load(i+
SIMDSIZE*3UL,k) * b1;
8587 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// ---- 3 SIMD registers per column, two columns (j, j+1) at a time ------------
8599 for( ; (j+2UL) <= N; j+=2UL )
8601 const size_t kbegin( ( IsLower_v<MT5> )
8602 ?( ( IsUpper_v<MT4> )
8603 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8604 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8605 :( IsUpper_v<MT4> ? i : 0UL ) );
8606 const size_t kend( ( IsUpper_v<MT5> )
8607 ?( ( IsLower_v<MT4> )
8608 ?(
min( i+
SIMDSIZE*3UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8609 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8610 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*3UL, K ) : K ) );
8616 SIMDType a1( A.load(i ,k) );
8617 SIMDType a2( A.load(i+
SIMDSIZE ,k) );
8618 SIMDType a3( A.load(i+
SIMDSIZE*2UL,k) );
8619 SIMDType b1(
set( B(k,j ) ) );
8620 SIMDType b2(
set( B(k,j+1UL) ) );
// xmm1..xmm3 accumulate column j, xmm4..xmm6 accumulate column j+1.
8621 SIMDType xmm1( a1 * b1 );
8622 SIMDType xmm2( a2 * b1 );
8623 SIMDType xmm3( a3 * b1 );
8624 SIMDType xmm4( a1 * b2 );
8625 SIMDType xmm5( a2 * b2 );
8626 SIMDType xmm6( a3 * b2 );
8628 for( ++k; k<kend; ++k ) {
8632 b1 =
set( B(k,j ) );
8633 b2 =
set( B(k,j+1UL) );
8642 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8645 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm4 * factor );
8647 C.store( i+
SIMDSIZE*2UL, j+1UL, C.load(i+
SIMDSIZE*2UL,j+1UL) - xmm6 * factor );
// Single leftover column of the 3-register block.
8653 const size_t kbegin( ( IsLower_v<MT5> )
8654 ?( ( IsUpper_v<MT4> )
8655 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8656 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8657 :( IsUpper_v<MT4> ? i : 0UL ) );
8658 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*3UL, K ) ):( K ) );
8664 SIMDType b1(
set( B(k,j) ) );
8665 SIMDType xmm1( A.load(i ,k) * b1 );
8666 SIMDType xmm2( A.load(i+
SIMDSIZE ,k) * b1 );
8667 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,k) * b1 );
8669 for( ++k; k<kend; ++k ) {
8671 xmm1 += A.load(i ,k) * b1;
8672 xmm2 += A.load(i+
SIMDSIZE ,k) * b1;
8673 xmm3 += A.load(i+
SIMDSIZE*2UL,k) * b1;
8676 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// ---- 2 SIMD registers per column --------------------------------------------
// From here on the column range itself is restricted by the declared structure
// of the result: with LOW the columns stop at min(i+SIMDSIZE*2UL,N), with UPP
// they start at i.
8685 const size_t jend( LOW ?
min(i+
SIMDSIZE*2UL,N) : N );
8686 size_t j( UPP ? i : 0UL );
// Four columns (j .. j+3) at a time.
8688 for( ; (j+4UL) <= jend; j+=4UL )
8690 const size_t kbegin( ( IsLower_v<MT5> )
8691 ?( ( IsUpper_v<MT4> )
8692 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8693 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8694 :( IsUpper_v<MT4> ? i : 0UL ) );
8695 const size_t kend( ( IsUpper_v<MT5> )
8696 ?( ( IsLower_v<MT4> )
8697 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) ) )
8698 :( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL ) )
8699 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8705 SIMDType a1( A.load(i ,k) );
8706 SIMDType a2( A.load(i+
SIMDSIZE,k) );
8707 SIMDType b1(
set( B(k,j ) ) );
8708 SIMDType b2(
set( B(k,j+1UL) ) );
8709 SIMDType b3(
set( B(k,j+2UL) ) );
8710 SIMDType b4(
set( B(k,j+3UL) ) );
// Accumulator pairs (xmm1,xmm2), (xmm3,xmm4), (xmm5,xmm6), (xmm7,xmm8) belong
// to columns j, j+1, j+2 and j+3 respectively.
8711 SIMDType xmm1( a1 * b1 );
8712 SIMDType xmm2( a2 * b1 );
8713 SIMDType xmm3( a1 * b2 );
8714 SIMDType xmm4( a2 * b2 );
8715 SIMDType xmm5( a1 * b3 );
8716 SIMDType xmm6( a2 * b3 );
8717 SIMDType xmm7( a1 * b4 );
8718 SIMDType xmm8( a2 * b4 );
8720 for( ++k; k<kend; ++k ) {
8723 b1 =
set( B(k,j ) );
8724 b2 =
set( B(k,j+1UL) );
8725 b3 =
set( B(k,j+2UL) );
8726 b4 =
set( B(k,j+3UL) );
8737 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8739 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8741 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
8743 C.store( i , j+3UL, C.load(i ,j+3UL) - xmm7 * factor );
// Three columns (j .. j+2) at a time.
8748 for( ; (j+3UL) <= jend; j+=3UL )
8750 const size_t kbegin( ( IsLower_v<MT5> )
8751 ?( ( IsUpper_v<MT4> )
8752 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8753 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8754 :( IsUpper_v<MT4> ? i : 0UL ) );
8755 const size_t kend( ( IsUpper_v<MT5> )
8756 ?( ( IsLower_v<MT4> )
8757 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) ) )
8758 :( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL ) )
8759 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8765 SIMDType a1( A.load(i ,k) );
8766 SIMDType a2( A.load(i+
SIMDSIZE,k) );
8767 SIMDType b1(
set( B(k,j ) ) );
8768 SIMDType b2(
set( B(k,j+1UL) ) );
8769 SIMDType b3(
set( B(k,j+2UL) ) );
8770 SIMDType xmm1( a1 * b1 );
8771 SIMDType xmm2( a2 * b1 );
8772 SIMDType xmm3( a1 * b2 );
8773 SIMDType xmm4( a2 * b2 );
8774 SIMDType xmm5( a1 * b3 );
8775 SIMDType xmm6( a2 * b3 );
8777 for( ++k; k<kend; ++k ) {
8780 b1 =
set( B(k,j ) );
8781 b2 =
set( B(k,j+1UL) );
8782 b3 =
set( B(k,j+2UL) );
8791 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8793 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
8795 C.store( i , j+2UL, C.load(i ,j+2UL) - xmm5 * factor );
// Two columns (j, j+1) at a time.
8800 for( ; (j+2UL) <= jend; j+=2UL )
8802 const size_t kbegin( ( IsLower_v<MT5> )
8803 ?( ( IsUpper_v<MT4> )
8804 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8805 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8806 :( IsUpper_v<MT4> ? i : 0UL ) );
8807 const size_t kend( ( IsUpper_v<MT5> )
8808 ?( ( IsLower_v<MT4> )
8809 ?(
min( i+
SIMDSIZE*2UL, K, ( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) ) )
8810 :( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL ) )
8811 :( IsLower_v<MT4> ?
min( i+
SIMDSIZE*2UL, K ) : K ) );
8817 SIMDType a1( A.load(i ,k) );
8818 SIMDType a2( A.load(i+
SIMDSIZE,k) );
8819 SIMDType b1(
set( B(k,j ) ) );
8820 SIMDType b2(
set( B(k,j+1UL) ) );
8821 SIMDType xmm1( a1 * b1 );
8822 SIMDType xmm2( a2 * b1 );
8823 SIMDType xmm3( a1 * b2 );
8824 SIMDType xmm4( a2 * b2 );
8826 for( ++k; k<kend; ++k ) {
8829 b1 =
set( B(k,j ) );
8830 b2 =
set( B(k,j+1UL) );
8837 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8839 C.store( i , j+1UL, C.load(i ,j+1UL) - xmm3 * factor );
// Single leftover column of the 2-register block.
8846 const size_t kbegin( ( IsLower_v<MT5> )
8847 ?( ( IsUpper_v<MT4> )
8848 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8849 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8850 :( IsUpper_v<MT4> ? i : 0UL ) );
8851 const size_t kend( ( IsLower_v<MT4> )?(
min( i+
SIMDSIZE*2UL, K ) ):( K ) );
8857 SIMDType b1(
set( B(k,j) ) );
8858 SIMDType xmm1( A.load(i ,k) * b1 );
8859 SIMDType xmm2( A.load(i+
SIMDSIZE,k) * b1 );
8861 for( ++k; k<kend; ++k ) {
8863 xmm1 += A.load(i ,k) * b1;
8867 C.store( i , j, C.load(i ,j) - xmm1 * factor );
// ---- 1 SIMD register per column ---------------------------------------------
// Here the column range is cut at min(i+SIMDSIZE,N) only when the result is
// declared BOTH lower and upper (LOW && UPP); with UPP the columns start at i.
8875 const size_t jend( LOW && UPP ?
min(i+
SIMDSIZE,N) : N );
8876 size_t j( UPP ? i : 0UL );
// Four columns (j .. j+3) at a time; one accumulator per column.
8878 for( ; (j+4UL) <= jend; j+=4UL )
8880 const size_t kbegin( ( IsLower_v<MT5> )
8881 ?( ( IsUpper_v<MT4> )
8882 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8883 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8884 :( IsUpper_v<MT4> ? i : 0UL ) );
8885 const size_t kend( ( IsUpper_v<MT5> )
8886 ?( IsStrictlyUpper_v<MT5> ? j+3UL : j+4UL )
8893 SIMDType a1( A.load(i,k) );
8894 SIMDType xmm1( a1 *
set( B(k,j ) ) );
8895 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
8896 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
8897 SIMDType xmm4( a1 *
set( B(k,j+3UL) ) );
8899 for( ++k; k<kend; ++k ) {
8901 xmm1 += a1 *
set( B(k,j ) );
8902 xmm2 += a1 *
set( B(k,j+1UL) );
8903 xmm3 += a1 *
set( B(k,j+2UL) );
8904 xmm4 += a1 *
set( B(k,j+3UL) );
8907 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8908 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
8909 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
8910 C.store( i, j+3UL, C.load(i,j+3UL) - xmm4 * factor );
// Three columns (j .. j+2) at a time.
8914 for( ; (j+3UL) <= jend; j+=3UL )
8916 const size_t kbegin( ( IsLower_v<MT5> )
8917 ?( ( IsUpper_v<MT4> )
8918 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8919 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8920 :( IsUpper_v<MT4> ? i : 0UL ) );
8921 const size_t kend( ( IsUpper_v<MT5> )
8922 ?( IsStrictlyUpper_v<MT5> ? j+2UL : j+3UL )
8929 SIMDType a1( A.load(i,k) );
8930 SIMDType xmm1( a1 *
set( B(k,j ) ) );
8931 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
8932 SIMDType xmm3( a1 *
set( B(k,j+2UL) ) );
8934 for( ++k; k<kend; ++k ) {
8936 xmm1 += a1 *
set( B(k,j ) );
8937 xmm2 += a1 *
set( B(k,j+1UL) );
8938 xmm3 += a1 *
set( B(k,j+2UL) );
8941 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8942 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
8943 C.store( i, j+2UL, C.load(i,j+2UL) - xmm3 * factor );
// Two columns (j, j+1) at a time.
8947 for( ; (j+2UL) <= jend; j+=2UL )
8949 const size_t kbegin( ( IsLower_v<MT5> )
8950 ?( ( IsUpper_v<MT4> )
8951 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8952 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8953 :( IsUpper_v<MT4> ? i : 0UL ) );
8954 const size_t kend( ( IsUpper_v<MT5> )
8955 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
8962 SIMDType a1( A.load(i,k) );
8963 SIMDType xmm1( a1 *
set( B(k,j ) ) );
8964 SIMDType xmm2( a1 *
set( B(k,j+1UL) ) );
8966 for( ++k; k<kend; ++k ) {
8968 xmm1 += a1 *
set( B(k,j ) );
8969 xmm2 += a1 *
set( B(k,j+1UL) );
8972 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8973 C.store( i, j+1UL, C.load(i,j+1UL) - xmm2 * factor );
// Single leftover column of the 1-register block; k runs up to K.
8979 const size_t kbegin( ( IsLower_v<MT5> )
8980 ?( ( IsUpper_v<MT4> )
8981 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
8982 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
8983 :( IsUpper_v<MT4> ? i : 0UL ) );
8989 SIMDType xmm1( A.load(i,k) *
set( B(k,j) ) );
8991 for( ++k; k<K; ++k ) {
8992 xmm1 += A.load(i,k) *
set( B(k,j) );
8995 C.store( i, j, C.load(i,j) - xmm1 * factor );
// ---- Scalar remainder -------------------------------------------------------
// Rows not covered by the SIMD blocks are processed element-wise; the loop is
// only active when `remainder` is true. With LOW the columns stop after the
// diagonal (i+1), with UPP they start at the diagonal (i).
9000 for( ; remainder && i<M; ++i )
9002 const size_t jend( LOW ? i+1UL : N );
9003 size_t j( UPP ? i : 0UL );
// Two columns per iteration: value1 -> C(i,j), value2 -> C(i,j+1).
9005 for( ; (j+2UL) <= jend; j+=2UL )
9007 const size_t kbegin( ( IsLower_v<MT5> )
9008 ?( ( IsUpper_v<MT4> )
9009 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9010 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9011 :( IsUpper_v<MT4> ? i : 0UL ) );
9012 const size_t kend( ( IsUpper_v<MT5> )
9013 ?( IsStrictlyUpper_v<MT5> ? j+1UL : j+2UL )
9023 for( ++k; k<kend; ++k ) {
9024 value1 += A(i,k) * B(k,j );
9025 value2 += A(i,k) * B(k,j+1UL);
9028 C(i,j ) -= value1 * scalar;
9029 C(i,j+1UL) -= value2 * scalar;
// Single leftover column of the scalar remainder; k runs up to K.
9035 const size_t kbegin( ( IsLower_v<MT5> )
9036 ?( ( IsUpper_v<MT4> )
9037 ?(
max( i, ( IsStrictlyLower_v<MT5> ? j+1UL : j ) ) )
9038 :( IsStrictlyLower_v<MT5> ? j+1UL : j ) )
9039 :( IsUpper_v<MT4> ? i : 0UL ) );
9047 for( ++k; k<K; ++k ) {
9048 value += A(i,k) * B(k,j);
9051 C(i,j) -= value * scalar;
// Large-matrix subtraction-assignment kernel, fallback overload: enabled
// (DisableIf_t) when UseVectorizedDefaultKernel_v is false for the given types
// and forwards all arguments unchanged to selectDefaultSubAssignKernel().
9072 template<
typename MT3
9076 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9077 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9079 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel, vectorized overload: enabled
// when UseVectorizedDefaultKernel_v is true. It delegates to one of the
// lmmm() / ummm() / mmm() kernels; the conditions selecting between the three
// calls are not visible in this excerpt.
// All three calls pass the NEGATED scalar followed by ST2(1).
// NOTE(review): presumably these are the product and target scaling factors,
// i.e. C = 1*C + (-scalar)*A*B - confirm against the mmm kernel signatures.
9098 template<
typename MT3
9102 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9103 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9106 lmmm( C, A, B, -scalar, ST2(1) );
9108 ummm( C, A, B, -scalar, ST2(1) );
9110 mmm( C, A, B, -scalar, ST2(1) );
// BLAS-based subtraction-assignment kernel, fallback overload: enabled
// (DisableIf_t) when UseBlasKernel_v is false for the given types and hands
// the work over to selectLargeSubAssignKernel() with unchanged arguments.
9129 template<
typename MT3
9133 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9134 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9136 selectLargeSubAssignKernel( C, A, B, scalar );
9141#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel, enabled when UseBlasKernel_v is
// true for the given types. Three paths:
//   - A triangular: copy B into a temporary of C's result type, multiply it in
//     place from the left by A via trmm() with factor ET(scalar), then
//     subtract the temporary from C;
//   - B triangular: same idea with a copy of A and a right-side trmm();
//   - otherwise: a single gemm() call on C with factors ET(-scalar) and ET(1).
// The trmm() paths use the positive scalar because the subtraction is done by
// the following subAssign(); the gemm() path folds the sign into the factor.
9155 template<
typename MT3
9159 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9160 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// All BLAS factors are converted to the element type of the target matrix.
9162 using ET = ElementType_t<MT3>;
// Left operand triangular: tmp starts as B; CblasLower/CblasUpper is chosen
// from IsLower_v<MT4>.
9164 if( IsTriangular_v<MT4> ) {
9165 ResultType_t<MT3> tmp(
serial( B ) );
9166 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
9167 subAssign( C, tmp );
// Right operand triangular: tmp starts as A; CblasLower/CblasUpper is chosen
// from IsLower_v<MT5>.
9169 else if( IsTriangular_v<MT5> ) {
9170 ResultType_t<MT3> tmp(
serial( A ) );
9171 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
9172 subAssign( C, tmp );
// General case: one gemm() call, negated scalar for the product, ET(1) for C.
9175 gemm( C, A, B,
ET(-scalar),
ET(1) );
9195 template<
typename MT >
9197 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9206 const ForwardFunctor fwd;
9208 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9209 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9211 subAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
// Schur-product assignment of a scaled matrix product expression
// (DMatScalarMultExpr) to a dense matrix. The visible body ends by forwarding
// a local `tmp` to schurAssign() on the unwrapped target (*lhs).
// NOTE(review): the construction of `tmp` is not visible in this excerpt -
// presumably it is the evaluated right-hand side; confirm against the full
// source.
9231 template<
typename MT
9233 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
9245 schurAssign( *lhs, tmp );
9276 template<
typename MT
9279 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9286 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9287 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9289 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
9292 else if( left.columns() == 0UL ) {
9326 template<
typename MT
9329 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9333 using TmpType = If_t< SO, ResultType, OppositeType >;
9345 const ForwardFunctor fwd;
9347 const TmpType tmp( rhs );
9366 template<
typename MT >
9368 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9377 const ForwardFunctor fwd;
9379 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9380 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9382 smpAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9401 template<
typename MT
9404 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9411 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9412 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9414 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9447 template<
typename MT >
9449 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9458 const ForwardFunctor fwd;
9460 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9461 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9486 template<
typename MT
9489 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9496 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9497 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9499 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9532 template<
typename MT >
9534 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9543 const ForwardFunctor fwd;
9545 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9546 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9568 template<
typename MT
9648template<
typename MT1
9650inline decltype(
auto)
9655 if( (*lhs).columns() != (*rhs).rows() ) {
9660 return ReturnType( *lhs, *rhs );
// Declares the given matrix product expression as symmetric: a new
// TDMatTDMatMultExpr is built from the same left and right operands with the
// SF flag set to true; the HF, LF and UF flags are carried over unchanged.
9696template<
typename MT1
9702inline decltype(
auto)
declsym(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9710 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9711 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix product expression as Hermitian: a new
// TDMatTDMatMultExpr is built from the same left and right operands with the
// HF flag set to true; the SF, LF and UF flags are carried over unchanged.
9740template<
typename MT1
9746inline decltype(
auto)
declherm(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9754 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9755 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix product expression as lower: a new
// TDMatTDMatMultExpr is built from the same left and right operands with the
// LF flag set to true; the SF, HF and UF flags are carried over unchanged.
9784template<
typename MT1
9790inline decltype(
auto)
decllow(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9798 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9799 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// declunilow() overload for matrix product expressions. The parameter type
// fixes the LF flag to false, so only expressions that have not been declared
// lower match this overload. The function body is not visible in this excerpt.
9828template<
typename MT1
9833inline decltype(
auto)
declunilow(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
// declstrlow() overload for matrix product expressions. The parameter type
// fixes the LF flag to false, so only expressions that have not been declared
// lower match this overload. The function body is not visible in this excerpt.
9870template<
typename MT1
9875inline decltype(
auto)
declstrlow(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
// Declares the given matrix product expression as upper: a new
// TDMatTDMatMultExpr is built from the same left and right operands with the
// UF flag set to true; the SF, HF and LF flags are carried over unchanged.
9912template<
typename MT1
9918inline decltype(
auto)
declupp(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9926 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9927 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// decluniupp() overload for matrix product expressions. The parameter type
// fixes the UF flag to false, so only expressions that have not been declared
// upper match this overload. The function body is not visible in this excerpt.
9956template<
typename MT1
9961inline decltype(
auto)
decluniupp(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
// declstrupp() overload for matrix product expressions. The parameter type
// fixes the UF flag to false, so only expressions that have not been declared
// upper match this overload. The function body is not visible in this excerpt.
9998template<
typename MT1
10003inline decltype(
auto)
declstrupp(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
// Declares the given matrix product expression as diagonal: a new
// TDMatTDMatMultExpr is built from the same left and right operands with BOTH
// the LF and UF flags set to true; the SF and HF flags are carried over
// unchanged.
10040template<
typename MT1
10046inline decltype(
auto)
decldiag(
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
10054 using ReturnType =
const TDMatTDMatMultExpr<MT1,MT2,SF,HF,true,true>;
10055 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size trait specialization for dimension index 0UL of the product expression:
// the value is inherited from Size<MT1,0UL>, i.e. from the left operand.
// NOTE(review): index 0UL presumably denotes the number of rows - confirm
// against the Size trait documentation.
10071template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10072struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
10073 :
public Size<MT1,0UL>
// Size trait specialization for dimension index 1UL of the product expression:
// the value is inherited from Size<MT2,1UL>, i.e. from the right operand.
// NOTE(review): index 1UL presumably denotes the number of columns - confirm
// against the Size trait documentation.
10076template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10077struct Size< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
10078 :
public Size<MT2,1UL>
// IsAligned trait specialization for the product expression: the expression
// counts as aligned only if both operand types MT1 and MT2 are aligned.
10094template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10095struct IsAligned< TDMatTDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10096 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsIntegral type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsResizable type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Constraint on the data type.
Constraint on the data type.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense matrix-transpose dense matrix multiplications.
Definition: TDMatTDMatMultExpr.h:152
static constexpr bool LOW
Flag for lower matrices.
Definition: TDMatTDMatMultExpr.h:176
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatTDMatMultExpr.h:463
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: TDMatTDMatMultExpr.h:409
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:296
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: TDMatTDMatMultExpr.h:419
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatTDMatMultExpr.h:320
TDMatTDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the TDMatTDMatMultExpr class.
Definition: TDMatTDMatMultExpr.h:329
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:160
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:393
static constexpr bool UPP
Flag for upper matrices.
Definition: TDMatTDMatMultExpr.h:177
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: TDMatTDMatMultExpr.h:344
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:155
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:286
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:289
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatTDMatMultExpr.h:473
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatTDMatMultExpr.h:314
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:495
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:285
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:165
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:170
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatTDMatMultExpr.h:451
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:156
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:429
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatTDMatMultExpr.h:288
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:302
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatTDMatMultExpr.h:287
static constexpr bool SYM
Flag for symmetric matrices.
Definition: TDMatTDMatMultExpr.h:174
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:158
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatTDMatMultExpr.h:299
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: TDMatTDMatMultExpr.h:175
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:157
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDMatTDMatMultExpr.h:439
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatTDMatMultExpr.h:307
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatTDMatMultExpr.h:483
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: TDMatTDMatMultExpr.h:496
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatTDMatMultExpr.h:290
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression (const MT1 by value if MT1 is an expression, const MT1& otherwise).
Definition: TDMatTDMatMultExpr.h:293
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatTDMatMultExpr.h:283
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: TDMatTDMatMultExpr.h:159
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm).
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) transIf(const DenseMatrix< MT, SO > &dm)
Conditional calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:832
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
typename EnableIf<!Condition, T >::Type DisableIf_t
Auxiliary type for the EnableIf class template with a negated condition.
Definition: EnableIf.h:175
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
constexpr decltype(auto) zero(size_t m, size_t n) noexcept
Creating a zero matrix.
Definition: ZeroMatrix.h:1356
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for the debugging policy of the Blaze library.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.