35#ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_
145template<
typename MT1
152 :
public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Compile-time switch: true when the left-hand operand type MT1 is a computation
// expression or otherwise reports that it requires evaluation.
167 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
// Compile-time switch: true when the right-hand operand type MT2 is a computation
// expression or otherwise reports that it requires evaluation.
172 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure flags derived from the template flags SF, HF, LF and UF.
// The flags are prioritized so that a combination resolves to exactly one reading:
// SYM : SF is set and none of HF, LF, UF is set.
176 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set (SF does not matter here).
177 static constexpr bool HERM = ( HF && !( LF || UF ) );
// LOW : LF is set, or UF is combined with SF or HF.
// NOTE(review): presumably "symmetric/Hermitian and upper" is treated as also lower - confirm.
178 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP : UF is set, or LF is combined with SF or HF (mirror image of LOW).
179 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Helper variable template: true when the target type T1 is a column-major matrix
// and at least one of the two operand types T2, T3 is symmetric.
// It is used in the trailing return types of the assign()/addAssign() overloads
// (as CanExploitSymmetry_v<MT,MT1,MT2>) to enable or disable them.
189 template<
typename T1,
typename T2,
typename T3 >
190 static constexpr bool CanExploitSymmetry_v =
191 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
201 template<
typename T1,
typename T2,
typename T3 >
202 static constexpr bool IsEvaluationRequired_v =
212 template<
typename T1,
typename T2,
typename T3 >
213 static constexpr bool UseBlasKernel_v =
216 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
217 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
218 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
219 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
220 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
221 IsBLASCompatible_v< ElementType_t<T1> > &&
222 IsBLASCompatible_v< ElementType_t<T2> > &&
223 IsBLASCompatible_v< ElementType_t<T3> > &&
234 template<
typename T1,
typename T2,
typename T3 >
235 static constexpr bool UseVectorizedDefaultKernel_v =
236 ( useOptimizedKernels &&
237 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
238 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
239 IsSIMDCombinable_v< ElementType_t<T1>
310 ( !IsDiagonal_v<MT2> &&
311 MT1::simdEnabled && MT2::simdEnabled &&
312 HasSIMDAdd_v<ET1,ET2> &&
313 HasSIMDMult_v<ET1,ET2> );
350 if( IsDiagonal_v<MT1> ) {
353 else if( IsDiagonal_v<MT2> ) {
356 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
357 const size_t begin( ( IsUpper_v<MT1> )
358 ?( ( IsLower_v<MT2> )
359 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
360 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
361 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
362 :( ( IsLower_v<MT2> )
363 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
365 const size_t end( ( IsLower_v<MT1> )
366 ?( ( IsUpper_v<MT2> )
367 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
368 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
369 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
370 :( ( IsUpper_v<MT2> )
371 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
372 :(
lhs_.columns() ) ) );
396 if( i >=
lhs_.rows() ) {
399 if( j >=
rhs_.columns() ) {
411 inline size_t rows() const noexcept {
422 return rhs_.columns();
// Returns whether the expression can alias with the given address.
// \param alias The address to check.
// \return true if either the left-hand or the right-hand operand reports that it
//         can alias with \a alias, false otherwise.
452 template<
typename T >
453 inline bool canAlias(
const T* alias )
const noexcept {
454 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns whether the expression is aliased with the given address.
// \param alias The address to check.
// \return true if either the left-hand or the right-hand operand reports that it
//         is aliased with \a alias, false otherwise.
464 template<
typename T >
465 inline bool isAliased(
const T* alias )
const noexcept {
466 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
476 return lhs_.isAligned() &&
rhs_.isAligned();
487 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
489 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
490 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
491 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
514 template<
typename MT
524 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
527 else if( rhs.lhs_.columns() == 0UL ) {
542 DMatDMatMultExpr::selectAssignKernel( *lhs, A, B );
558 template<
typename MT3
561 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
563 if( ( IsDiagonal_v<MT5> ) ||
564 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
565 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
566 selectSmallAssignKernel( C, A, B );
568 selectBlasAssignKernel( C, A, B );
587 template<
typename MT3
590 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
591 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
593 const size_t M( A.rows() );
594 const size_t N( B.columns() );
595 const size_t K( A.columns() );
599 for(
size_t i=0UL; i<M; ++i )
601 const size_t kbegin( ( IsUpper_v<MT4> )
602 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
604 const size_t kend( ( IsLower_v<MT4> )
605 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
609 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
610 for(
size_t j=0UL; j<N; ++j ) {
617 const size_t jbegin( ( IsUpper_v<MT5> )
618 ?( ( IsStrictlyUpper_v<MT5> )
619 ?(
UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
620 :(
UPP ?
max(i,kbegin) : kbegin ) )
621 :(
UPP ? i : 0UL ) );
622 const size_t jend( ( IsLower_v<MT5> )
623 ?( ( IsStrictlyLower_v<MT5> )
624 ?(
LOW ?
min(i+1UL,kbegin) : kbegin )
625 :(
LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
626 :(
LOW ? i+1UL : N ) );
628 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
629 for(
size_t j=0UL; j<jbegin; ++j ) {
633 else if( IsStrictlyUpper_v<MT5> ) {
636 for(
size_t j=jbegin; j<jend; ++j ) {
637 C(i,j) = A(i,kbegin) * B(kbegin,j);
639 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
640 for(
size_t j=jend; j<N; ++j ) {
644 else if( IsStrictlyLower_v<MT5> ) {
649 for(
size_t k=kbegin+1UL; k<kend; ++k )
651 const size_t jbegin( ( IsUpper_v<MT5> )
652 ?( ( IsStrictlyUpper_v<MT5> )
656 const size_t jend( ( IsLower_v<MT5> )
657 ?( ( IsStrictlyLower_v<MT5> )
658 ?(
LOW ?
min(i+1UL,k-1UL) : k-1UL )
659 :(
LOW ?
min(i+1UL,k) : k ) )
660 :(
LOW ? i+1UL : N ) );
662 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
665 for(
size_t j=jbegin; j<jend; ++j ) {
666 C(i,j) += A(i,k) * B(k,j);
668 if( IsLower_v<MT5> ) {
669 C(i,jend) = A(i,k) * B(k,jend);
675 for(
size_t i=1UL; i<M; ++i ) {
676 for(
size_t j=0UL; j<i; ++j ) {
677 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
699 template<
typename MT3
702 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
703 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
707 const size_t M( A.rows() );
708 const size_t N( B.columns() );
710 for(
size_t i=0UL; i<M; ++i )
712 const size_t jbegin( ( IsUpper_v<MT4> )
713 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
715 const size_t jend( ( IsLower_v<MT4> )
716 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
720 if( IsUpper_v<MT4> ) {
721 for(
size_t j=0UL; j<jbegin; ++j ) {
725 for(
size_t j=jbegin; j<jend; ++j ) {
726 C(i,j) = A(i,j) * B(j,j);
728 if( IsLower_v<MT4> ) {
729 for(
size_t j=jend; j<N; ++j ) {
752 template<
typename MT3
755 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
756 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
760 const size_t M( A.rows() );
761 const size_t N( B.columns() );
763 for(
size_t i=0UL; i<M; ++i )
765 const size_t jbegin( ( IsUpper_v<MT5> )
766 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
768 const size_t jend( ( IsLower_v<MT5> )
769 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
773 if( IsUpper_v<MT5> ) {
774 for(
size_t j=0UL; j<jbegin; ++j ) {
778 for(
size_t j=jbegin; j<jend; ++j ) {
779 C(i,j) = A(i,i) * B(i,j);
781 if( IsLower_v<MT5> ) {
782 for(
size_t j=jend; j<N; ++j ) {
805 template<
typename MT3
808 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
809 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
815 for(
size_t i=0UL; i<A.rows(); ++i ) {
816 C(i,i) = A(i,i) * B(i,i);
835 template<
typename MT3
838 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
839 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
841 selectDefaultAssignKernel( C, A, B );
861 template<
typename MT3
864 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
865 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
867 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
869 const size_t M( A.rows() );
870 const size_t N( B.columns() );
871 const size_t K( A.columns() );
880 if( IsIntegral_v<ElementType> )
883 for(
size_t i=0UL; i<M; ++i )
885 const size_t kbegin( ( IsUpper_v<MT4> )
886 ?( ( IsLower_v<MT5> )
887 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
888 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
889 :( IsLower_v<MT5> ? j : 0UL ) );
890 const size_t kend( ( IsLower_v<MT4> )
891 ?( ( IsUpper_v<MT5> )
892 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
893 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
894 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
910 for( ++k; k<kend; ++k ) {
912 xmm1 += a1 * B.load(k,j );
914 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
915 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
916 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
917 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
918 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
919 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
922 C.store( i, j , xmm1 );
934 C.store( i, j ,
zero );
951 for( ; (i+2UL) <= M; i+=2UL )
953 const size_t kbegin( ( IsUpper_v<MT4> )
954 ?( ( IsLower_v<MT5> )
955 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
956 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
957 :( IsLower_v<MT5> ? j : 0UL ) );
958 const size_t kend( ( IsLower_v<MT4> )
959 ?( ( IsUpper_v<MT5> )
960 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
961 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
962 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
986 for( ++k; k<kend; ++k ) {
988 a2 =
set( A(i+1UL,k) );
1006 C.store( i , j , xmm1 );
1008 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1009 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1010 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
1011 C.store( i+1UL, j , xmm6 );
1012 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
1013 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
1014 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
1015 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
1020 C.store( i , j ,
zero );
1025 C.store( i+1UL, j ,
zero );
1035 const size_t kbegin( ( IsUpper_v<MT4> )
1036 ?( ( IsLower_v<MT5> )
1037 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1038 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1039 :( IsLower_v<MT5> ? j : 0UL ) );
1040 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
1047 SIMDType xmm1( a1 * B.load(k,j ) );
1053 for( ++k; k<kend; ++k ) {
1055 xmm1 += a1 * B.load(k,j );
1056 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1057 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1058 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1059 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
1062 C.store( i, j , xmm1 );
1064 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1065 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1066 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
1071 C.store( i, j ,
zero );
1088 for(
size_t jj=j; jj<jjend; ++jj ) {
1089 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1096 for(
size_t jj=j; jj<jjend; ++jj ) {
1102 for( ; (i+2UL) <= iend; i+=2UL )
1104 const size_t kbegin( ( IsUpper_v<MT4> )
1105 ?( ( IsLower_v<MT5> )
1106 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1107 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1108 :( IsLower_v<MT5> ? j : 0UL ) );
1109 const size_t kend( ( IsLower_v<MT4> )
1110 ?( ( IsUpper_v<MT5> )
1111 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
1112 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1113 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
1134 for( ++k; k<kend; ++k ) {
1135 a1 =
set( A(i ,k) );
1136 a2 =
set( A(i+1UL,k) );
1151 C.store( i , j , xmm1 );
1153 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1154 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1155 C.store( i+1UL, j , xmm5 );
1156 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
1157 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
1158 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
1163 C.store( i , j ,
zero );
1167 C.store( i+1UL, j ,
zero );
1176 const size_t kbegin( ( IsUpper_v<MT4> )
1177 ?( ( IsLower_v<MT5> )
1178 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1179 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1180 :( IsLower_v<MT5> ? j : 0UL ) );
1181 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
1188 SIMDType xmm1( a1 * B.load(k,j ) );
1193 for( ++k; k<kend; ++k ) {
1195 xmm1 += a1 * B.load(k,j );
1196 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1197 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1198 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1201 C.store( i, j , xmm1 );
1203 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1204 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1209 C.store( i, j ,
zero );
1221 for(
size_t jj=j; jj<jjend; ++jj ) {
1236 for(
size_t jj=j; jj<jjend; ++jj ) {
1237 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1244 for(
size_t jj=j; jj<jjend; ++jj ) {
1250 for( ; (i+2UL) <= iend; i+=2UL )
1252 const size_t kbegin( ( IsUpper_v<MT4> )
1253 ?( ( IsLower_v<MT5> )
1254 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1255 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1256 :( IsLower_v<MT5> ? j : 0UL ) );
1257 const size_t kend( ( IsLower_v<MT4> )
1258 ?( ( IsUpper_v<MT5> )
1259 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
1260 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1261 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
1279 for( ++k; k<kend; ++k ) {
1280 a1 =
set( A(i ,k) );
1281 a2 =
set( A(i+1UL,k) );
1293 C.store( i , j , xmm1 );
1295 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1296 C.store( i+1UL, j , xmm4 );
1297 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
1298 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
1303 C.store( i , j ,
zero );
1306 C.store( i+1UL, j ,
zero );
1314 const size_t kbegin( ( IsUpper_v<MT4> )
1315 ?( ( IsLower_v<MT5> )
1316 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1317 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1318 :( IsLower_v<MT5> ? j : 0UL ) );
1319 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
1326 SIMDType xmm1( a1 * B.load(k,j ) );
1330 for( ++k; k<kend; ++k ) {
1332 xmm1 += a1 * B.load(k,j );
1333 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1334 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1337 C.store( i, j , xmm1 );
1339 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1344 C.store( i, j ,
zero );
1355 for(
size_t jj=j; jj<jjend; ++jj ) {
1370 for(
size_t jj=j; jj<jjend; ++jj ) {
1371 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1378 for(
size_t jj=j; jj<jjend; ++jj ) {
1384 for( ; (i+4UL) <= iend; i+=4UL )
1386 const size_t kbegin( ( IsUpper_v<MT4> )
1387 ?( ( IsLower_v<MT5> )
1388 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1389 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1390 :( IsLower_v<MT5> ? j : 0UL ) );
1391 const size_t kend( ( IsLower_v<MT4> )
1392 ?( ( IsUpper_v<MT5> )
1393 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
1394 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1395 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1416 for( ++k; k<kend; ++k ) {
1417 a1 =
set( A(i ,k) );
1418 a2 =
set( A(i+1UL,k) );
1419 a3 =
set( A(i+2UL,k) );
1420 a4 =
set( A(i+3UL,k) );
1433 C.store( i , j , xmm1 );
1435 C.store( i+1UL, j , xmm3 );
1436 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1437 C.store( i+2UL, j , xmm5 );
1438 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1439 C.store( i+3UL, j , xmm7 );
1440 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
1445 C.store( i , j ,
zero );
1447 C.store( i+1UL, j ,
zero );
1449 C.store( i+2UL, j ,
zero );
1451 C.store( i+3UL, j ,
zero );
1456 for( ; (i+3UL) <= iend; i+=3UL )
1458 const size_t kbegin( ( IsUpper_v<MT4> )
1459 ?( ( IsLower_v<MT5> )
1460 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1461 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1462 :( IsLower_v<MT5> ? j : 0UL ) );
1463 const size_t kend( ( IsLower_v<MT4> )
1464 ?( ( IsUpper_v<MT5> )
1465 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
1466 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1467 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1485 for( ++k; k<kend; ++k ) {
1486 a1 =
set( A(i ,k) );
1487 a2 =
set( A(i+1UL,k) );
1488 a3 =
set( A(i+2UL,k) );
1499 C.store( i , j , xmm1 );
1501 C.store( i+1UL, j , xmm3 );
1502 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1503 C.store( i+2UL, j , xmm5 );
1504 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1509 C.store( i , j ,
zero );
1511 C.store( i+1UL, j ,
zero );
1513 C.store( i+2UL, j ,
zero );
1518 for( ; (i+2UL) <= iend; i+=2UL )
1520 const size_t kbegin( ( IsUpper_v<MT4> )
1521 ?( ( IsLower_v<MT5> )
1522 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1523 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1524 :( IsLower_v<MT5> ? j : 0UL ) );
1525 const size_t kend( ( IsLower_v<MT4> )
1526 ?( ( IsUpper_v<MT5> )
1527 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
1528 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1529 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1544 for( ++k; k<kend; ++k ) {
1545 a1 =
set( A(i ,k) );
1546 a2 =
set( A(i+1UL,k) );
1555 C.store( i , j , xmm1 );
1557 C.store( i+1UL, j , xmm3 );
1558 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1563 C.store( i , j ,
zero );
1565 C.store( i+1UL, j ,
zero );
1572 const size_t kbegin( ( IsUpper_v<MT4> )
1573 ?( ( IsLower_v<MT5> )
1574 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1575 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1576 :( IsLower_v<MT5> ? j : 0UL ) );
1577 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
1584 SIMDType xmm1( a1 * B.load(k,j ) );
1587 for( ++k; k<kend; ++k ) {
1589 xmm1 += a1 * B.load(k,j );
1593 C.store( i, j , xmm1 );
1599 C.store( i, j ,
zero );
1609 for(
size_t jj=j; jj<jjend; ++jj ) {
1624 for(
size_t jj=j; jj<jjend; ++jj ) {
1625 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1632 for(
size_t jj=j; jj<jjend; ++jj ) {
1638 for( ; (i+4UL) <= iend; i+=4UL )
1640 const size_t kbegin( ( IsUpper_v<MT4> )
1641 ?( ( IsLower_v<MT5> )
1642 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1643 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1644 :( IsLower_v<MT5> ? j : 0UL ) );
1645 const size_t kend( ( IsLower_v<MT4> )
1646 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1659 for( ++k; k<kend; ++k ) {
1661 xmm1 +=
set( A(i ,k) ) * b1;
1662 xmm2 +=
set( A(i+1UL,k) ) * b1;
1663 xmm3 +=
set( A(i+2UL,k) ) * b1;
1664 xmm4 +=
set( A(i+3UL,k) ) * b1;
1667 C.store( i , j, xmm1 );
1668 C.store( i+1UL, j, xmm2 );
1669 C.store( i+2UL, j, xmm3 );
1670 C.store( i+3UL, j, xmm4 );
1675 C.store( i , j,
zero );
1676 C.store( i+1UL, j,
zero );
1677 C.store( i+2UL, j,
zero );
1678 C.store( i+3UL, j,
zero );
1682 for( ; (i+3UL) <= iend; i+=3UL )
1684 const size_t kbegin( ( IsUpper_v<MT4> )
1685 ?( ( IsLower_v<MT5> )
1686 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1687 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1688 :( IsLower_v<MT5> ? j : 0UL ) );
1689 const size_t kend( ( IsLower_v<MT4> )
1690 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1702 for( ++k; k<kend; ++k ) {
1704 xmm1 +=
set( A(i ,k) ) * b1;
1705 xmm2 +=
set( A(i+1UL,k) ) * b1;
1706 xmm3 +=
set( A(i+2UL,k) ) * b1;
1709 C.store( i , j, xmm1 );
1710 C.store( i+1UL, j, xmm2 );
1711 C.store( i+2UL, j, xmm3 );
1721 for( ; (i+2UL) <= iend; i+=2UL )
1723 const size_t kbegin( ( IsUpper_v<MT4> )
1724 ?( ( IsLower_v<MT5> )
1725 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1726 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1727 :( IsLower_v<MT5> ? j : 0UL ) );
1728 const size_t kend( ( IsLower_v<MT4> )
1729 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1740 for( ++k; k<kend; ++k ) {
1742 xmm1 +=
set( A(i ,k) ) * b1;
1743 xmm2 +=
set( A(i+1UL,k) ) * b1;
1746 C.store( i , j, xmm1 );
1747 C.store( i+1UL, j, xmm2 );
1752 C.store( i , j,
zero );
1753 C.store( i+1UL, j,
zero );
1759 const size_t kbegin( ( IsUpper_v<MT4> )
1760 ?( ( IsLower_v<MT5> )
1761 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1762 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1763 :( IsLower_v<MT5> ? j : 0UL ) );
1771 for( ++k; k<K; ++k ) {
1772 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1775 C.store( i, j, xmm1 );
1780 C.store( i, j,
zero );
1789 for(
size_t jj=j; jj<jjend; ++jj ) {
1796 for( ; remainder && j<N; ++j )
1802 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1811 for( ; (i+2UL) <= M; i+=2UL )
1813 const size_t kbegin( ( IsUpper_v<MT4> )
1814 ?( ( IsLower_v<MT5> )
1815 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1816 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1817 :( IsLower_v<MT5> ? j : 0UL ) );
1818 const size_t kend( ( IsLower_v<MT4> )
1819 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1829 for( ++k; k<kend; ++k ) {
1830 value1 += A(i ,k) * B(k,j);
1831 value2 += A(i+1UL,k) * B(k,j);
1835 C(i+1UL,j) = value2;
1840 reset( C(i+1UL,j) );
1846 const size_t kbegin( ( IsUpper_v<MT4> )
1847 ?( ( IsLower_v<MT5> )
1848 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1849 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1850 :( IsLower_v<MT5> ? j : 0UL ) );
1858 for( ++k; k<K; ++k ) {
1859 value += A(i,k) * B(k,j);
// Small-matrix assignment kernel selected when the target MT3 is column-major and the
// vectorized default kernel applies (see the EnableIf_t condition below).
// Instead of multiplying directly, one operand is first materialized (via serial())
// into a temporary of its OppositeType, and the product with that temporary is
// handed back to assign() wrapped by the forward functor.
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
1889 template<
typename MT3
1892 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1893 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1900 const ForwardFunctor fwd;
// Operand choice, in order of the visible branches:
//  1. A is not resizable but B is  -> convert A.
1902 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
1903 const OppositeType_t<MT4> tmp(
serial( A ) );
1904 assign( C, fwd( tmp * B ) );
//  2. A is resizable but B is not  -> convert B.
1906 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
1907 const OppositeType_t<MT5> tmp(
serial( B ) );
1908 assign( C, fwd( A * tmp ) );
//  3. Otherwise, if A has no more elements (rows*columns) than B -> convert A.
1910 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1911 const OppositeType_t<MT4> tmp(
serial( A ) );
1912 assign( C, fwd( tmp * B ) );
//  4. NOTE(review): presumably the final else branch (its header is not visible here) -> convert B.
1915 const OppositeType_t<MT5> tmp(
serial( B ) );
1916 assign( C, fwd( A * tmp ) );
1935 template<
typename MT3
1938 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1939 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1941 selectDefaultAssignKernel( C, A, B );
1960 template<
typename MT3
1963 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1964 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1993 template<
typename MT3
1996 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1997 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1999 selectLargeAssignKernel( C, A, B );
2005#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based assignment kernel, enabled only when UseBlasKernel_v holds for the three
// matrix types. The visible code dispatches on the structure of the operands:
// a triangular A goes to trmm() from the left, a triangular B goes to trmm() from the
// right, and gemm() is called with scaling factors ET(1) and ET(0).
// \param C The target matrix.
// \param A The left-hand side operand.
// \param B The right-hand side operand.
2018 template<
typename MT3
2021 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2022 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
// Scalar type of the target; used to spell the factors passed to the BLAS wrappers.
2024 using ET = ElementType_t<MT3>;
2026 if( IsTriangular_v<MT4> ) {
// NOTE(review): a statement preceding this call is not visible in this view;
// presumably C is initialized with the other operand before trmm() - confirm.
2028 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
2030 else if( IsTriangular_v<MT5> ) {
2032 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// NOTE(review): presumably the general (non-triangular) fallback branch - its header is not visible here.
2035 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the multiplication expression to a sparse matrix. This overload is
// disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
// The expression is first evaluated serially into a temporary, which is then assigned
// to the target through the forward functor.
// \param lhs The target sparse matrix.
// \param rhs The multiplication expression to be assigned.
2055 template<
typename MT
2057 friend inline auto assign( SparseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
2058 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Temporary type: OppositeType when SO is true, ResultType otherwise.
// NOTE(review): presumably chosen to match the storage order of the target - confirm.
2062 using TmpType = If_t< SO, OppositeType, ResultType >;
2074 const ForwardFunctor fwd;
2076 const TmpType tmp(
serial( rhs ) );
2077 assign( *lhs, fwd( tmp ) );
// Restructuring assignment to a matrix with storage-order flag `true`, enabled only when
// CanExploitSymmetry_v<MT,MT1,MT2> holds (target is column-major and at least one
// operand is symmetric).
// Each operand is passed through transIf<>, parameterized on whether its type is
// symmetric, and the product of the results is re-dispatched to assign() through the
// forward functor.
// NOTE(review): presumably transIf<true> yields the transpose and transIf<false> the
// operand itself - confirm against its definition.
// \param lhs The target matrix.
// \param rhs The multiplication expression to be assigned.
2097 template<
typename MT >
2098 friend inline auto assign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
2099 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2108 const ForwardFunctor fwd;
2110 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
2111 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
2113 assign( *lhs, fwd( A * B ) );
2131 template<
typename MT
2133 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
2134 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
2141 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
2155 DMatDMatMultExpr::selectAddAssignKernel( *lhs, A, B );
2171 template<
typename MT3
2174 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2176 if( ( IsDiagonal_v<MT5> ) ||
2177 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
2178 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
2179 selectSmallAddAssignKernel( C, A, B );
2181 selectBlasAddAssignKernel( C, A, B );
2200 template<
typename MT3
2203 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2204 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2206 const size_t M( A.rows() );
2207 const size_t N( B.columns() );
2208 const size_t K( A.columns() );
2212 for(
size_t i=0UL; i<M; ++i )
2214 const size_t kbegin( ( IsUpper_v<MT4> )
2215 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2217 const size_t kend( ( IsLower_v<MT4> )
2218 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2222 for(
size_t k=kbegin; k<kend; ++k )
2224 const size_t jbegin( ( IsUpper_v<MT5> )
2225 ?( ( IsStrictlyUpper_v<MT5> )
2226 ?(
UPP ?
max(i,k+1UL) : k+1UL )
2227 :(
UPP ?
max(i,k) : k ) )
2228 :(
UPP ? i : 0UL ) );
2229 const size_t jend( ( IsLower_v<MT5> )
2230 ?( ( IsStrictlyLower_v<MT5> )
2231 ?(
LOW ?
min(i+1UL,k) : k )
2232 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
2233 :(
LOW ? i+1UL : N ) );
2235 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
2238 const size_t jnum( jend - jbegin );
2239 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
2242 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2243 C(i,j ) += A(i,k) * B(k,j );
2244 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
2247 C(i,jpos) += A(i,k) * B(k,jpos);
2269 template<
typename MT3
2272 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2273 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2277 const size_t M( A.rows() );
2278 const size_t N( B.columns() );
2280 for(
size_t i=0UL; i<M; ++i )
2282 const size_t jbegin( ( IsUpper_v<MT4> )
2283 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2285 const size_t jend( ( IsLower_v<MT4> )
2286 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
2290 const size_t jnum( jend - jbegin );
2291 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
2294 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2295 C(i,j ) += A(i,j ) * B(j ,j );
2296 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
2299 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
2320 template<
typename MT3
2323 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2324 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2328 const size_t M( A.rows() );
2329 const size_t N( B.columns() );
2331 for(
size_t i=0UL; i<M; ++i )
2333 const size_t jbegin( ( IsUpper_v<MT5> )
2334 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
2336 const size_t jend( ( IsLower_v<MT5> )
2337 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
2341 const size_t jnum( jend - jbegin );
2342 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
2345 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2346 C(i,j ) += A(i,i) * B(i,j );
2347 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
2350 C(i,jpos) += A(i,i) * B(i,jpos);
2371 template<
typename MT3
2374 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2375 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2379 for(
size_t i=0UL; i<A.rows(); ++i ) {
2380 C(i,i) += A(i,i) * B(i,i);
2400 template<
typename MT3
2403 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2404 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2406 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel (C += A * B) for a
// row-major target MT3, enabled when UseVectorizedDefaultKernel_v holds.
// The visible code works on panels of C that are several SIMD vectors wide
// (8, 5, 4, 3, 2, then 1) and one to four rows tall, followed by a scalar loop
// over any remaining columns. In every section the k range [kbegin,kend) is
// narrowed by the triangular structure of A (MT4) and B (MT5).
// NOTE(review): the outer loops over j, the accumulator declarations and the
// closing braces are on lines that are not part of this excerpt.
2426 template<
typename MT3
2429 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2430 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass is needed unless both C and B are padded.
2432 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
2434 const size_t M( A.rows() );
2435 const size_t N( B.columns() );
2436 const size_t K( A.columns() );
// Section: eight SIMD vectors wide, one row at a time. It follows the
// IsIntegral_v<ElementType> test, so it appears to be used for integral
// element types only (extent of the `if` not visible here).
2445 if( IsIntegral_v<ElementType> )
2448 for(
size_t i=0UL; i<M; ++i )
2450 const size_t kbegin( ( IsUpper_v<MT4> )
2451 ?( ( IsLower_v<MT5> )
2452 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2453 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2454 :( IsLower_v<MT5> ? j : 0UL ) );
// For an upper B the range also stops at the end of the current column panel.
2455 const size_t kend( ( IsLower_v<MT4> )
2456 ?( ( IsUpper_v<MT5> )
2457 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
2458 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2459 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
2470 for(
size_t k=kbegin; k<kend; ++k ) {
2472 xmm1 += a1 * B.load(k,j );
2473 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2474 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2475 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2476 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
2477 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
2478 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
2479 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write the accumulators back to row i of C.
2482 C.store( i, j , xmm1 );
2484 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2485 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
2486 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
2487 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
2488 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
2489 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Section: five SIMD vectors wide, two rows (i, i+1) per iteration.
2498 for( ; (i+2UL) <= M; i+=2UL )
2500 const size_t kbegin( ( IsUpper_v<MT4> )
2501 ?( ( IsLower_v<MT5> )
2502 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2503 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2504 :( IsLower_v<MT5> ? j : 0UL ) );
2505 const size_t kend( ( IsLower_v<MT4> )
2506 ?( ( IsUpper_v<MT5> )
2507 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
2508 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2509 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// Accumulators start from the current contents of C (add-assign semantics).
2516 SIMDType xmm6 ( C.load(i+1UL,j ) );
2522 for(
size_t k=kbegin; k<kend; ++k ) {
2542 C.store( i , j , xmm1 );
2544 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2545 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
2546 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
2547 C.store( i+1UL, j , xmm6 );
2548 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
2549 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
2550 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
2551 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Section: five SIMD vectors wide, single leftover row.
2556 const size_t kbegin( ( IsUpper_v<MT4> )
2557 ?( ( IsLower_v<MT5> )
2558 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2559 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2560 :( IsLower_v<MT5> ? j : 0UL ) );
2561 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
2569 for(
size_t k=kbegin; k<kend; ++k ) {
2571 xmm1 += a1 * B.load(k,j );
2572 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2573 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2574 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2575 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
2578 C.store( i, j , xmm1 );
2580 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2581 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
2582 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Section: four SIMD vectors wide, two rows per iteration.
2590 for( ; (i+2UL) <= M; i+=2UL )
2592 const size_t kbegin( ( IsUpper_v<MT4> )
2593 ?( ( IsLower_v<MT5> )
2594 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2595 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2596 :( IsLower_v<MT5> ? j : 0UL ) );
2597 const size_t kend( ( IsLower_v<MT4> )
2598 ?( ( IsUpper_v<MT5> )
2599 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
2600 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2601 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
2612 for(
size_t k=kbegin; k<kend; ++k ) {
2629 C.store( i , j , xmm1 );
2631 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2632 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
2633 C.store( i+1UL, j , xmm5 );
2634 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
2635 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
2636 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Section: four SIMD vectors wide, single leftover row.
2641 const size_t kbegin( ( IsUpper_v<MT4> )
2642 ?( ( IsLower_v<MT5> )
2643 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2644 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2645 :( IsLower_v<MT5> ? j : 0UL ) );
2646 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
2653 for(
size_t k=kbegin; k<kend; ++k ) {
2655 xmm1 += a1 * B.load(k,j );
2656 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2657 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2658 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2661 C.store( i, j , xmm1 );
2663 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2664 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Section: three SIMD vectors wide, two rows per iteration.
2672 for( ; (i+2UL) <= M; i+=2UL )
2674 const size_t kbegin( ( IsUpper_v<MT4> )
2675 ?( ( IsLower_v<MT5> )
2676 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2677 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2678 :( IsLower_v<MT5> ? j : 0UL ) );
2679 const size_t kend( ( IsLower_v<MT4> )
2680 ?( ( IsUpper_v<MT5> )
2681 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
2682 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2683 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
2692 for(
size_t k=kbegin; k<kend; ++k ) {
2706 C.store( i , j , xmm1 );
2708 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2709 C.store( i+1UL, j , xmm4 );
2710 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
2711 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Section: three SIMD vectors wide, single leftover row.
2716 const size_t kbegin( ( IsUpper_v<MT4> )
2717 ?( ( IsLower_v<MT5> )
2718 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2719 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2720 :( IsLower_v<MT5> ? j : 0UL ) );
2721 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
2727 for(
size_t k=kbegin; k<kend; ++k ) {
2729 xmm1 += a1 * B.load(k,j );
2730 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2731 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2734 C.store( i, j , xmm1 );
2736 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Section: two SIMD vectors wide. Rows start at j when the LOW flag is set,
// otherwise at 0 (the matching iend is declared on a line not shown here).
// Rows are processed four, then three, then two, then one at a time.
2743 size_t i(
LOW ? j : 0UL );
2745 for( ; (i+4UL) <= iend; i+=4UL )
2747 const size_t kbegin( ( IsUpper_v<MT4> )
2748 ?( ( IsLower_v<MT5> )
2749 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2750 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2751 :( IsLower_v<MT5> ? j : 0UL ) );
2752 const size_t kend( ( IsLower_v<MT4> )
2753 ?( ( IsUpper_v<MT5> )
2754 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
2755 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
2756 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2767 for(
size_t k=kbegin; k<kend; ++k ) {
2784 C.store( i , j , xmm1 );
2786 C.store( i+1UL, j , xmm3 );
2787 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
2788 C.store( i+2UL, j , xmm5 );
2789 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
2790 C.store( i+3UL, j , xmm7 );
2791 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Two SIMD vectors wide, three rows per iteration.
2794 for( ; (i+3UL) <= iend; i+=3UL )
2796 const size_t kbegin( ( IsUpper_v<MT4> )
2797 ?( ( IsLower_v<MT5> )
2798 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2799 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2800 :( IsLower_v<MT5> ? j : 0UL ) );
2801 const size_t kend( ( IsLower_v<MT4> )
2802 ?( ( IsUpper_v<MT5> )
2803 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
2804 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
2805 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2814 for(
size_t k=kbegin; k<kend; ++k ) {
2828 C.store( i , j , xmm1 );
2830 C.store( i+1UL, j , xmm3 );
2831 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
2832 C.store( i+2UL, j , xmm5 );
2833 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two SIMD vectors wide, two rows per iteration.
2836 for( ; (i+2UL) <= iend; i+=2UL )
2838 const size_t kbegin( ( IsUpper_v<MT4> )
2839 ?( ( IsLower_v<MT5> )
2840 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2841 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2842 :( IsLower_v<MT5> ? j : 0UL ) );
2843 const size_t kend( ( IsLower_v<MT4> )
2844 ?( ( IsUpper_v<MT5> )
2845 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
2846 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2847 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2854 for(
size_t k=kbegin; k<kend; ++k ) {
2865 C.store( i , j , xmm1 );
2867 C.store( i+1UL, j , xmm3 );
2868 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
// Two SIMD vectors wide, single leftover row.
2873 const size_t kbegin( ( IsUpper_v<MT4> )
2874 ?( ( IsLower_v<MT5> )
2875 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2876 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2877 :( IsLower_v<MT5> ? j : 0UL ) );
2878 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
2883 for(
size_t k=kbegin; k<kend; ++k ) {
2885 xmm1 += a1 * B.load(k,j );
2889 C.store( i, j , xmm1 );
// Section: one SIMD vector wide; rows again in groups of four, three, two, one.
// Each element of A is broadcast with set() and multiplied by the loaded
// vector b1 of B.
2897 size_t i(
LOW ? j : 0UL );
2899 for( ; (i+4UL) <= iend; i+=4UL )
2901 const size_t kbegin( ( IsUpper_v<MT4> )
2902 ?( ( IsLower_v<MT5> )
2903 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2904 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2905 :( IsLower_v<MT5> ? j : 0UL ) );
2906 const size_t kend( ( IsLower_v<MT4> )
2907 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
2915 for(
size_t k=kbegin; k<kend; ++k ) {
2917 xmm1 +=
set( A(i ,k) ) * b1;
2918 xmm2 +=
set( A(i+1UL,k) ) * b1;
2919 xmm3 +=
set( A(i+2UL,k) ) * b1;
2920 xmm4 +=
set( A(i+3UL,k) ) * b1;
2923 C.store( i , j, xmm1 );
2924 C.store( i+1UL, j, xmm2 );
2925 C.store( i+2UL, j, xmm3 );
2926 C.store( i+3UL, j, xmm4 );
// One SIMD vector wide, three rows per iteration.
2929 for( ; (i+3UL) <= iend; i+=3UL )
2931 const size_t kbegin( ( IsUpper_v<MT4> )
2932 ?( ( IsLower_v<MT5> )
2933 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2934 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2935 :( IsLower_v<MT5> ? j : 0UL ) );
2936 const size_t kend( ( IsLower_v<MT4> )
2937 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
2944 for(
size_t k=kbegin; k<kend; ++k ) {
2946 xmm1 +=
set( A(i ,k) ) * b1;
2947 xmm2 +=
set( A(i+1UL,k) ) * b1;
2948 xmm3 +=
set( A(i+2UL,k) ) * b1;
2951 C.store( i , j, xmm1 );
2952 C.store( i+1UL, j, xmm2 );
2953 C.store( i+2UL, j, xmm3 );
// One SIMD vector wide, two rows per iteration.
2956 for( ; (i+2UL) <= iend; i+=2UL )
2958 const size_t kbegin( ( IsUpper_v<MT4> )
2959 ?( ( IsLower_v<MT5> )
2960 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2961 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2962 :( IsLower_v<MT5> ? j : 0UL ) );
2963 const size_t kend( ( IsLower_v<MT4> )
2964 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2970 for(
size_t k=kbegin; k<kend; ++k ) {
2972 xmm1 +=
set( A(i ,k) ) * b1;
2973 xmm2 +=
set( A(i+1UL,k) ) * b1;
2976 C.store( i , j, xmm1 );
2977 C.store( i+1UL, j, xmm2 );
// One SIMD vector wide, single leftover row; k runs up to K here.
2982 const size_t kbegin( ( IsUpper_v<MT4> )
2983 ?( ( IsLower_v<MT5> )
2984 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2985 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2986 :( IsLower_v<MT5> ? j : 0UL ) );
2990 for(
size_t k=kbegin; k<K; ++k ) {
2991 xmm1 +=
set( A(i,k) ) * B.load(k,j);
2994 C.store( i, j, xmm1 );
// Scalar remainder: runs only when `remainder` is true and handles the columns
// j < N that were not covered above, one column at a time.
2998 for( ; remainder && j<N; ++j )
// Row range of column j, restricted by the UPP / LOW flags.
3000 const size_t iend(
UPP ? j+1UL : M );
3001 size_t i(
LOW ? j : 0UL );
// Two rows per iteration.
3003 for( ; (i+2UL) <= iend; i+=2UL )
3005 const size_t kbegin( ( IsUpper_v<MT4> )
3006 ?( ( IsLower_v<MT5> )
3007 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3008 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3009 :( IsLower_v<MT5> ? j : 0UL ) );
3010 const size_t kend( ( IsLower_v<MT4> )
3011 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3017 for(
size_t k=kbegin; k<kend; ++k ) {
3018 value1 += A(i ,k) * B(k,j);
3019 value2 += A(i+1UL,k) * B(k,j);
3023 C(i+1UL,j) = value2;
// Single leftover row of the scalar remainder.
3028 const size_t kbegin( ( IsUpper_v<MT4> )
3029 ?( ( IsLower_v<MT5> )
3030 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3031 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3032 :( IsLower_v<MT5> ? j : 0UL ) );
3036 for(
size_t k=kbegin; k<K; ++k ) {
3037 value += A(i,k) * B(k,j);
// Vectorized small-matrix addition-assignment kernel for a column-major target
// MT3. Instead of computing the product directly, one operand is copied into
// its OppositeType (evaluated through serial()) and addAssign is re-dispatched
// on the resulting product expression.
// NOTE(review): OppositeType_t presumably yields the same matrix with the
// opposite storage order - confirm against its definition.
3062 template<
typename MT3
3065 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3066 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3073 const ForwardFunctor fwd;
// Only B is resizable: convert A.
3075 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
3076 const OppositeType_t<MT4> tmp(
serial( A ) );
3077 addAssign( C, fwd( tmp * B ) );
// Only A is resizable: convert B.
3079 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
3080 const OppositeType_t<MT5> tmp(
serial( B ) );
3081 addAssign( C, fwd( A * tmp ) );
// Otherwise convert whichever operand has fewer elements (A on a tie).
3083 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
3084 const OppositeType_t<MT4> tmp(
serial( A ) );
3085 addAssign( C, fwd( tmp * B ) );
3088 const OppositeType_t<MT5> tmp(
serial( B ) );
3089 addAssign( C, fwd( A * tmp ) );
// Large-matrix addition-assignment kernel used when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t): forwards to
// the default addition-assignment kernel.
3109 template<
typename MT3
3112 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3113 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3115 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not part of this excerpt.
3135 template<
typename MT3
3138 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3139 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS addition-assignment entry point used when UseBlasKernel_v<MT3,MT4,MT5>
// is false (DisableIf_t): falls back to the large-matrix kernel.
3165 template<
typename MT3
3168 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3169 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3171 selectLargeAddAssignKernel( C, A, B );
3177#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based addition-assignment kernel, enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
// Triangular operands go through trmm on a temporary copy of the other operand,
// which is then add-assigned to C; the general case calls gemm directly on C.
3191 template<
typename MT3
3194 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3195 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3197 using ET = ElementType_t<MT3>;
// A is triangular: tmp starts as a copy of B and is multiplied by A from the left.
3199 if( IsTriangular_v<MT4> ) {
3200 ResultType_t<MT3> tmp(
serial( B ) );
3201 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3202 addAssign( C, tmp );
// B is triangular: tmp starts as a copy of A and is multiplied by B from the right.
3204 else if( IsTriangular_v<MT5> ) {
3205 ResultType_t<MT3> tmp(
serial( A ) );
3206 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3207 addAssign( C, tmp );
// General case. NOTE(review): the two trailing scalars are presumably alpha and
// beta (both 1, i.e. C = A*B + C) - confirm against the gemm wrapper.
3210 gemm( C, A, B, ET(1), ET(1) );
// Addition assignment of the product to a Matrix<MT,true> target, enabled only
// when CanExploitSymmetry_v<MT,MT1,MT2> holds. Each operand is passed through
// transIf<> keyed on whether its type is symmetric, and the product of the two
// results is add-assigned to the target.
// NOTE(review): transIf<true> presumably returns the transpose view of its
// argument - confirm against its definition.
3232 template<
typename MT >
3233 friend inline auto addAssign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
3234 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3243 const ForwardFunctor fwd;
3245 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
3246 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
3248 addAssign( *lhs, fwd( A * B ) );
// Subtraction assignment of the product to a dense matrix, used when
// CanExploitSymmetry_v<MT,MT1,MT2> is false (DisableIf_t).
// NOTE(review): the definitions of A and B and the body of the emptiness check
// are on lines not shown in this excerpt.
3270 template<
typename MT
3272 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
3273 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Degenerate sizes: target has no rows/columns or the inner dimension is zero.
3280 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Hand over to the kernel selection.
3294 DMatDMatMultExpr::selectSubAssignKernel( *lhs, A, B );
// Chooses the subtraction-assignment kernel. The small kernel is used when any
// of the following holds:
//  - MT5 is diagonal,
//  - BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10 columns,
//  - C has fewer than DMATDMATMULT_THRESHOLD elements.
// Otherwise the BLAS kernel is called (the `else` itself is on a line not shown
// in this excerpt).
3310 template<
typename MT3
3313 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3315 if( ( IsDiagonal_v<MT5> ) ||
3316 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
3317 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3318 selectSmallSubAssignKernel( C, A, B );
3320 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel for two non-diagonal operands:
// C(i,j) -= A(i,k) * B(k,j), iterating rows i, then k, then columns j.
// The k range follows the triangular structure of A (MT4); the j range follows
// the structure of B (MT5) and the class-level LOW / UPP flags.
3339 template<
typename MT3
3342 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3343 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3345 const size_t M( A.rows() );
3346 const size_t N( B.columns() );
3347 const size_t K( A.columns() );
3351 for(
size_t i=0UL; i<M; ++i )
// k range of row i (fallback branches are on lines not shown here).
3353 const size_t kbegin( ( IsUpper_v<MT4> )
3354 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3356 const size_t kend( ( IsLower_v<MT4> )
3357 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3361 for(
size_t k=kbegin; k<kend; ++k )
// First column: bounded below by B's upper structure and, if UPP, by i.
3363 const size_t jbegin( ( IsUpper_v<MT5> )
3364 ?( ( IsStrictlyUpper_v<MT5> )
3365 ?(
UPP ?
max(i,k+1UL) : k+1UL )
3366 :(
UPP ?
max(i,k) : k ) )
3367 :(
UPP ? i : 0UL ) );
// One past the last column: bounded above by B's lower structure and, if LOW, by i+1.
3368 const size_t jend( ( IsLower_v<MT5> )
3369 ?( ( IsStrictlyLower_v<MT5> )
3370 ?(
LOW ?
min(i+1UL,k) : k )
3371 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
3372 :(
LOW ? i+1UL : N ) );
// With LOW or UPP the range can be empty; skip before computing jend - jbegin.
3374 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos ends the two-column unrolled loop; prevMultiple( jnum, 2UL ) presumably
// rounds jnum down to a multiple of 2 (helper defined elsewhere).
3377 const size_t jnum( jend - jbegin );
3378 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
3381 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3382 C(i,j ) -= A(i,k) * B(k,j );
3383 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Leftover column at jpos (its guarding condition is on a line not shown here).
3386 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction-assignment kernel for a non-diagonal left operand (MT4)
// and a diagonal right operand (MT5). Only the diagonal entry B(j,j) is read:
// C(i,j) -= A(i,j) * B(j,j).
3408 template<
typename MT3
3411 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3412 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3416 const size_t M( A.rows() );
3417 const size_t N( B.columns() );
3419 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the triangular structure of MT4
// (fallback branches are on lines not shown here).
3421 const size_t jbegin( ( IsUpper_v<MT4> )
3422 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3424 const size_t jend( ( IsLower_v<MT4> )
3425 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos ends the two-column unrolled loop; prevMultiple( jnum, 2UL ) presumably
// rounds jnum down to a multiple of 2 (helper defined elsewhere).
3429 const size_t jnum( jend - jbegin );
3430 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
3433 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3434 C(i,j ) -= A(i,j ) * B(j ,j );
3435 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Leftover column at jpos (its guarding condition is on a line not shown here).
3438 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction-assignment kernel for a diagonal left operand (MT4) and a
// non-diagonal right operand (MT5). Only the diagonal entry A(i,i) is read:
// C(i,j) -= A(i,i) * B(i,j).
3459 template<
typename MT3
3462 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3463 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3467 const size_t M( A.rows() );
3468 const size_t N( B.columns() );
3470 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the triangular structure of MT5
// (fallback branches are on lines not shown here).
3472 const size_t jbegin( ( IsUpper_v<MT5> )
3473 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3475 const size_t jend( ( IsLower_v<MT5> )
3476 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos ends the two-column unrolled loop; prevMultiple( jnum, 2UL ) presumably
// rounds jnum down to a multiple of 2 (helper defined elsewhere).
3480 const size_t jnum( jend - jbegin );
3481 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
3484 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3485 C(i,j ) -= A(i,i) * B(i,j );
3486 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Leftover column at jpos (its guarding condition is on a line not shown here).
3489 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel for two diagonal operands (see the
// EnableIf_t condition). Only the diagonal of C is touched:
// C(i,i) -= A(i,i) * B(i,i) for every i in [0, A.rows()).
3510 template<
typename MT3
3513 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3514 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3518 for(
size_t i=0UL; i<A.rows(); ++i ) {
3519 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel used when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t): forwards to
// the default (scalar) subtraction-assignment kernel.
3539 template<
typename MT3
3542 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3543 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3545 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel (C -= A * B) for a
// row-major target MT3, enabled when UseVectorizedDefaultKernel_v holds.
// The visible code works on panels of C that are several SIMD vectors wide
// (8, 5, 4, 3, 2, then 1) and one to four rows tall, followed by a scalar loop
// over any remaining columns. In every section the k range [kbegin,kend) is
// narrowed by the triangular structure of A (MT4) and B (MT5).
// NOTE(review): the outer loops over j, the accumulator declarations and the
// closing braces are on lines that are not part of this excerpt.
3565 template<
typename MT3
3568 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3569 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass is needed unless both C and B are padded.
3571 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3573 const size_t M( A.rows() );
3574 const size_t N( B.columns() );
3575 const size_t K( A.columns() );
// Section: eight SIMD vectors wide, one row at a time. It follows the
// IsIntegral_v<ElementType> test, so it appears to be used for integral
// element types only (extent of the `if` not visible here).
3584 if( IsIntegral_v<ElementType> )
3587 for(
size_t i=0UL; i<M; ++i )
3589 const size_t kbegin( ( IsUpper_v<MT4> )
3590 ?( ( IsLower_v<MT5> )
3591 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3592 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3593 :( IsLower_v<MT5> ? j : 0UL ) );
// For an upper B the range also stops at the end of the current column panel.
3594 const size_t kend( ( IsLower_v<MT4> )
3595 ?( ( IsUpper_v<MT5> )
3596 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
3597 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3598 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
3609 for(
size_t k=kbegin; k<kend; ++k ) {
3611 xmm1 -= a1 * B.load(k,j );
3612 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3613 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3614 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3615 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
3616 xmm6 -= a1 * B.load(k,j+
SIMDSIZE*5UL);
3617 xmm7 -= a1 * B.load(k,j+
SIMDSIZE*6UL);
3618 xmm8 -= a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write the accumulators back to row i of C.
3621 C.store( i, j , xmm1 );
3623 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3624 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3625 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3626 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
3627 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
3628 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Section: five SIMD vectors wide, two rows (i, i+1) per iteration.
3637 for( ; (i+2UL) <= M; i+=2UL )
3639 const size_t kbegin( ( IsUpper_v<MT4> )
3640 ?( ( IsLower_v<MT5> )
3641 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3642 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3643 :( IsLower_v<MT5> ? j : 0UL ) );
3644 const size_t kend( ( IsLower_v<MT4> )
3645 ?( ( IsUpper_v<MT5> )
3646 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
3647 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3648 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// Accumulators start from the current contents of C (sub-assign semantics).
3655 SIMDType xmm6 ( C.load(i+1UL,j ) );
3661 for(
size_t k=kbegin; k<kend; ++k ) {
3681 C.store( i , j , xmm1 );
3683 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3684 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3685 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
3686 C.store( i+1UL, j , xmm6 );
3687 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
3688 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
3689 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
3690 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Section: five SIMD vectors wide, single leftover row.
3695 const size_t kbegin( ( IsUpper_v<MT4> )
3696 ?( ( IsLower_v<MT5> )
3697 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3698 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3699 :( IsLower_v<MT5> ? j : 0UL ) );
3700 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
3708 for(
size_t k=kbegin; k<kend; ++k ) {
3710 xmm1 -= a1 * B.load(k,j );
3711 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3712 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3713 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3714 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
3717 C.store( i, j , xmm1 );
3719 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3720 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3721 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Section: four SIMD vectors wide, two rows per iteration.
3729 for( ; (i+2UL) <= M; i+=2UL )
3731 const size_t kbegin( ( IsUpper_v<MT4> )
3732 ?( ( IsLower_v<MT5> )
3733 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3734 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3735 :( IsLower_v<MT5> ? j : 0UL ) );
3736 const size_t kend( ( IsLower_v<MT4> )
3737 ?( ( IsUpper_v<MT5> )
3738 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
3739 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3740 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
3751 for(
size_t k=kbegin; k<kend; ++k ) {
3768 C.store( i , j , xmm1 );
3770 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3771 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3772 C.store( i+1UL, j , xmm5 );
3773 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
3774 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
3775 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Section: four SIMD vectors wide, single leftover row.
3780 const size_t kbegin( ( IsUpper_v<MT4> )
3781 ?( ( IsLower_v<MT5> )
3782 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3783 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3784 :( IsLower_v<MT5> ? j : 0UL ) );
3785 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
3792 for(
size_t k=kbegin; k<kend; ++k ) {
3794 xmm1 -= a1 * B.load(k,j );
3795 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3796 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3797 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3800 C.store( i, j , xmm1 );
3802 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3803 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Section: three SIMD vectors wide, two rows per iteration.
3811 for( ; (i+2UL) <= M; i+=2UL )
3813 const size_t kbegin( ( IsUpper_v<MT4> )
3814 ?( ( IsLower_v<MT5> )
3815 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3816 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3817 :( IsLower_v<MT5> ? j : 0UL ) );
3818 const size_t kend( ( IsLower_v<MT4> )
3819 ?( ( IsUpper_v<MT5> )
3820 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
3821 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3822 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
3831 for(
size_t k=kbegin; k<kend; ++k ) {
3845 C.store( i , j , xmm1 );
3847 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3848 C.store( i+1UL, j , xmm4 );
3849 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
3850 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Section: three SIMD vectors wide, single leftover row.
3855 const size_t kbegin( ( IsUpper_v<MT4> )
3856 ?( ( IsLower_v<MT5> )
3857 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3858 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3859 :( IsLower_v<MT5> ? j : 0UL ) );
3860 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
3866 for(
size_t k=kbegin; k<kend; ++k ) {
3868 xmm1 -= a1 * B.load(k,j );
3869 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3870 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3873 C.store( i, j , xmm1 );
3875 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Section: two SIMD vectors wide. Rows start at j when the LOW flag is set,
// otherwise at 0 (the matching iend is declared on a line not shown here).
// Rows are processed four, then three, then two, then one at a time.
3882 size_t i(
LOW ? j : 0UL );
3884 for( ; (i+4UL) <= iend; i+=4UL )
3886 const size_t kbegin( ( IsUpper_v<MT4> )
3887 ?( ( IsLower_v<MT5> )
3888 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3889 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3890 :( IsLower_v<MT5> ? j : 0UL ) );
3891 const size_t kend( ( IsLower_v<MT4> )
3892 ?( ( IsUpper_v<MT5> )
3893 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
3894 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3895 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3906 for(
size_t k=kbegin; k<kend; ++k ) {
3923 C.store( i , j , xmm1 );
3925 C.store( i+1UL, j , xmm3 );
3926 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3927 C.store( i+2UL, j , xmm5 );
3928 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
3929 C.store( i+3UL, j , xmm7 );
3930 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Two SIMD vectors wide, three rows per iteration.
3933 for( ; (i+3UL) <= iend; i+=3UL )
3935 const size_t kbegin( ( IsUpper_v<MT4> )
3936 ?( ( IsLower_v<MT5> )
3937 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3938 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3939 :( IsLower_v<MT5> ? j : 0UL ) );
3940 const size_t kend( ( IsLower_v<MT4> )
3941 ?( ( IsUpper_v<MT5> )
3942 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
3943 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3944 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3953 for(
size_t k=kbegin; k<kend; ++k ) {
3967 C.store( i , j , xmm1 );
3969 C.store( i+1UL, j , xmm3 );
3970 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3971 C.store( i+2UL, j , xmm5 );
3972 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Two SIMD vectors wide, two rows per iteration.
3975 for( ; (i+2UL) <= iend; i+=2UL )
3977 const size_t kbegin( ( IsUpper_v<MT4> )
3978 ?( ( IsLower_v<MT5> )
3979 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3980 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3981 :( IsLower_v<MT5> ? j : 0UL ) );
3982 const size_t kend( ( IsLower_v<MT4> )
3983 ?( ( IsUpper_v<MT5> )
3984 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
3985 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3986 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3993 for(
size_t k=kbegin; k<kend; ++k ) {
4004 C.store( i , j , xmm1 );
4006 C.store( i+1UL, j , xmm3 );
4007 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
// Two SIMD vectors wide, single leftover row.
4012 const size_t kbegin( ( IsUpper_v<MT4> )
4013 ?( ( IsLower_v<MT5> )
4014 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4015 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4016 :( IsLower_v<MT5> ? j : 0UL ) );
4017 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
4022 for(
size_t k=kbegin; k<kend; ++k ) {
4024 xmm1 -= a1 * B.load(k,j );
4028 C.store( i, j , xmm1 );
// Section: one SIMD vector wide; rows again in groups of four, three, two, one.
// Each element of A is broadcast with set() and multiplied by the loaded
// vector b1 of B.
4036 size_t i(
LOW ? j : 0UL );
4038 for( ; (i+4UL) <= iend; i+=4UL )
4040 const size_t kbegin( ( IsUpper_v<MT4> )
4041 ?( ( IsLower_v<MT5> )
4042 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4043 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4044 :( IsLower_v<MT5> ? j : 0UL ) );
4045 const size_t kend( ( IsLower_v<MT4> )
4046 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
4054 for(
size_t k=kbegin; k<kend; ++k ) {
4056 xmm1 -=
set( A(i ,k) ) * b1;
4057 xmm2 -=
set( A(i+1UL,k) ) * b1;
4058 xmm3 -=
set( A(i+2UL,k) ) * b1;
4059 xmm4 -=
set( A(i+3UL,k) ) * b1;
4062 C.store( i , j, xmm1 );
4063 C.store( i+1UL, j, xmm2 );
4064 C.store( i+2UL, j, xmm3 );
4065 C.store( i+3UL, j, xmm4 );
// One SIMD vector wide, three rows per iteration.
4068 for( ; (i+3UL) <= iend; i+=3UL )
4070 const size_t kbegin( ( IsUpper_v<MT4> )
4071 ?( ( IsLower_v<MT5> )
4072 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4073 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4074 :( IsLower_v<MT5> ? j : 0UL ) );
4075 const size_t kend( ( IsLower_v<MT4> )
4076 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
4083 for(
size_t k=kbegin; k<kend; ++k ) {
4085 xmm1 -=
set( A(i ,k) ) * b1;
4086 xmm2 -=
set( A(i+1UL,k) ) * b1;
4087 xmm3 -=
set( A(i+2UL,k) ) * b1;
4090 C.store( i , j, xmm1 );
4091 C.store( i+1UL, j, xmm2 );
4092 C.store( i+2UL, j, xmm3 );
// One SIMD vector wide, two rows per iteration.
4095 for( ; (i+2UL) <= iend; i+=2UL )
4097 const size_t kbegin( ( IsUpper_v<MT4> )
4098 ?( ( IsLower_v<MT5> )
4099 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4100 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4101 :( IsLower_v<MT5> ? j : 0UL ) );
4102 const size_t kend( ( IsLower_v<MT4> )
4103 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4109 for(
size_t k=kbegin; k<kend; ++k ) {
4111 xmm1 -=
set( A(i ,k) ) * b1;
4112 xmm2 -=
set( A(i+1UL,k) ) * b1;
4115 C.store( i , j, xmm1 );
4116 C.store( i+1UL, j, xmm2 );
// One SIMD vector wide, single leftover row; k runs up to K here.
4121 const size_t kbegin( ( IsUpper_v<MT4> )
4122 ?( ( IsLower_v<MT5> )
4123 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4124 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4125 :( IsLower_v<MT5> ? j : 0UL ) );
4129 for(
size_t k=kbegin; k<K; ++k ) {
4130 xmm1 -=
set( A(i,k) ) * B.load(k,j);
4133 C.store( i, j, xmm1 );
// Scalar remainder: runs only when `remainder` is true and handles the columns
// j < N that were not covered above, one column at a time.
4137 for( ; remainder && j<N; ++j )
// Row range of column j, restricted by the UPP / LOW flags.
4139 const size_t iend(
UPP ? j+1UL : M );
4140 size_t i(
LOW ? j : 0UL );
// Two rows per iteration.
4142 for( ; (i+2UL) <= iend; i+=2UL )
4144 const size_t kbegin( ( IsUpper_v<MT4> )
4145 ?( ( IsLower_v<MT5> )
4146 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4147 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4148 :( IsLower_v<MT5> ? j : 0UL ) );
4149 const size_t kend( ( IsLower_v<MT4> )
4150 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4156 for(
size_t k=kbegin; k<kend; ++k ) {
4157 value1 -= A(i ,k) * B(k,j);
4158 value2 -= A(i+1UL,k) * B(k,j);
4162 C(i+1UL,j) = value2;
// Single leftover row of the scalar remainder.
4167 const size_t kbegin( ( IsUpper_v<MT4> )
4168 ?( ( IsLower_v<MT5> )
4169 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4170 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4171 :( IsLower_v<MT5> ? j : 0UL ) );
4175 for(
size_t k=kbegin; k<K; ++k ) {
4176 value -= A(i,k) * B(k,j);
// Vectorized small-matrix subtraction-assignment kernel for a column-major
// target MT3. Instead of computing the product directly, one operand is copied
// into its OppositeType (evaluated through serial()) and subAssign is
// re-dispatched on the resulting product expression.
// NOTE(review): OppositeType_t presumably yields the same matrix with the
// opposite storage order - confirm against its definition.
4201 template<
typename MT3
4204 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4205 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4212 const ForwardFunctor fwd;
// Only B is resizable: convert A.
4214 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
4215 const OppositeType_t<MT4> tmp(
serial( A ) );
4216 subAssign( C, fwd( tmp * B ) );
// Only A is resizable: convert B.
4218 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
4219 const OppositeType_t<MT5> tmp(
serial( B ) );
4220 subAssign( C, fwd( A * tmp ) );
// Otherwise convert whichever operand has fewer elements (A on a tie).
4222 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4223 const OppositeType_t<MT4> tmp(
serial( A ) );
4224 subAssign( C, fwd( tmp * B ) );
4227 const OppositeType_t<MT5> tmp(
serial( B ) );
4228 subAssign( C, fwd( A * tmp ) );
// Large-matrix subtraction-assignment kernel used when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false (DisableIf_t): forwards to
// the default subtraction-assignment kernel.
4248 template<
typename MT3
4251 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4252 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4254 selectDefaultSubAssignKernel( C, A, B );
// Large-matrix subtraction-assignment kernel enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is true.
// NOTE(review): the function body is not part of this excerpt.
4274 template<
typename MT3
4277 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4278 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction-assignment entry point used when UseBlasKernel_v<MT3,MT4,MT5>
// is false (DisableIf_t): falls back to the large-matrix kernel.
4304 template<
typename MT3
4307 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4308 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4310 selectLargeSubAssignKernel( C, A, B );
4316#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS-based subtraction-assignment kernel, enabled when
// UseBlasKernel_v<MT3,MT4,MT5> is true.
// Triangular operands go through trmm on a temporary copy of the other operand,
// which is then sub-assigned to C; the general case calls gemm directly on C.
4330 template<
typename MT3
4333 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4334 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4336 using ET = ElementType_t<MT3>;
// A is triangular: tmp starts as a copy of B and is multiplied by A from the left.
4338 if( IsTriangular_v<MT4> ) {
4339 ResultType_t<MT3> tmp(
serial( B ) );
4340 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4341 subAssign( C, tmp );
// B is triangular: tmp starts as a copy of A and is multiplied by B from the right.
4343 else if( IsTriangular_v<MT5> ) {
4344 ResultType_t<MT3> tmp(
serial( A ) );
4345 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4346 subAssign( C, tmp );
// General case. NOTE(review): the two trailing scalars are presumably alpha and
// beta (-1 and 1, i.e. C = -A*B + C) - confirm against the gemm wrapper.
4349 gemm( C, A, B, ET(-1), ET(1) );
// Subtraction assignment of the product to a Matrix<MT,true> target, enabled
// only when CanExploitSymmetry_v<MT,MT1,MT2> holds. Each operand is passed
// through transIf<> keyed on whether its type is symmetric, and the product of
// the two results is sub-assigned to the target.
// NOTE(review): transIf<true> presumably returns the transpose view of its
// argument - confirm against its definition.
4371 template<
typename MT >
4372 friend inline auto subAssign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
4373 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4382 const ForwardFunctor fwd;
4384 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4385 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4387 subAssign( *lhs, fwd( A * B ) );
// Schur-product assignment of the product expression to a dense matrix. The
// visible code forwards a value named tmp to schurAssign on the target.
// NOTE(review): the construction of tmp is on lines not shown in this excerpt.
4409 template<
typename MT
4411 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
4423 schurAssign( *lhs, tmp );
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the line naming this function is not visible in this excerpt;
// presumably one of the SMP assignment functions - confirm.
// The visible body distinguishes an empty target (zero rows or columns) from
// an empty inner dimension (rhs.lhs_.columns() == 0); the actions taken in
// those branches are not visible here.
4455 template<
typename MT
4458 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4465 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
4468 else if( rhs.lhs_.columns() == 0UL ) {
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the line naming this function is not visible in this excerpt.
// The expression is first evaluated into a temporary whose type is
// OppositeType when SO is true and ResultType otherwise; how the temporary is
// then used is not visible here.
4503 template<
typename MT
4506 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4510 using TmpType = If_t< SO, OppositeType, ResultType >;
4522 const ForwardFunctor fwd;
4524 const TmpType tmp( rhs );
// Symmetry-exploiting overload, enabled when CanExploitSymmetry_v<MT,MT1,MT2>
// is true.
// NOTE(review): the line naming this function and the final statement are not
// visible in this excerpt.
// A and B are the operands passed through transIf, which is parameterised on
// whether the respective operand type is symmetric.
4545 template<
typename MT >
4547 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4556 const ForwardFunctor fwd;
4558 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4559 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the line naming this function is not visible in this excerpt;
// presumably an SMP compound assignment - confirm.
// The visible guard treats an empty target and an empty inner dimension
// (rhs.lhs_.columns() == 0) as one combined case.
4582 template<
typename MT
4585 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4592 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Symmetry-exploiting overload, enabled when CanExploitSymmetry_v<MT,MT1,MT2>
// is true.
// NOTE(review): the line naming this function and the final statement are not
// visible in this excerpt.
// Builds A and B from the two operands via transIf, parameterised on the
// symmetry of the respective operand type.
4626 template<
typename MT >
4628 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4637 const ForwardFunctor fwd;
4639 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4640 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
// Overload enabled when IsEvaluationRequired_v<MT,MT1,MT2> is true.
// NOTE(review): the line naming this function is not visible in this excerpt;
// presumably an SMP compound assignment - confirm.
// The visible guard covers an empty target or an empty inner dimension.
4667 template<
typename MT
4670 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4677 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Symmetry-exploiting overload, enabled when CanExploitSymmetry_v<MT,MT1,MT2>
// is true.
// NOTE(review): the line naming this function and the final statement are not
// visible in this excerpt.
// Builds A and B from the two operands via transIf, parameterised on the
// symmetry of the respective operand type.
4711 template<
typename MT >
4713 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4722 const ForwardFunctor fwd;
4724 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.lhs_ ) );
4725 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.rhs_ ) );
4749 template<
typename MT
4809template<
typename MT1
4816class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4817 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4818 ,
private Computation
// Shorthand aliases for the wrapped matrix/matrix multiplication expression
// (MMM), its result type (RES) and the result, element and composite types of
// the two matrix operands.
4823 using MMM = DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4825 using RES = ResultType_t<MMM>;
4826 using RT1 = ResultType_t<MT1>;
4827 using RT2 = ResultType_t<MT2>;
4828 using ET1 = ElementType_t<RT1>;
4829 using ET2 = ElementType_t<RT2>;
4830 using CT1 = CompositeType_t<MT1>;
4831 using CT2 = CompositeType_t<MT2>;
// An operand needs evaluation if it is itself a computation or if its type
// requires an intermediate evaluation.
4836 static constexpr bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4841 static constexpr bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Structure flags derived from the SF/HF/LF/UF template arguments:
//  SYM  - SF is set and none of HF, LF, UF is set
//  HERM - HF is set and neither LF nor UF is set
//  LOW  - LF is set, or UF is combined with SF or HF
//  UPP  - UF is set, or LF is combined with SF or HF
4845 static constexpr bool SYM = ( SF && !( HF || LF || UF ) );
4846 static constexpr bool HERM = ( HF && !( LF || UF ) );
4847 static constexpr bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4848 static constexpr bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// True when the target type T1 is a column-major matrix and at least one of
// the operand types T2, T3 is symmetric. Used to select the
// symmetry-exploiting assignment overloads.
4857 template<
typename T1,
typename T2,
typename T3 >
4858 static constexpr bool CanExploitSymmetry_v =
4859 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
// True when at least one operand has to be evaluated (evaluateLeft or
// evaluateRight) and the symmetry-exploiting path is not applicable for the
// given type triple.
4867 template<
typename T1,
typename T2,
typename T3 >
4868 static constexpr bool IsEvaluationRequired_v =
4869 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
// Compile-time switch for the BLAS kernels (T1: target, T2/T3: operands,
// T4: scalar type). The visible conditions require that
//  - none of the SYM/HERM/LOW/UPP structure flags is set,
//  - all three matrices are contiguous, the target offers mutable and the
//    operands const data access,
//  - neither operand is diagonal and all three types are SIMD enabled,
//  - all element types are BLAS compatible and identical,
//  - the combination "builtin element type with complex scalar" is excluded.
// NOTE(review): the first line(s) of the condition are not visible in this
// excerpt, so further requirements may apply.
4876 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4877 static constexpr bool UseBlasKernel_v =
4879 !SYM && !HERM && !LOW && !UPP &&
4880 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4881 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4882 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4883 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4884 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4885 IsBLASCompatible_v< ElementType_t<T1> > &&
4886 IsBLASCompatible_v< ElementType_t<T2> > &&
4887 IsBLASCompatible_v< ElementType_t<T3> > &&
4888 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4889 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4890 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch for the vectorized default kernels (T1: target,
// T2/T3: operands, T4: scalar type). Requires useOptimizedKernels, a
// non-diagonal right-hand operand T3, SIMD-enabled matrix types, SIMD
// combinable element types and SIMD addition/multiplication support for the
// operand element types.
// NOTE(review): the remaining arguments of IsSIMDCombinable_v are on lines
// that are not visible in this excerpt.
4897 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4898 static constexpr bool UseVectorizedDefaultKernel_v =
4899 ( useOptimizedKernels &&
4900 !IsDiagonal_v<T3> &&
4901 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4902 IsSIMDCombinable_v< ElementType_t<T1>
4906 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4907 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Functor type used to wrap intermediate products; selected via a chain of
// If_t that starts with the HERM flag.
// NOTE(review): the remaining alternatives are not visible in this excerpt.
4914 using ForwardFunctor =
If_t< HERM
// Type of this expression and of its base class.
4930 using This = DMatScalarMultExpr<MMM,ST,false>;
4933 using BaseType = MatScalarMultExpr< DenseMatrix<This,false> >;
// Tail of a nested type selection that picks one of the Decl*Trait wrappers
// around MultTrait_t<RES,ST>, or plain MultTrait<RES,ST>, and takes its
// nested ::Type.
// NOTE(review): the alias name and the selecting conditions are on lines that
// are not visible here; presumably this is the result type, chosen by the
// HERM/SYM/LOW/UPP flags - confirm.
4937 , DeclHermTrait< MultTrait_t<RES,ST> >
4939 , DeclSymTrait< MultTrait_t<RES,ST> >
4942 , DeclDiagTrait< MultTrait_t<RES,ST> >
4943 , DeclLowTrait< MultTrait_t<RES,ST> > >
4945 , DeclUppTrait< MultTrait_t<RES,ST> >
4946 , MultTrait<RES,ST> > > > >::Type;
// SIMD vector type matching the element type of the expression.
4951 using SIMDType = SIMDTrait_t<ElementType>;
// The left operand of the scaled expression is the wrapped multiplication.
4956 using LeftOperand =
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
// Operand types used inside the kernels: an evaluated (const result type)
// copy when the operand requires evaluation, its composite type otherwise.
4962 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4965 using RT = If_t< evaluateRight, const RT2, CT2 >;
// NOTE(review): initializer of a compile-time flag whose declaration line is
// not visible here; presumably the SIMD-enabled flag of the expression.
4971 ( !IsDiagonal_v<MT2> &&
4972 MT1::simdEnabled && MT2::simdEnabled &&
4973 IsSIMDCombinable_v<ET1,ET2,ST> &&
4974 HasSIMDAdd_v<ET1,ET2> &&
4975 HasSIMDMult_v<ET1,ET2> );
// NOTE(review): initializer of a second flag whose declaration line is not
// visible; presumably the SMP-assignable flag. It is true only if neither
// operand needs evaluation and both operand types are SMP assignable.
4979 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
// Fragment of a checked element access: the column index j is compared
// against matrix_.columns() and, past the check, the element is obtained via
// the expression's own function call operator.
// NOTE(review): the function signature, the row check and the action taken on
// an invalid index are not visible in this excerpt.
5025 if( j >=
matrix_.columns() ) {
5028 return (*
this)(i,j);
// Const accessor returning a size_t.
// NOTE(review): the body is not visible in this excerpt; presumably it
// reports the number of rows of the wrapped expression - confirm.
5037 inline size_t rows()
const {
// Const accessor returning a size_t.
// NOTE(review): the body is not visible in this excerpt; presumably it
// reports the number of columns of the wrapped expression - confirm.
5047 inline size_t columns()
const {
// Reports whether the expression can alias with the object at the given
// address; the query is forwarded unchanged to the wrapped multiplication
// expression matrix_.
5078 template<
typename T >
5079 inline bool canAlias(
const T* alias )
const {
5080 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the object at the given
// address; the query is forwarded unchanged to the wrapped multiplication
// expression matrix_.
5090 template<
typename T >
5091 inline bool isAliased(
const T* alias )
const {
5092 return matrix_.isAliased( alias );
// Fragment of a boolean expression that combines the BLAS configuration with
// two size thresholds: the element count rows()*columns() must be below
// DMATDMATMULT_THRESHOLD (unless one of the preceding BLAS-related
// alternatives already holds) and at least SMP_DMATDMATMULT_THRESHOLD.
// NOTE(review): the function signature and some alternatives of the first
// disjunction are not visible here; presumably this is the predicate that
// decides on SMP assignment - confirm.
5113 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5115 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
5116 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
// Overload disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
// NOTE(review): the line naming this function is not visible in this excerpt;
// presumably the assignment of the scaled product to a dense matrix.
// The two operands of the wrapped multiplication are fetched, empty targets
// and an empty inner dimension are handled separately (actions not visible),
// and the work is finally dispatched to selectAssignKernel() together with
// the scalar. A and B are defined on lines that are not visible here.
5138 template<
typename MT
5141 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5148 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5149 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5151 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
5154 else if( left.columns() == 0UL ) {
5169 DMatScalarMultExpr::selectAssignKernel( *lhs, A, B, rhs.scalar_ );
// Chooses the kernel for C = scalar * A * B.
// The small kernel is used when B is diagonal, when (outside debug mode) B
// has at most SIMDSIZE*10 columns, or when the target has fewer than
// DMATDMATMULT_THRESHOLD elements; otherwise the BLAS kernel selection is
// used.
// NOTE(review): the line between the two calls (presumably "else") is not
// visible in this excerpt.
5184 template<
typename MT3
5188 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5190 if( ( IsDiagonal_v<MT5> ) ||
5191 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
5192 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5193 selectSmallAssignKernel( C, A, B, scalar );
5195 selectBlasAssignKernel( C, A, B, scalar );
// Default (non-vectorized) kernel for C = scalar * A * B, enabled when
// neither A nor B is diagonal.
// For every row i the inner index range [kbegin,kend) is narrowed according
// to the triangular shape of A, and the column range [jbegin,jend) according
// to the shape of B and the SYM/HERM/LOW/UPP flags of the expression.
// NOTE(review): several lines (else-branches, loop bodies, closing braces)
// are not visible in this excerpt.
5213 template<
typename MT3
5217 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5218 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5220 const size_t M( A.rows() );
5221 const size_t N( B.columns() );
5222 const size_t K( A.columns() );
5226 for(
size_t i=0UL; i<M; ++i )
5228 const size_t kbegin( ( IsUpper_v<MT4> )
5229 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5231 const size_t kend( ( IsLower_v<MT4> )
5232 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// A strictly triangular A can leave a row without any contributing k; the
// whole row of C is then visited (loop body not visible in this excerpt).
5236 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
5237 for(
size_t j=0UL; j<N; ++j ) {
// First inner index (k == kbegin): the row of C is initialised by plain
// assignment instead of accumulation.
5244 const size_t jbegin( ( IsUpper_v<MT5> )
5245 ?( ( IsStrictlyUpper_v<MT5> )
5246 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
5247 :( UPP ?
max(i,kbegin) : kbegin ) )
5248 :( UPP ? i : 0UL ) );
5249 const size_t jend( ( IsLower_v<MT5> )
5250 ?( ( IsStrictlyLower_v<MT5> )
5251 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
5252 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
5253 :( LOW ? i+1UL : N ) );
5255 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5256 for(
size_t j=0UL; j<jbegin; ++j ) {
5260 else if( IsStrictlyUpper_v<MT5> ) {
5263 for(
size_t j=jbegin; j<jend; ++j ) {
5264 C(i,j) = A(i,kbegin) * B(kbegin,j);
5266 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5267 for(
size_t j=jend; j<N; ++j ) {
5271 else if( IsStrictlyLower_v<MT5> ) {
5272 reset( C(i,N-1UL) );
// Remaining inner indices: accumulate A(i,k)*B(k,j) into the row.
5276 for(
size_t k=kbegin+1UL; k<kend; ++k )
5278 const size_t jbegin( ( IsUpper_v<MT5> )
5279 ?( ( IsStrictlyUpper_v<MT5> )
5280 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
5281 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
5282 :( SYM || HERM || UPP ? i : 0UL ) );
5283 const size_t jend( ( IsLower_v<MT5> )
5284 ?( ( IsStrictlyLower_v<MT5> )
5285 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
5286 :( LOW ?
min(i+1UL,k) : k ) )
5287 :( LOW ? i+1UL : N ) );
// With a structured result the restricted column range may be empty.
5289 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5292 for(
size_t j=jbegin; j<jend; ++j ) {
5293 C(i,j) += A(i,k) * B(k,j);
// For a lower B the element at column jend is touched for the first time in
// this k iteration and is therefore assigned rather than accumulated.
5295 if( IsLower_v<MT5> ) {
5296 C(i,jend) = A(i,k) * B(k,jend);
// Final pass over the computed part of row i.
// NOTE(review): the loop body is not visible in this excerpt; presumably the
// scalar factor is applied here - confirm.
5301 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5302 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
5303 :( SYM || HERM || UPP ? i : 0UL ) );
5304 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
5305 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
5306 :( LOW ? i+1UL : N ) );
5308 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5311 for(
size_t j=jbegin; j<jend; ++j ) {
// Mirror step: every element below the diagonal is copied from its
// transposed position (conjugated when HERM).
// NOTE(review): the condition guarding this step is not visible here.
5318 for(
size_t i=1UL; i<M; ++i ) {
5319 for(
size_t j=0UL; j<i; ++j ) {
5320 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for C = scalar * A * B with a general A and a diagonal B.
// Only the diagonal of B contributes, so each element reduces to
// A(i,j) * B(j,j) * scalar. The column range [jbegin,jend) per row follows
// the triangular shape of A.
// NOTE(review): the bodies of the two loops outside that range are not
// visible in this excerpt; presumably they reset the elements of C.
5341 template<
typename MT3
5345 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5346 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5350 const size_t M( A.rows() );
5351 const size_t N( B.columns() );
5353 for(
size_t i=0UL; i<M; ++i )
5355 const size_t jbegin( ( IsUpper_v<MT4> )
5356 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5358 const size_t jend( ( IsLower_v<MT4> )
5359 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
5363 if( IsUpper_v<MT4> ) {
5364 for(
size_t j=0UL; j<jbegin; ++j ) {
5368 for(
size_t j=jbegin; j<jend; ++j ) {
5369 C(i,j) = A(i,j) * B(j,j) * scalar;
5371 if( IsLower_v<MT4> ) {
5372 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = scalar * A * B with a diagonal A and a general B.
// Only the diagonal of A contributes, so each element reduces to
// A(i,i) * B(i,j) * scalar. The column range [jbegin,jend) per row follows
// the triangular shape of B.
// NOTE(review): the bodies of the two loops outside that range are not
// visible in this excerpt; presumably they reset the elements of C.
5394 template<
typename MT3
5398 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5399 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5403 const size_t M( A.rows() );
5404 const size_t N( B.columns() );
5406 for(
size_t i=0UL; i<M; ++i )
5408 const size_t jbegin( ( IsUpper_v<MT5> )
5409 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5411 const size_t jend( ( IsLower_v<MT5> )
5412 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
5416 if( IsUpper_v<MT5> ) {
5417 for(
size_t j=0UL; j<jbegin; ++j ) {
5421 for(
size_t j=jbegin; j<jend; ++j ) {
5422 C(i,j) = A(i,i) * B(i,j) * scalar;
5424 if( IsLower_v<MT5> ) {
5425 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for C = scalar * A * B with both operands diagonal: only
// the diagonal of C is written, as the product of the two diagonal elements
// and the scalar.
// NOTE(review): any preparation of the off-diagonal elements of C happens on
// lines that are not visible in this excerpt.
5447 template<
typename MT3
5451 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5452 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5458 for(
size_t i=0UL; i<A.rows(); ++i ) {
5459 C(i,i) = A(i,i) * B(i,i) * scalar;
// Fallback overload of the small-matrix kernel: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false and forwarding to
// the default kernel with unchanged arguments.
5478 template<
typename MT3
5482 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5483 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5485 selectDefaultAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for C = scalar * A * B, enabled for
// row-major targets when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
// The columns of C are processed in blocks of 8, 5, 4, 3, 2 and 1 SIMD
// vectors (see the j increments below); within a block, several rows of C are
// computed at once where possible. For each tile the inner range
// [kbegin,kend) is narrowed according to the triangular shape of A and B, the
// products are accumulated in SIMD registers and the result is multiplied by
// the broadcast scalar when it is stored.
// NOTE(review): this excerpt omits many lines of the original function
// (declarations of i, j, k, jpos, jjend, several loop headers, conditions and
// all closing braces); the comments below describe only what is visible.
5504 template<
typename MT3
5508 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5509 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is only needed if C or B is not padded.
5511 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5513 const size_t M( A.rows() );
5514 const size_t N( B.columns() );
5515 const size_t K( A.columns() );
// The scalar is broadcast once into a SIMD vector and applied at store time.
5522 const SIMDType factor(
set( scalar ) );
// Integral element types only: column blocks of 8 SIMD vectors, one row at a
// time. The block is skipped entirely for structured (SYM/HERM/LOW/UPP)
// results.
5526 if( IsIntegral_v<ElementType> )
5528 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*7UL) < jpos; j+=
SIMDSIZE*8UL ) {
5529 for(
size_t i=0UL; i<M; ++i )
5531 const size_t kbegin( ( IsUpper_v<MT4> )
5532 ?( ( IsLower_v<MT5> )
5533 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5534 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5535 :( IsLower_v<MT5> ? j : 0UL ) );
5536 const size_t kend( ( IsLower_v<MT4> )
5537 ?( ( IsUpper_v<MT5> )
5538 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
5539 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5540 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// The first k iteration initialises the accumulators, the loop adds the rest.
5546 SIMDType a1(
set( A(i,k) ) );
5547 SIMDType xmm1( a1 * B.load(k,j ) );
5548 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
5549 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
5550 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
5551 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
5552 SIMDType xmm6( a1 * B.load(k,j+
SIMDSIZE*5UL) );
5553 SIMDType xmm7( a1 * B.load(k,j+
SIMDSIZE*6UL) );
5554 SIMDType xmm8( a1 * B.load(k,j+
SIMDSIZE*7UL) );
5556 for( ++k; k<kend; ++k ) {
5558 xmm1 += a1 * B.load(k,j );
5559 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5560 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5561 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5562 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
5563 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
5564 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
5565 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
5568 C.store( i, j , xmm1 * factor );
5569 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5570 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5571 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5572 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
5573 C.store( i, j+
SIMDSIZE*5UL, xmm6 * factor );
5574 C.store( i, j+
SIMDSIZE*6UL, xmm7 * factor );
5575 C.store( i, j+
SIMDSIZE*7UL, xmm8 * factor );
// Alternative branch: a default-constructed SIMD vector is stored instead.
// NOTE(review): the branch condition is not visible; presumably this handles
// an empty k range - confirm.
5579 const SIMDType
zero;
5580 C.store( i, j ,
zero );
// Column blocks of 5 SIMD vectors (unstructured results only): two rows at a
// time, followed by a single-row tail.
5593 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*4UL) < jpos; j+=
SIMDSIZE*5UL )
5597 for( ; (i+2UL) <= M; i+=2UL )
5599 const size_t kbegin( ( IsUpper_v<MT4> )
5600 ?( ( IsLower_v<MT5> )
5601 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5602 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5603 :( IsLower_v<MT5> ? j : 0UL ) );
5604 const size_t kend( ( IsLower_v<MT4> )
5605 ?( ( IsUpper_v<MT5> )
5606 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
5607 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5608 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
5614 SIMDType a1(
set( A(i ,k) ) );
5615 SIMDType a2(
set( A(i+1UL,k) ) );
5616 SIMDType b1( B.load(k,j ) );
5617 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5618 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5619 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
5620 SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
5621 SIMDType xmm1 ( a1 * b1 );
5622 SIMDType xmm2 ( a1 * b2 );
5623 SIMDType xmm3 ( a1 * b3 );
5624 SIMDType xmm4 ( a1 * b4 );
5625 SIMDType xmm5 ( a1 * b5 );
5626 SIMDType xmm6 ( a2 * b1 );
5627 SIMDType xmm7 ( a2 * b2 );
5628 SIMDType xmm8 ( a2 * b3 );
5629 SIMDType xmm9 ( a2 * b4 );
5630 SIMDType xmm10( a2 * b5 );
5632 for( ++k; k<kend; ++k ) {
5633 a1 =
set( A(i ,k) );
5634 a2 =
set( A(i+1UL,k) );
5652 C.store( i , j , xmm1 * factor );
5653 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5654 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5655 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
5656 C.store( i , j+
SIMDSIZE*4UL, xmm5 * factor );
5657 C.store( i+1UL, j , xmm6 * factor );
5658 C.store( i+1UL, j+
SIMDSIZE , xmm7 * factor );
5659 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 * factor );
5660 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 * factor );
5661 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 * factor );
5665 const SIMDType
zero;
5666 C.store( i , j ,
zero );
5671 C.store( i+1UL, j ,
zero );
// Single-row tail of the 5-vector block (enclosing condition not visible).
5681 const size_t kbegin( ( IsUpper_v<MT4> )
5682 ?( ( IsLower_v<MT5> )
5683 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5684 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5685 :( IsLower_v<MT5> ? j : 0UL ) );
5686 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
5692 SIMDType a1(
set( A(i,k) ) );
5693 SIMDType xmm1( a1 * B.load(k,j ) );
5694 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
5695 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
5696 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
5697 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
5699 for( ++k; k<kend; ++k ) {
5701 xmm1 += a1 * B.load(k,j );
5702 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5703 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5704 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5705 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
5708 C.store( i, j , xmm1 * factor );
5709 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5710 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5711 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5712 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
5716 const SIMDType
zero;
5717 C.store( i, j ,
zero );
// Column blocks of 4 SIMD vectors. For an upper result the row range is cut
// off at the end of the current column block.
5728 const size_t iend( UPP ?
min(j+
SIMDSIZE*4UL,M) : M );
// Rows that are not computed directly are filled per element: for SYM/HERM
// results from the transposed position (conjugated when HERM).
// NOTE(review): the conditions selecting these loops and the body of the
// second loop are not visible in this excerpt.
5734 for(
size_t jj=j; jj<jjend; ++jj ) {
5735 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
5742 for(
size_t jj=j; jj<jjend; ++jj ) {
5748 for( ; (i+2UL) <= iend; i+=2UL )
5750 const size_t kbegin( ( IsUpper_v<MT4> )
5751 ?( ( IsLower_v<MT5> )
5752 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5753 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5754 :( IsLower_v<MT5> ? j : 0UL ) );
5755 const size_t kend( ( IsLower_v<MT4> )
5756 ?( ( IsUpper_v<MT5> )
5757 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
5758 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5759 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
5765 SIMDType a1(
set( A(i ,k) ) );
5766 SIMDType a2(
set( A(i+1UL,k) ) );
5767 SIMDType b1( B.load(k,j ) );
5768 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5769 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5770 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
5771 SIMDType xmm1( a1 * b1 );
5772 SIMDType xmm2( a1 * b2 );
5773 SIMDType xmm3( a1 * b3 );
5774 SIMDType xmm4( a1 * b4 );
5775 SIMDType xmm5( a2 * b1 );
5776 SIMDType xmm6( a2 * b2 );
5777 SIMDType xmm7( a2 * b3 );
5778 SIMDType xmm8( a2 * b4 );
5780 for( ++k; k<kend; ++k ) {
5781 a1 =
set( A(i ,k) );
5782 a2 =
set( A(i+1UL,k) );
5797 C.store( i , j , xmm1 * factor );
5798 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5799 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5800 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
5801 C.store( i+1UL, j , xmm5 * factor );
5802 C.store( i+1UL, j+
SIMDSIZE , xmm6 * factor );
5803 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 * factor );
5804 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 * factor );
5808 const SIMDType
zero;
5809 C.store( i , j ,
zero );
5813 C.store( i+1UL, j ,
zero );
// Single-row tail of the 4-vector block.
5822 const size_t kbegin( ( IsUpper_v<MT4> )
5823 ?( ( IsLower_v<MT5> )
5824 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5825 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5826 :( IsLower_v<MT5> ? j : 0UL ) );
5827 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
5833 SIMDType a1(
set( A(i,k) ) );
5834 SIMDType xmm1( a1 * B.load(k,j ) );
5835 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
5836 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
5837 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
5839 for( ++k; k<kend; ++k ) {
5841 xmm1 += a1 * B.load(k,j );
5842 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5843 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5844 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5847 C.store( i, j , xmm1 * factor );
5848 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5849 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5850 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5854 const SIMDType
zero;
5855 C.store( i, j ,
zero );
5867 for(
size_t jj=j; jj<jjend; ++jj ) {
// Column blocks of 3 SIMD vectors; same scheme as the 4-vector block.
5876 const size_t iend( UPP ?
min(j+
SIMDSIZE*3UL,M) : M );
5882 for(
size_t jj=j; jj<jjend; ++jj ) {
5883 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
5890 for(
size_t jj=j; jj<jjend; ++jj ) {
5896 for( ; (i+2UL) <= iend; i+=2UL )
5898 const size_t kbegin( ( IsUpper_v<MT4> )
5899 ?( ( IsLower_v<MT5> )
5900 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5901 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5902 :( IsLower_v<MT5> ? j : 0UL ) );
5903 const size_t kend( ( IsLower_v<MT4> )
5904 ?( ( IsUpper_v<MT5> )
5905 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
5906 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5907 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
5913 SIMDType a1(
set( A(i ,k) ) );
5914 SIMDType a2(
set( A(i+1UL,k) ) );
5915 SIMDType b1( B.load(k,j ) );
5916 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5917 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5918 SIMDType xmm1( a1 * b1 );
5919 SIMDType xmm2( a1 * b2 );
5920 SIMDType xmm3( a1 * b3 );
5921 SIMDType xmm4( a2 * b1 );
5922 SIMDType xmm5( a2 * b2 );
5923 SIMDType xmm6( a2 * b3 );
5925 for( ++k; k<kend; ++k ) {
5926 a1 =
set( A(i ,k) );
5927 a2 =
set( A(i+1UL,k) );
5939 C.store( i , j , xmm1 * factor );
5940 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5941 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5942 C.store( i+1UL, j , xmm4 * factor );
5943 C.store( i+1UL, j+
SIMDSIZE , xmm5 * factor );
5944 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 * factor );
5948 const SIMDType
zero;
5949 C.store( i , j ,
zero );
5952 C.store( i+1UL, j ,
zero );
// Single-row tail of the 3-vector block.
5960 const size_t kbegin( ( IsUpper_v<MT4> )
5961 ?( ( IsLower_v<MT5> )
5962 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5963 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5964 :( IsLower_v<MT5> ? j : 0UL ) );
5965 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
5971 SIMDType a1(
set( A(i,k) ) );
5972 SIMDType xmm1( a1 * B.load(k,j ) );
5973 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
5974 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
5976 for( ++k; k<kend; ++k ) {
5978 xmm1 += a1 * B.load(k,j );
5979 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5980 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5983 C.store( i, j , xmm1 * factor );
5984 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5985 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5989 const SIMDType
zero;
5990 C.store( i, j ,
zero );
6001 for(
size_t jj=j; jj<jjend; ++jj ) {
// Column blocks of 2 SIMD vectors; rows are processed four, three and two at
// a time, followed by a single-row tail.
6010 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
6016 for(
size_t jj=j; jj<jjend; ++jj ) {
6017 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
6024 for(
size_t jj=j; jj<jjend; ++jj ) {
6030 for( ; (i+4UL) <= iend; i+=4UL )
6032 const size_t kbegin( ( IsUpper_v<MT4> )
6033 ?( ( IsLower_v<MT5> )
6034 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6035 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6036 :( IsLower_v<MT5> ? j : 0UL ) );
6037 const size_t kend( ( IsLower_v<MT4> )
6038 ?( ( IsUpper_v<MT5> )
6039 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
6040 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
6041 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6047 SIMDType a1(
set( A(i ,k) ) );
6048 SIMDType a2(
set( A(i+1UL,k) ) );
6049 SIMDType a3(
set( A(i+2UL,k) ) );
6050 SIMDType a4(
set( A(i+3UL,k) ) );
6051 SIMDType b1( B.load(k,j ) );
6052 SIMDType b2( B.load(k,j+
SIMDSIZE) );
6053 SIMDType xmm1( a1 * b1 );
6054 SIMDType xmm2( a1 * b2 );
6055 SIMDType xmm3( a2 * b1 );
6056 SIMDType xmm4( a2 * b2 );
6057 SIMDType xmm5( a3 * b1 );
6058 SIMDType xmm6( a3 * b2 );
6059 SIMDType xmm7( a4 * b1 );
6060 SIMDType xmm8( a4 * b2 );
6062 for( ++k; k<kend; ++k ) {
6063 a1 =
set( A(i ,k) );
6064 a2 =
set( A(i+1UL,k) );
6065 a3 =
set( A(i+2UL,k) );
6066 a4 =
set( A(i+3UL,k) );
6079 C.store( i , j , xmm1 * factor );
6080 C.store( i , j+
SIMDSIZE, xmm2 * factor );
6081 C.store( i+1UL, j , xmm3 * factor );
6082 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
6083 C.store( i+2UL, j , xmm5 * factor );
6084 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
6085 C.store( i+3UL, j , xmm7 * factor );
6086 C.store( i+3UL, j+
SIMDSIZE, xmm8 * factor );
6090 const SIMDType
zero;
6091 C.store( i , j ,
zero );
6093 C.store( i+1UL, j ,
zero );
6095 C.store( i+2UL, j ,
zero );
6097 C.store( i+3UL, j ,
zero );
6102 for( ; (i+3UL) <= iend; i+=3UL )
6104 const size_t kbegin( ( IsUpper_v<MT4> )
6105 ?( ( IsLower_v<MT5> )
6106 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6107 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6108 :( IsLower_v<MT5> ? j : 0UL ) );
6109 const size_t kend( ( IsLower_v<MT4> )
6110 ?( ( IsUpper_v<MT5> )
6111 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
6112 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
6113 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6119 SIMDType a1(
set( A(i ,k) ) );
6120 SIMDType a2(
set( A(i+1UL,k) ) );
6121 SIMDType a3(
set( A(i+2UL,k) ) );
6122 SIMDType b1( B.load(k,j ) );
6123 SIMDType b2( B.load(k,j+
SIMDSIZE) );
6124 SIMDType xmm1( a1 * b1 );
6125 SIMDType xmm2( a1 * b2 );
6126 SIMDType xmm3( a2 * b1 );
6127 SIMDType xmm4( a2 * b2 );
6128 SIMDType xmm5( a3 * b1 );
6129 SIMDType xmm6( a3 * b2 );
6131 for( ++k; k<kend; ++k ) {
6132 a1 =
set( A(i ,k) );
6133 a2 =
set( A(i+1UL,k) );
6134 a3 =
set( A(i+2UL,k) );
6145 C.store( i , j , xmm1 * factor );
6146 C.store( i , j+
SIMDSIZE, xmm2 * factor );
6147 C.store( i+1UL, j , xmm3 * factor );
6148 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
6149 C.store( i+2UL, j , xmm5 * factor );
6150 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
6154 const SIMDType
zero;
6155 C.store( i , j ,
zero );
6157 C.store( i+1UL, j ,
zero );
6159 C.store( i+2UL, j ,
zero );
6164 for( ; (i+2UL) <= iend; i+=2UL )
6166 const size_t kbegin( ( IsUpper_v<MT4> )
6167 ?( ( IsLower_v<MT5> )
6168 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6169 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6170 :( IsLower_v<MT5> ? j : 0UL ) );
6171 const size_t kend( ( IsLower_v<MT4> )
6172 ?( ( IsUpper_v<MT5> )
6173 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
6174 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6175 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6181 SIMDType a1(
set( A(i ,k) ) );
6182 SIMDType a2(
set( A(i+1UL,k) ) );
6183 SIMDType b1( B.load(k,j ) );
6184 SIMDType b2( B.load(k,j+
SIMDSIZE) );
6185 SIMDType xmm1( a1 * b1 );
6186 SIMDType xmm2( a1 * b2 );
6187 SIMDType xmm3( a2 * b1 );
6188 SIMDType xmm4( a2 * b2 );
6190 for( ++k; k<kend; ++k ) {
6191 a1 =
set( A(i ,k) );
6192 a2 =
set( A(i+1UL,k) );
6201 C.store( i , j , xmm1 * factor );
6202 C.store( i , j+
SIMDSIZE, xmm2 * factor );
6203 C.store( i+1UL, j , xmm3 * factor );
6204 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
6208 const SIMDType
zero;
6209 C.store( i , j ,
zero );
6211 C.store( i+1UL, j ,
zero );
// Single-row tail of the 2-vector block.
6218 const size_t kbegin( ( IsUpper_v<MT4> )
6219 ?( ( IsLower_v<MT5> )
6220 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6221 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6222 :( IsLower_v<MT5> ? j : 0UL ) );
6223 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
6229 SIMDType a1(
set( A(i,k) ) );
6230 SIMDType xmm1( a1 * B.load(k,j ) );
6231 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE) );
6233 for( ++k; k<kend; ++k ) {
6235 xmm1 += a1 * B.load(k,j );
6239 C.store( i, j , xmm1 * factor );
6240 C.store( i, j+
SIMDSIZE, xmm2 * factor );
6244 const SIMDType
zero;
6245 C.store( i, j ,
zero );
6255 for(
size_t jj=j; jj<jjend; ++jj ) {
// Column blocks of a single SIMD vector (each store below writes one vector
// per row); rows are again processed four, three, two and one at a time.
// NOTE(review): the header of the enclosing column loop is not visible.
6270 for(
size_t jj=j; jj<jjend; ++jj ) {
6271 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
6278 for(
size_t jj=j; jj<jjend; ++jj ) {
6284 for( ; (i+4UL) <= iend; i+=4UL )
6286 const size_t kbegin( ( IsUpper_v<MT4> )
6287 ?( ( IsLower_v<MT5> )
6288 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6289 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6290 :( IsLower_v<MT5> ? j : 0UL ) );
6291 const size_t kend( ( IsLower_v<MT4> )
6292 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
6299 SIMDType b1( B.load(k,j) );
6300 SIMDType xmm1(
set( A(i ,k) ) * b1 );
6301 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
6302 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
6303 SIMDType xmm4(
set( A(i+3UL,k) ) * b1 );
6305 for( ++k; k<kend; ++k ) {
6307 xmm1 +=
set( A(i ,k) ) * b1;
6308 xmm2 +=
set( A(i+1UL,k) ) * b1;
6309 xmm3 +=
set( A(i+2UL,k) ) * b1;
6310 xmm4 +=
set( A(i+3UL,k) ) * b1;
6313 C.store( i , j, xmm1 * factor );
6314 C.store( i+1UL, j, xmm2 * factor );
6315 C.store( i+2UL, j, xmm3 * factor );
6316 C.store( i+3UL, j, xmm4 * factor );
6320 const SIMDType
zero;
6321 C.store( i , j,
zero );
6322 C.store( i+1UL, j,
zero );
6323 C.store( i+2UL, j,
zero );
6324 C.store( i+3UL, j,
zero );
6328 for( ; (i+3UL) <= iend; i+=3UL )
6330 const size_t kbegin( ( IsUpper_v<MT4> )
6331 ?( ( IsLower_v<MT5> )
6332 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6333 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6334 :( IsLower_v<MT5> ? j : 0UL ) );
6335 const size_t kend( ( IsLower_v<MT4> )
6336 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
6343 SIMDType b1( B.load(k,j) );
6344 SIMDType xmm1(
set( A(i ,k) ) * b1 );
6345 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
6346 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
6348 for( ++k; k<kend; ++k ) {
6350 xmm1 +=
set( A(i ,k) ) * b1;
6351 xmm2 +=
set( A(i+1UL,k) ) * b1;
6352 xmm3 +=
set( A(i+2UL,k) ) * b1;
6355 C.store( i , j, xmm1 * factor );
6356 C.store( i+1UL, j, xmm2 * factor );
6357 C.store( i+2UL, j, xmm3 * factor );
6361 const SIMDType
zero;
6362 C.store( i , j,
zero );
6363 C.store( i+1UL, j,
zero );
6364 C.store( i+2UL, j,
zero );
6368 for( ; (i+2UL) <= iend; i+=2UL )
6370 const size_t kbegin( ( IsUpper_v<MT4> )
6371 ?( ( IsLower_v<MT5> )
6372 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6373 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6374 :( IsLower_v<MT5> ? j : 0UL ) );
6375 const size_t kend( ( IsLower_v<MT4> )
6376 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6383 SIMDType b1( B.load(k,j) );
6384 SIMDType xmm1(
set( A(i ,k) ) * b1 );
6385 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
6387 for( ++k; k<kend; ++k ) {
6389 xmm1 +=
set( A(i ,k) ) * b1;
6390 xmm2 +=
set( A(i+1UL,k) ) * b1;
6393 C.store( i , j, xmm1 * factor );
6394 C.store( i+1UL, j, xmm2 * factor );
6398 const SIMDType
zero;
6399 C.store( i , j,
zero );
6400 C.store( i+1UL, j,
zero );
// Single-row tail of the 1-vector block; here the inner loop runs up to K.
6406 const size_t kbegin( ( IsUpper_v<MT4> )
6407 ?( ( IsLower_v<MT5> )
6408 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6409 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6410 :( IsLower_v<MT5> ? j : 0UL ) );
6416 SIMDType xmm1(
set( A(i,k) ) * B.load(k,j) );
6418 for( ++k; k<K; ++k ) {
6419 xmm1 +=
set( A(i,k) ) * B.load(k,j);
6422 C.store( i, j, xmm1 * factor );
6426 const SIMDType
zero;
6427 C.store( i, j,
zero );
6436 for(
size_t jj=j; jj<jjend; ++jj ) {
// Scalar remainder loop over the columns that did not fit into a full SIMD
// vector; it only runs when the remainder flag is set. Elements are computed
// with plain scalar arithmetic, two rows at a time plus a single-row tail,
// and multiplied by the scalar on assignment.
6443 for( ; remainder && j<N; ++j )
6449 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
6458 for( ; (i+2UL) <= M; i+=2UL )
6460 const size_t kbegin( ( IsUpper_v<MT4> )
6461 ?( ( IsLower_v<MT5> )
6462 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6463 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6464 :( IsLower_v<MT5> ? j : 0UL ) );
6465 const size_t kend( ( IsLower_v<MT4> )
6466 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6476 for( ++k; k<kend; ++k ) {
6477 value1 += A(i ,k) * B(k,j);
6478 value2 += A(i+1UL,k) * B(k,j);
6481 C(i ,j) = value1 * scalar;
6482 C(i+1UL,j) = value2 * scalar;
6487 reset( C(i+1UL,j) );
6493 const size_t kbegin( ( IsUpper_v<MT4> )
6494 ?( ( IsLower_v<MT5> )
6495 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6496 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6497 :( IsLower_v<MT5> ? j : 0UL ) );
6505 for( ++k; k<K; ++k ) {
6506 value += A(i,k) * B(k,j);
6509 C(i,j) = value * scalar;
// Small-matrix kernel for column-major targets, enabled together with
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>.
// Instead of computing the product directly, one operand is converted to its
// opposite storage order and the scaled product is re-dispatched through
// assign():
//  - if exactly one operand type is resizable, the non-resizable one is
//    converted;
//  - otherwise the operand with fewer elements is converted (A on a tie).
// The conversion is evaluated via serial().
// NOTE(review): the "else" introducing the last branch is not visible in
// this excerpt.
6535 template<
typename MT3
6539 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6540 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6547 const ForwardFunctor fwd;
6549 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6550 const OppositeType_t<MT4> tmp(
serial( A ) );
6551 assign( C, fwd( tmp * B ) * scalar );
6553 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6554 const OppositeType_t<MT5> tmp(
serial( B ) );
6555 assign( C, fwd( A * tmp ) * scalar );
6557 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6558 const OppositeType_t<MT4> tmp(
serial( A ) );
6559 assign( C, fwd( tmp * B ) * scalar );
6562 const OppositeType_t<MT5> tmp(
serial( B ) );
6563 assign( C, fwd( A * tmp ) * scalar );
// Fallback overload of the large-matrix kernel: selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false and forwarding to
// the default kernel with unchanged arguments.
6582 template<
typename MT3
6586 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6587 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6589 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix kernel enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. It calls exactly one
// of smmm(), hmmm(), lmmm(), ummm() or mmm() with the scalar as factor; the
// latter three additionally receive ST2(0).
// NOTE(review): the conditions selecting between the five calls are not
// visible in this excerpt; presumably SYM, HERM, LOW and UPP in that order.
// ST2(0) is presumably the factor applied to the previous content of C.
6608 template<
typename MT3
6612 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6613 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6616 smmm( C, A, B, scalar );
6618 hmmm( C, A, B, scalar );
6620 lmmm( C, A, B, scalar, ST2(0) );
6622 ummm( C, A, B, scalar, ST2(0) );
6624 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel, fallback overload: enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false. It forwards all arguments to
// selectLargeAssignKernel().
//
// \param C      Target matrix receiving the result.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
6642 template<
typename MT3
6646 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6647 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6649 selectLargeAssignKernel( C, A, B, scalar );
6654#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS assignment kernel, BLAS overload: enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. The scalar is converted to the element
// type of the target (ET) before it is passed to the BLAS wrappers.
//
// \param C      Target matrix receiving the result.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
6668 template<
typename MT3
6672 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6673 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6675 using ET = ElementType_t<MT3>;
// A triangular: triangular multiply from the left; lower/upper picked by IsLower_v<MT4>.
// NOTE(review): trmm works in place on C, so C presumably has to hold B beforehand - confirm.
6677 if( IsTriangular_v<MT4> ) {
6679 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// B triangular: triangular multiply from the right; lower/upper picked by IsLower_v<MT5>.
6681 else if( IsTriangular_v<MT5> ) {
6683 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm with ET(scalar) and ET(0)
// (presumably alpha and beta, i.e. C is overwritten - TODO confirm).
6686 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment through a temporary, enabled when CanExploitSymmetry_v<MT,MT1,MT2> is
// false. The expression `rhs` is evaluated serially into a temporary of type TmpType,
// which is then passed, wrapped in the ForwardFunctor, to assign() on *lhs.
// NOTE(review): the function name and parameter list are not on the lines shown here;
// from the body this looks like an assign() overload taking `lhs` and `rhs` - confirm.
6704 template<
typename MT
6707 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Temporary type: If_t picks between OppositeType and ResultType based on the flag SO.
6711 using TmpType = If_t< SO, OppositeType, ResultType >;
6723 const ForwardFunctor fwd;
6725 const TmpType tmp(
serial( rhs ) );
6726 assign( *lhs, fwd( tmp ) );
// Restructuring assignment, enabled when CanExploitSymmetry_v<MT,MT1,MT2> is true.
// Each operand of the wrapped product is passed through transIf<>, keyed on whether
// its type is symmetric, and the rebuilt product is assigned to *lhs scaled by
// rhs.scalar_.
// NOTE(review): the function name and parameter list are not on the lines shown here;
// from the body this looks like an assign() overload taking `lhs` and `rhs` - confirm.
6744 template<
typename MT >
6746 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6755 const ForwardFunctor fwd;
// decltype(auto) keeps whatever value category/type transIf<>() returns.
6757 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
6758 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
6760 assign( *lhs, fwd( A * B ) * rhs.scalar_ );
// Addition assignment of a scaled dense matrix product to a dense matrix
// (lhs += scalar * A * B), enabled when CanExploitSymmetry_v<MT,MT1,MT2> is false.
//
// \param lhs Target dense matrix.
// \param rhs Scaled multiplication expression to be added.
6776 template<
typename MT
6778 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6779 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Fetch the two operands of the wrapped matrix product.
6786 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6787 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: target without rows/columns, or zero inner dimension.
// NOTE(review): presumably an early return, since nothing would be added - confirm.
6789 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// Hand the operands A and B and the scalar to the kernel selection.
6803 DMatScalarMultExpr::selectAddAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the addition assignment C += scalar * A * B.
//
// The small kernel is used if any of the following holds:
//  - B's type is diagonal,
//  - BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10 columns,
//  - C has fewer than DMATDMATMULT_THRESHOLD elements (rows * columns).
// The BLAS dispatch is meant for the remaining cases.
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
6818 template<
typename MT3
6822 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6824 if( ( IsDiagonal_v<MT5> ) ||
6825 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
6826 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6827 selectSmallAddAssignKernel( C, A, B, scalar );
6829 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the general case: enabled when neither
// operand type is diagonal. The result is accumulated into C by addAssign() from a
// temporary `tmp`.
// NOTE(review): `tmp` presumably holds the evaluated scaled product of A and B -
// confirm against its definition.
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
6847 template<
typename MT3
6851 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6852 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6855 addAssign( C, tmp );
// Default addition assignment kernel for a general A and a diagonal B
// (enabled when MT4 is not diagonal and MT5 is). Because only B(j,j) is read,
// each target element reduces to C(i,j) += A(i,j) * B(j,j) * scalar.
//
// \param C      Target matrix.
// \param A      Left-hand side operand (non-diagonal type).
// \param B      Right-hand side operand (diagonal type).
// \param scalar Scaling factor applied to the product.
6873 template<
typename MT3
6877 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6878 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6882 const size_t M( A.rows() );
6883 const size_t N( B.columns() );
// Row-wise traversal of A.
6885 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower (strict) traits of A.
6887 const size_t jbegin( ( IsUpper_v<MT4> )
6888 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6890 const size_t jend( ( IsLower_v<MT4> )
6891 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part handled two columns at a time
// (prevMultiple( jnum, 2UL ) presumably rounds jnum down to an even count).
6895 const size_t jnum( jend - jbegin );
6896 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Two columns per iteration.
6899 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6900 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6901 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Leftover single column at jpos.
6904 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition assignment kernel for a diagonal A and a general B
// (enabled when MT4 is diagonal and MT5 is not). Because only A(i,i) is read,
// each target element reduces to C(i,j) += A(i,i) * B(i,j) * scalar.
//
// \param C      Target matrix.
// \param A      Left-hand side operand (diagonal type).
// \param B      Right-hand side operand (non-diagonal type).
// \param scalar Scaling factor applied to the product.
6924 template<
typename MT3
6928 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6929 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6933 const size_t M( A.rows() );
6934 const size_t N( B.columns() );
// Row-wise traversal; row i of the result only depends on row i of B.
6936 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower (strict) traits of B.
6938 const size_t jbegin( ( IsUpper_v<MT5> )
6939 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
6941 const size_t jend( ( IsLower_v<MT5> )
6942 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part handled two columns at a time
// (prevMultiple( jnum, 2UL ) presumably rounds jnum down to an even count).
6946 const size_t jnum( jend - jbegin );
6947 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Two columns per iteration.
6950 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6951 C(i,j ) += A(i,i) * B(i,j ) * scalar;
6952 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Leftover single column at jpos.
6955 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition assignment kernel for two diagonal operands (enabled when both
// MT4 and MT5 are diagonal). Only the diagonal of C is touched:
// C(i,i) += A(i,i) * B(i,i) * scalar for every row index i of A.
//
// \param C      Target matrix.
// \param A      Left-hand side operand (diagonal type).
// \param B      Right-hand side operand (diagonal type).
// \param scalar Scaling factor applied to the product.
6975 template<
typename MT3
6979 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6980 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6984 for(
size_t i=0UL; i<A.rows(); ++i ) {
6985 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel, fallback overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false. It forwards all arguments
// to selectDefaultAddAssignKernel().
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7004 template<
typename MT3
7008 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7009 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7011 selectDefaultAddAssignKernel( C, A, B, scalar );
// Small-matrix addition assignment kernel C += scalar * A * B, vectorized overload
// for row-major targets: enabled when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
//
// Layout of the kernel as far as the statements below show it:
//  - C is processed in column panels of 8, 5, 4, 3, 2 and 1 SIMD vectors
//    (offsets j, j+SIMDSIZE, ... in the B.load / C.load / C.store calls), followed by
//    a scalar loop over any remaining columns.
//  - Within a panel several rows of A are handled per iteration, with a narrower
//    variant for the rows left over.
//  - For each (rows, panel) tile the products A(i,k) * B(k,j..) are summed over k in
//    SIMD accumulators (xmm*), then written back as
//    C.store( ..., C.load( ... ) + xmm * factor ).
//  - kbegin / kend narrow the k range according to the upper/lower (strict) traits of
//    A (MT4) and B (MT5).
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7030 template<
typename MT3
7034 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7035 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both C and B are padded.
7037 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7039 const size_t M( A.rows() );
7040 const size_t N( B.columns() );
7041 const size_t K( A.columns() );
// SIMD value built from the scalar via set(); multiplied into every accumulator.
7048 const SIMDType factor(
set( scalar ) );
// --- Panel of 8 SIMD vectors: only taken for integral element types -----------------
7052 if( IsIntegral_v<ElementType> )
7055 for(
size_t i=0UL; i<M; ++i )
7057 const size_t kbegin( ( IsUpper_v<MT4> )
7058 ?( ( IsLower_v<MT5> )
7059 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7060 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7061 :( IsLower_v<MT5> ? j : 0UL ) );
7062 const size_t kend( ( IsLower_v<MT4> )
7063 ?( ( IsUpper_v<MT5> )
7064 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
7065 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7066 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// First k step initialises the eight accumulators; the loop below adds the rest.
7072 SIMDType a1(
set( A(i,k) ) );
7073 SIMDType xmm1( a1 * B.load(k,j ) );
7074 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
7075 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
7076 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
7077 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
7078 SIMDType xmm6( a1 * B.load(k,j+
SIMDSIZE*5UL) );
7079 SIMDType xmm7( a1 * B.load(k,j+
SIMDSIZE*6UL) );
7080 SIMDType xmm8( a1 * B.load(k,j+
SIMDSIZE*7UL) );
7082 for( ++k; k<kend; ++k ) {
7084 xmm1 += a1 * B.load(k,j );
7085 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7086 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7087 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7088 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7089 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
7090 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
7091 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Accumulate into C: existing value plus scaled sum.
7094 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Panel of 5 SIMD vectors, two rows of A per iteration ---------------------------
7111 for( ; (i+2UL) <= M; i+=2UL )
7113 const size_t kbegin( ( IsUpper_v<MT4> )
7114 ?( ( IsLower_v<MT5> )
7115 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7116 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7117 :( IsLower_v<MT5> ? j : 0UL ) );
7118 const size_t kend( ( IsLower_v<MT4> )
7119 ?( ( IsUpper_v<MT5> )
7120 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
7121 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7122 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// xmm1-xmm5 belong to row i, xmm6-xmm10 to row i+1; b1-b5 are shared by both rows.
7128 SIMDType a1(
set( A(i ,k) ) );
7129 SIMDType a2(
set( A(i+1UL,k) ) );
7130 SIMDType b1( B.load(k,j ) );
7131 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7132 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7133 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
7134 SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
7135 SIMDType xmm1 ( a1 * b1 );
7136 SIMDType xmm2 ( a1 * b2 );
7137 SIMDType xmm3 ( a1 * b3 );
7138 SIMDType xmm4 ( a1 * b4 );
7139 SIMDType xmm5 ( a1 * b5 );
7140 SIMDType xmm6 ( a2 * b1 );
7141 SIMDType xmm7 ( a2 * b2 );
7142 SIMDType xmm8 ( a2 * b3 );
7143 SIMDType xmm9 ( a2 * b4 );
7144 SIMDType xmm10( a2 * b5 );
7146 for( ++k; k<kend; ++k ) {
7147 a1 =
set( A(i ,k) );
7148 a2 =
set( A(i+1UL,k) );
7166 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7171 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
7173 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm8 * factor );
7174 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm9 * factor );
7175 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) + xmm10 * factor );
// Single-row variant of the 5-vector panel (row left over by the two-row loop).
7181 const size_t kbegin( ( IsUpper_v<MT4> )
7182 ?( ( IsLower_v<MT5> )
7183 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7184 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7185 :( IsLower_v<MT5> ? j : 0UL ) );
7186 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
7192 SIMDType a1(
set( A(i,k) ) );
7193 SIMDType xmm1( a1 * B.load(k,j ) );
7194 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
7195 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
7196 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
7197 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
7199 for( ++k; k<kend; ++k ) {
7201 xmm1 += a1 * B.load(k,j );
7202 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7203 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7204 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7205 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7208 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Panel of 4 SIMD vectors, two rows of A per iteration ---------------------------
7221 for( ; (i+2UL) <= M; i+=2UL )
7223 const size_t kbegin( ( IsUpper_v<MT4> )
7224 ?( ( IsLower_v<MT5> )
7225 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7226 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7227 :( IsLower_v<MT5> ? j : 0UL ) );
7228 const size_t kend( ( IsLower_v<MT4> )
7229 ?( ( IsUpper_v<MT5> )
7230 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
7231 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7232 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
// xmm1-xmm4 belong to row i, xmm5-xmm8 to row i+1.
7238 SIMDType a1(
set( A(i ,k) ) );
7239 SIMDType a2(
set( A(i+1UL,k) ) );
7240 SIMDType b1( B.load(k,j ) );
7241 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7242 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7243 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
7244 SIMDType xmm1( a1 * b1 );
7245 SIMDType xmm2( a1 * b2 );
7246 SIMDType xmm3( a1 * b3 );
7247 SIMDType xmm4( a1 * b4 );
7248 SIMDType xmm5( a2 * b1 );
7249 SIMDType xmm6( a2 * b2 );
7250 SIMDType xmm7( a2 * b3 );
7251 SIMDType xmm8( a2 * b4 );
7253 for( ++k; k<kend; ++k ) {
7254 a1 =
set( A(i ,k) );
7255 a2 =
set( A(i+1UL,k) );
7270 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7274 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
7276 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm7 * factor );
7277 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm8 * factor );
// Single-row variant of the 4-vector panel.
7283 const size_t kbegin( ( IsUpper_v<MT4> )
7284 ?( ( IsLower_v<MT5> )
7285 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7286 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7287 :( IsLower_v<MT5> ? j : 0UL ) );
7288 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
7294 SIMDType a1(
set( A(i,k) ) );
7295 SIMDType xmm1( a1 * B.load(k,j ) );
7296 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
7297 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
7298 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
7300 for( ++k; k<kend; ++k ) {
7302 xmm1 += a1 * B.load(k,j );
7303 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7304 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7305 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7308 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Panel of 3 SIMD vectors, two rows of A per iteration ---------------------------
7320 for( ; (i+2UL) <= M; i+=2UL )
7322 const size_t kbegin( ( IsUpper_v<MT4> )
7323 ?( ( IsLower_v<MT5> )
7324 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7325 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7326 :( IsLower_v<MT5> ? j : 0UL ) );
7327 const size_t kend( ( IsLower_v<MT4> )
7328 ?( ( IsUpper_v<MT5> )
7329 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
7330 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7331 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
// xmm1-xmm3 belong to row i, xmm4-xmm6 to row i+1.
7337 SIMDType a1(
set( A(i ,k) ) );
7338 SIMDType a2(
set( A(i+1UL,k) ) );
7339 SIMDType b1( B.load(k,j ) );
7340 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7341 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7342 SIMDType xmm1( a1 * b1 );
7343 SIMDType xmm2( a1 * b2 );
7344 SIMDType xmm3( a1 * b3 );
7345 SIMDType xmm4( a2 * b1 );
7346 SIMDType xmm5( a2 * b2 );
7347 SIMDType xmm6( a2 * b3 );
7349 for( ++k; k<kend; ++k ) {
7350 a1 =
set( A(i ,k) );
7351 a2 =
set( A(i+1UL,k) );
7363 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7366 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
7368 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm6 * factor );
// Single-row variant of the 3-vector panel.
7374 const size_t kbegin( ( IsUpper_v<MT4> )
7375 ?( ( IsLower_v<MT5> )
7376 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7377 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7378 :( IsLower_v<MT5> ? j : 0UL ) );
7379 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
7385 SIMDType a1(
set( A(i,k) ) );
7386 SIMDType xmm1( a1 * B.load(k,j ) );
7387 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
7388 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
7390 for( ++k; k<kend; ++k ) {
7392 xmm1 += a1 * B.load(k,j );
7393 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7394 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7397 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Panel of 2 SIMD vectors --------------------------------------------------------
// From here on the row range itself is restricted by the result structure:
// rows start at j when LOW is set and stop at min(j+SIMDSIZE*2,M) when UPP is set.
7406 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
7407 size_t i( LOW ? j : 0UL );
// Four rows of A per iteration (xmm1/2 row i, xmm3/4 row i+1, xmm5/6 row i+2, xmm7/8 row i+3).
7409 for( ; (i+4UL) <= iend; i+=4UL )
7411 const size_t kbegin( ( IsUpper_v<MT4> )
7412 ?( ( IsLower_v<MT5> )
7413 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7414 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7415 :( IsLower_v<MT5> ? j : 0UL ) );
7416 const size_t kend( ( IsLower_v<MT4> )
7417 ?( ( IsUpper_v<MT5> )
7418 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
7419 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
7420 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7426 SIMDType a1(
set( A(i ,k) ) );
7427 SIMDType a2(
set( A(i+1UL,k) ) );
7428 SIMDType a3(
set( A(i+2UL,k) ) );
7429 SIMDType a4(
set( A(i+3UL,k) ) );
7430 SIMDType b1( B.load(k,j ) );
7431 SIMDType b2( B.load(k,j+
SIMDSIZE) );
7432 SIMDType xmm1( a1 * b1 );
7433 SIMDType xmm2( a1 * b2 );
7434 SIMDType xmm3( a2 * b1 );
7435 SIMDType xmm4( a2 * b2 );
7436 SIMDType xmm5( a3 * b1 );
7437 SIMDType xmm6( a3 * b2 );
7438 SIMDType xmm7( a4 * b1 );
7439 SIMDType xmm8( a4 * b2 );
7441 for( ++k; k<kend; ++k ) {
7442 a1 =
set( A(i ,k) );
7443 a2 =
set( A(i+1UL,k) );
7444 a3 =
set( A(i+2UL,k) );
7445 a4 =
set( A(i+3UL,k) );
7458 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7460 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7462 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
7464 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
// Three rows of A per iteration.
7469 for( ; (i+3UL) <= iend; i+=3UL )
7471 const size_t kbegin( ( IsUpper_v<MT4> )
7472 ?( ( IsLower_v<MT5> )
7473 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7474 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7475 :( IsLower_v<MT5> ? j : 0UL ) );
7476 const size_t kend( ( IsLower_v<MT4> )
7477 ?( ( IsUpper_v<MT5> )
7478 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
7479 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
7480 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7486 SIMDType a1(
set( A(i ,k) ) );
7487 SIMDType a2(
set( A(i+1UL,k) ) );
7488 SIMDType a3(
set( A(i+2UL,k) ) );
7489 SIMDType b1( B.load(k,j ) );
7490 SIMDType b2( B.load(k,j+
SIMDSIZE) );
7491 SIMDType xmm1( a1 * b1 );
7492 SIMDType xmm2( a1 * b2 );
7493 SIMDType xmm3( a2 * b1 );
7494 SIMDType xmm4( a2 * b2 );
7495 SIMDType xmm5( a3 * b1 );
7496 SIMDType xmm6( a3 * b2 );
7498 for( ++k; k<kend; ++k ) {
7499 a1 =
set( A(i ,k) );
7500 a2 =
set( A(i+1UL,k) );
7501 a3 =
set( A(i+2UL,k) );
7512 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7514 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7516 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
// Two rows of A per iteration.
7521 for( ; (i+2UL) <= iend; i+=2UL )
7523 const size_t kbegin( ( IsUpper_v<MT4> )
7524 ?( ( IsLower_v<MT5> )
7525 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7526 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7527 :( IsLower_v<MT5> ? j : 0UL ) );
7528 const size_t kend( ( IsLower_v<MT4> )
7529 ?( ( IsUpper_v<MT5> )
7530 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
7531 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7532 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7538 SIMDType a1(
set( A(i ,k) ) );
7539 SIMDType a2(
set( A(i+1UL,k) ) );
7540 SIMDType b1( B.load(k,j ) );
7541 SIMDType b2( B.load(k,j+
SIMDSIZE) );
7542 SIMDType xmm1( a1 * b1 );
7543 SIMDType xmm2( a1 * b2 );
7544 SIMDType xmm3( a2 * b1 );
7545 SIMDType xmm4( a2 * b2 );
7547 for( ++k; k<kend; ++k ) {
7548 a1 =
set( A(i ,k) );
7549 a2 =
set( A(i+1UL,k) );
7558 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7560 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
// Single-row variant of the 2-vector panel.
7567 const size_t kbegin( ( IsUpper_v<MT4> )
7568 ?( ( IsLower_v<MT5> )
7569 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7570 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7571 :( IsLower_v<MT5> ? j : 0UL ) );
7572 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
7578 SIMDType a1(
set( A(i,k) ) );
7579 SIMDType xmm1( a1 * B.load(k,j ) );
7580 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE) );
7582 for( ++k; k<kend; ++k ) {
7584 xmm1 += a1 * B.load(k,j );
7588 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// --- Panel of 1 SIMD vector ---------------------------------------------------------
// Unlike the 2-vector panel, the upper row bound is only applied when both LOW and
// UPP are set.
7596 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
7597 size_t i( LOW ? j : 0UL );
// Four rows of A per iteration, one accumulator per row.
7599 for( ; (i+4UL) <= iend; i+=4UL )
7601 const size_t kbegin( ( IsUpper_v<MT4> )
7602 ?( ( IsLower_v<MT5> )
7603 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7604 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7605 :( IsLower_v<MT5> ? j : 0UL ) );
7606 const size_t kend( ( IsLower_v<MT4> )
7607 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
7614 SIMDType b1( B.load(k,j) );
7615 SIMDType xmm1(
set( A(i ,k) ) * b1 );
7616 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
7617 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
7618 SIMDType xmm4(
set( A(i+3UL,k) ) * b1 );
7620 for( ++k; k<kend; ++k ) {
7622 xmm1 +=
set( A(i ,k) ) * b1;
7623 xmm2 +=
set( A(i+1UL,k) ) * b1;
7624 xmm3 +=
set( A(i+2UL,k) ) * b1;
7625 xmm4 +=
set( A(i+3UL,k) ) * b1;
7628 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7629 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
7630 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
7631 C.store( i+3UL, j, C.load(i+3UL,j) + xmm4 * factor );
// Three rows of A per iteration.
7635 for( ; (i+3UL) <= iend; i+=3UL )
7637 const size_t kbegin( ( IsUpper_v<MT4> )
7638 ?( ( IsLower_v<MT5> )
7639 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7640 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7641 :( IsLower_v<MT5> ? j : 0UL ) );
7642 const size_t kend( ( IsLower_v<MT4> )
7643 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
7650 SIMDType b1( B.load(k,j) );
7651 SIMDType xmm1(
set( A(i ,k) ) * b1 );
7652 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
7653 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
7655 for( ++k; k<kend; ++k ) {
7657 xmm1 +=
set( A(i ,k) ) * b1;
7658 xmm2 +=
set( A(i+1UL,k) ) * b1;
7659 xmm3 +=
set( A(i+2UL,k) ) * b1;
7662 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7663 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
7664 C.store( i+2UL, j, C.load(i+2UL,j) + xmm3 * factor );
// Two rows of A per iteration.
7668 for( ; (i+2UL) <= iend; i+=2UL )
7670 const size_t kbegin( ( IsUpper_v<MT4> )
7671 ?( ( IsLower_v<MT5> )
7672 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7673 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7674 :( IsLower_v<MT5> ? j : 0UL ) );
7675 const size_t kend( ( IsLower_v<MT4> )
7676 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7683 SIMDType b1( B.load(k,j) );
7684 SIMDType xmm1(
set( A(i ,k) ) * b1 );
7685 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
7687 for( ++k; k<kend; ++k ) {
7689 xmm1 +=
set( A(i ,k) ) * b1;
7690 xmm2 +=
set( A(i+1UL,k) ) * b1;
7693 C.store( i , j, C.load(i ,j) + xmm1 * factor );
7694 C.store( i+1UL, j, C.load(i+1UL,j) + xmm2 * factor );
// Single-row variant of the 1-vector panel; the k loop runs up to K here.
7700 const size_t kbegin( ( IsUpper_v<MT4> )
7701 ?( ( IsLower_v<MT5> )
7702 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7703 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7704 :( IsLower_v<MT5> ? j : 0UL ) );
7710 SIMDType xmm1(
set( A(i,k) ) * B.load(k,j) );
7712 for( ++k; k<K; ++k ) {
7713 xmm1 +=
set( A(i,k) ) * B.load(k,j);
7716 C.store( i, j, C.load(i,j) + xmm1 * factor );
// --- Scalar remainder: one column at a time, only compiled in when `remainder` ------
7721 for( ; remainder && j<N; ++j )
// Row range for column j: stops after row j when UPP is set, starts at row j when LOW is set.
7723 const size_t iend( UPP ? j+1UL : M );
7724 size_t i( LOW ? j : 0UL );
// Two rows per iteration with plain element access.
7726 for( ; (i+2UL) <= iend; i+=2UL )
7728 const size_t kbegin( ( IsUpper_v<MT4> )
7729 ?( ( IsLower_v<MT5> )
7730 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7731 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7732 :( IsLower_v<MT5> ? j : 0UL ) );
7733 const size_t kend( ( IsLower_v<MT4> )
7734 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7744 for( ++k; k<kend; ++k ) {
7745 value1 += A(i ,k) * B(k,j);
7746 value2 += A(i+1UL,k) * B(k,j);
7749 C(i ,j) += value1 * scalar;
7750 C(i+1UL,j) += value2 * scalar;
// Single leftover row of the scalar remainder; the k loop runs up to K here.
7756 const size_t kbegin( ( IsUpper_v<MT4> )
7757 ?( ( IsLower_v<MT5> )
7758 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7759 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7760 :( IsLower_v<MT5> ? j : 0UL ) );
7768 for( ++k; k<K; ++k ) {
7769 value += A(i,k) * B(k,j);
7772 C(i,j) += value * scalar;
// Small-matrix addition assignment kernel for C += scalar * (A * B), selected when
// the target type MT3 is column-major and UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2>
// holds (see the trailing return type).
//
// One operand is copied into its opposite storage order (OppositeType_t, built from
// serial( operand )) and the product, wrapped in the ForwardFunctor and scaled by
// `scalar`, is handed back to addAssign().
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7794 template<
typename MT3
7798 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7799 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7806 const ForwardFunctor fwd;
// A is not resizable but B is: convert A.
7808 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7809 const OppositeType_t<MT4> tmp(
serial( A ) );
7810 addAssign( C, fwd( tmp * B ) * scalar );
// A is resizable but B is not: convert B.
7812 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7813 const OppositeType_t<MT5> tmp(
serial( B ) );
7814 addAssign( C, fwd( A * tmp ) * scalar );
// Same resizability on both sides: convert the operand with fewer elements
// (A wins ties because of the <= comparison).
7816 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7817 const OppositeType_t<MT4> tmp(
serial( A ) );
7818 addAssign( C, fwd( tmp * B ) * scalar );
// Remaining case: convert B.
7821 const OppositeType_t<MT5> tmp(
serial( B ) );
7822 addAssign( C, fwd( A * tmp ) * scalar );
// Large-matrix addition assignment kernel, fallback overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false. It forwards all arguments
// to selectDefaultAddAssignKernel().
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7841 template<
typename MT3
7845 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7846 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7848 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel, vectorized overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is true. The work is delegated to one
// of the lmmm / ummm / mmm routines, each called with a trailing ST2(1) argument
// (presumably the weight of the existing C contents, i.e. accumulate - TODO confirm).
// NOTE(review): the conditions choosing between the three calls are presumably the
// class-level LOW / UPP flags - confirm against the branch lines.
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7867 template<
typename MT3
7871 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7872 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7875 lmmm( C, A, B, scalar, ST2(1) );
7877 ummm( C, A, B, scalar, ST2(1) );
7879 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition assignment kernel, fallback overload: enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is false. It forwards all arguments to
// selectLargeAddAssignKernel().
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7897 template<
typename MT3
7901 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7902 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7904 selectLargeAddAssignKernel( C, A, B, scalar );
7909#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// BLAS addition assignment kernel, BLAS overload: enabled when
// UseBlasKernel_v<MT3,MT4,MT5,ST2> is true. The scalar is converted to the element
// type of the target (ET) before it is passed to the BLAS wrappers.
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
7923 template<
typename MT3
7927 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7928 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7930 using ET = ElementType_t<MT3>;
// A triangular: trmm works on a copy of B (tmp), multiplied from the left by A,
// and the result is then added to C.
7932 if( IsTriangular_v<MT4> ) {
7933 ResultType_t<MT3> tmp(
serial( B ) );
7934 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7935 addAssign( C, tmp );
// B triangular: trmm works on a copy of A (tmp), multiplied from the right by B,
// and the result is then added to C.
7937 else if( IsTriangular_v<MT5> ) {
7938 ResultType_t<MT3> tmp(
serial( A ) );
7939 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7940 addAssign( C, tmp );
// General case: gemm with ET(scalar) and ET(1)
// (presumably alpha and beta, i.e. the product is accumulated into C - TODO confirm).
7943 gemm( C, A, B,
ET(scalar),
ET(1) );
// Restructuring addition assignment, enabled when CanExploitSymmetry_v<MT,MT1,MT2>
// is true. Each operand of the wrapped product is passed through transIf<>, keyed on
// whether its type is symmetric, and the rebuilt product is added to *lhs scaled by
// rhs.scalar_.
// NOTE(review): the function name and parameter list are not on the lines shown here;
// from the body this looks like an addAssign() overload taking `lhs` and `rhs` - confirm.
7963 template<
typename MT >
7965 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7974 const ForwardFunctor fwd;
// decltype(auto) keeps whatever value category/type transIf<>() returns.
7976 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
7977 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
7979 addAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
// Subtraction assignment of a scaled dense matrix product to a dense matrix
// (lhs -= scalar * A * B), enabled when CanExploitSymmetry_v<MT,MT1,MT2> is false.
//
// \param lhs Target dense matrix.
// \param rhs Scaled multiplication expression to be subtracted.
7999 template<
typename MT
8001 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8002 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Fetch the two operands of the wrapped matrix product.
8009 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8010 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: target without rows/columns, or zero inner dimension.
// NOTE(review): presumably an early return, since nothing would be subtracted - confirm.
8012 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
// Hand the operands A and B and the scalar to the kernel selection.
8026 DMatScalarMultExpr::selectSubAssignKernel( *lhs, A, B, rhs.scalar_ );
// Kernel selection for the subtraction assignment C -= scalar * A * B.
//
// The small kernel is used if any of the following holds:
//  - B's type is diagonal,
//  - BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10 columns,
//  - C has fewer than DMATDMATMULT_THRESHOLD elements (rows * columns).
// The BLAS dispatch is meant for the remaining cases.
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
8041 template<
typename MT3
8045 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8047 if( ( IsDiagonal_v<MT5> ) ||
8048 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
8049 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
8050 selectSmallSubAssignKernel( C, A, B, scalar );
8052 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the general case: enabled when neither
// operand type is diagonal. The result is subtracted from C by subAssign() using a
// temporary `tmp`.
// NOTE(review): `tmp` presumably holds the evaluated scaled product of A and B -
// confirm against its definition.
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
8070 template<
typename MT3
8074 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8075 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8078 subAssign( C, tmp );
// Default subtraction assignment kernel for a general A and a diagonal B
// (enabled when MT4 is not diagonal and MT5 is). Because only B(j,j) is read,
// each target element reduces to C(i,j) -= A(i,j) * B(j,j) * scalar.
//
// \param C      Target matrix.
// \param A      Left-hand side operand (non-diagonal type).
// \param B      Right-hand side operand (diagonal type).
// \param scalar Scaling factor applied to the product.
8096 template<
typename MT3
8100 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8101 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8105 const size_t M( A.rows() );
8106 const size_t N( B.columns() );
// Row-wise traversal of A.
8108 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower (strict) traits of A.
8110 const size_t jbegin( ( IsUpper_v<MT4> )
8111 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
8113 const size_t jend( ( IsLower_v<MT4> )
8114 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part handled two columns at a time
// (prevMultiple( jnum, 2UL ) presumably rounds jnum down to an even count).
8118 const size_t jnum( jend - jbegin );
8119 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Two columns per iteration.
8122 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
8123 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
8124 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Leftover single column at jpos.
8127 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction assignment kernel for a diagonal A and a general B
// (enabled when MT4 is diagonal and MT5 is not). Because only A(i,i) is read,
// each target element reduces to C(i,j) -= A(i,i) * B(i,j) * scalar.
//
// \param C      Target matrix.
// \param A      Left-hand side operand (diagonal type).
// \param B      Right-hand side operand (non-diagonal type).
// \param scalar Scaling factor applied to the product.
8147 template<
typename MT3
8151 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8152 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
8156 const size_t M( A.rows() );
8157 const size_t N( B.columns() );
// Row-wise traversal; row i of the result only depends on row i of B.
8159 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower (strict) traits of B.
8161 const size_t jbegin( ( IsUpper_v<MT5> )
8162 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
8164 const size_t jend( ( IsLower_v<MT5> )
8165 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part handled two columns at a time
// (prevMultiple( jnum, 2UL ) presumably rounds jnum down to an even count).
8169 const size_t jnum( jend - jbegin );
8170 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
// Two columns per iteration.
8173 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
8174 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
8175 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Leftover single column at jpos.
8178 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction assignment kernel for two diagonal operands (enabled when both
// MT4 and MT5 are diagonal). Only the diagonal of C is touched:
// C(i,i) -= A(i,i) * B(i,i) * scalar for every row index i of A.
//
// \param C      Target matrix.
// \param A      Left-hand side operand (diagonal type).
// \param B      Right-hand side operand (diagonal type).
// \param scalar Scaling factor applied to the product.
8198 template<
typename MT3
8202 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8203 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
8207 for(
size_t i=0UL; i<A.rows(); ++i ) {
8208 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel, fallback overload: enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false. It forwards all arguments
// to selectDefaultSubAssignKernel().
//
// \param C      Target matrix.
// \param A      Left-hand side operand of the product.
// \param B      Right-hand side operand of the product.
// \param scalar Scaling factor applied to the product.
8227 template<
typename MT3
8231 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8232 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8234 selectDefaultSubAssignKernel( C, A, B, scalar );
8253 template<
typename MT3
8257 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8258 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8260 constexpr bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
8262 const size_t M( A.rows() );
8263 const size_t N( B.columns() );
8264 const size_t K( A.columns() );
8271 const SIMDType factor(
set( scalar ) );
8275 if( IsIntegral_v<ElementType> )
8278 for(
size_t i=0UL; i<M; ++i )
8280 const size_t kbegin( ( IsUpper_v<MT4> )
8281 ?( ( IsLower_v<MT5> )
8282 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8283 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8284 :( IsLower_v<MT5> ? j : 0UL ) );
8285 const size_t kend( ( IsLower_v<MT4> )
8286 ?( ( IsUpper_v<MT5> )
8287 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
8288 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
8289 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
8295 SIMDType a1(
set( A(i,k) ) );
8296 SIMDType xmm1( a1 * B.load(k,j ) );
8297 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8298 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8299 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
8300 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
8301 SIMDType xmm6( a1 * B.load(k,j+
SIMDSIZE*5UL) );
8302 SIMDType xmm7( a1 * B.load(k,j+
SIMDSIZE*6UL) );
8303 SIMDType xmm8( a1 * B.load(k,j+
SIMDSIZE*7UL) );
8305 for( ++k; k<kend; ++k ) {
8307 xmm1 += a1 * B.load(k,j );
8308 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8309 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8310 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8311 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
8312 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
8313 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
8314 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
8317 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8334 for( ; (i+2UL) <= M; i+=2UL )
8336 const size_t kbegin( ( IsUpper_v<MT4> )
8337 ?( ( IsLower_v<MT5> )
8338 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8339 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8340 :( IsLower_v<MT5> ? j : 0UL ) );
8341 const size_t kend( ( IsLower_v<MT4> )
8342 ?( ( IsUpper_v<MT5> )
8343 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
8344 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8345 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
8351 SIMDType a1(
set( A(i ,k) ) );
8352 SIMDType a2(
set( A(i+1UL,k) ) );
8353 SIMDType b1( B.load(k,j ) );
8354 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8355 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8356 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
8357 SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
8358 SIMDType xmm1 ( a1 * b1 );
8359 SIMDType xmm2 ( a1 * b2 );
8360 SIMDType xmm3 ( a1 * b3 );
8361 SIMDType xmm4 ( a1 * b4 );
8362 SIMDType xmm5 ( a1 * b5 );
8363 SIMDType xmm6 ( a2 * b1 );
8364 SIMDType xmm7 ( a2 * b2 );
8365 SIMDType xmm8 ( a2 * b3 );
8366 SIMDType xmm9 ( a2 * b4 );
8367 SIMDType xmm10( a2 * b5 );
8369 for( ++k; k<kend; ++k ) {
8370 a1 =
set( A(i ,k) );
8371 a2 =
set( A(i+1UL,k) );
8389 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8394 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
8396 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm8 * factor );
8397 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm9 * factor );
8398 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) - xmm10 * factor );
8404 const size_t kbegin( ( IsUpper_v<MT4> )
8405 ?( ( IsLower_v<MT5> )
8406 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8407 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8408 :( IsLower_v<MT5> ? j : 0UL ) );
8409 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
8415 SIMDType a1(
set( A(i,k) ) );
8416 SIMDType xmm1( a1 * B.load(k,j ) );
8417 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8418 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8419 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
8420 SIMDType xmm5( a1 * B.load(k,j+
SIMDSIZE*4UL) );
8422 for( ++k; k<kend; ++k ) {
8424 xmm1 += a1 * B.load(k,j );
8425 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8426 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8427 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8428 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
8431 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8444 for( ; (i+2UL) <= M; i+=2UL )
8446 const size_t kbegin( ( IsUpper_v<MT4> )
8447 ?( ( IsLower_v<MT5> )
8448 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8449 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8450 :( IsLower_v<MT5> ? j : 0UL ) );
8451 const size_t kend( ( IsLower_v<MT4> )
8452 ?( ( IsUpper_v<MT5> )
8453 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
8454 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8455 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
8461 SIMDType a1(
set( A(i ,k) ) );
8462 SIMDType a2(
set( A(i+1UL,k) ) );
8463 SIMDType b1( B.load(k,j ) );
8464 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8465 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8466 SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
8467 SIMDType xmm1( a1 * b1 );
8468 SIMDType xmm2( a1 * b2 );
8469 SIMDType xmm3( a1 * b3 );
8470 SIMDType xmm4( a1 * b4 );
8471 SIMDType xmm5( a2 * b1 );
8472 SIMDType xmm6( a2 * b2 );
8473 SIMDType xmm7( a2 * b3 );
8474 SIMDType xmm8( a2 * b4 );
8476 for( ++k; k<kend; ++k ) {
8477 a1 =
set( A(i ,k) );
8478 a2 =
set( A(i+1UL,k) );
8493 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8497 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
8499 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm7 * factor );
8500 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm8 * factor );
8506 const size_t kbegin( ( IsUpper_v<MT4> )
8507 ?( ( IsLower_v<MT5> )
8508 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8509 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8510 :( IsLower_v<MT5> ? j : 0UL ) );
8511 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
8517 SIMDType a1(
set( A(i,k) ) );
8518 SIMDType xmm1( a1 * B.load(k,j ) );
8519 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8520 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8521 SIMDType xmm4( a1 * B.load(k,j+
SIMDSIZE*3UL) );
8523 for( ++k; k<kend; ++k ) {
8525 xmm1 += a1 * B.load(k,j );
8526 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8527 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8528 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8531 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8543 for( ; (i+2UL) <= M; i+=2UL )
8545 const size_t kbegin( ( IsUpper_v<MT4> )
8546 ?( ( IsLower_v<MT5> )
8547 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8548 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8549 :( IsLower_v<MT5> ? j : 0UL ) );
8550 const size_t kend( ( IsLower_v<MT4> )
8551 ?( ( IsUpper_v<MT5> )
8552 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
8553 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8554 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
8560 SIMDType a1(
set( A(i ,k) ) );
8561 SIMDType a2(
set( A(i+1UL,k) ) );
8562 SIMDType b1( B.load(k,j ) );
8563 SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8564 SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8565 SIMDType xmm1( a1 * b1 );
8566 SIMDType xmm2( a1 * b2 );
8567 SIMDType xmm3( a1 * b3 );
8568 SIMDType xmm4( a2 * b1 );
8569 SIMDType xmm5( a2 * b2 );
8570 SIMDType xmm6( a2 * b3 );
8572 for( ++k; k<kend; ++k ) {
8573 a1 =
set( A(i ,k) );
8574 a2 =
set( A(i+1UL,k) );
8586 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8589 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
8591 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm6 * factor );
8597 const size_t kbegin( ( IsUpper_v<MT4> )
8598 ?( ( IsLower_v<MT5> )
8599 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8600 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8601 :( IsLower_v<MT5> ? j : 0UL ) );
8602 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
8608 SIMDType a1(
set( A(i,k) ) );
8609 SIMDType xmm1( a1 * B.load(k,j ) );
8610 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE ) );
8611 SIMDType xmm3( a1 * B.load(k,j+
SIMDSIZE*2UL) );
8613 for( ++k; k<kend; ++k ) {
8615 xmm1 += a1 * B.load(k,j );
8616 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8617 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8620 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8629 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
8630 size_t i( LOW ? j : 0UL );
8632 for( ; (i+4UL) <= iend; i+=4UL )
8634 const size_t kbegin( ( IsUpper_v<MT4> )
8635 ?( ( IsLower_v<MT5> )
8636 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8637 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8638 :( IsLower_v<MT5> ? j : 0UL ) );
8639 const size_t kend( ( IsLower_v<MT4> )
8640 ?( ( IsUpper_v<MT5> )
8641 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
8642 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8643 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8649 SIMDType a1(
set( A(i ,k) ) );
8650 SIMDType a2(
set( A(i+1UL,k) ) );
8651 SIMDType a3(
set( A(i+2UL,k) ) );
8652 SIMDType a4(
set( A(i+3UL,k) ) );
8653 SIMDType b1( B.load(k,j ) );
8654 SIMDType b2( B.load(k,j+
SIMDSIZE) );
8655 SIMDType xmm1( a1 * b1 );
8656 SIMDType xmm2( a1 * b2 );
8657 SIMDType xmm3( a2 * b1 );
8658 SIMDType xmm4( a2 * b2 );
8659 SIMDType xmm5( a3 * b1 );
8660 SIMDType xmm6( a3 * b2 );
8661 SIMDType xmm7( a4 * b1 );
8662 SIMDType xmm8( a4 * b2 );
8664 for( ++k; k<kend; ++k ) {
8665 a1 =
set( A(i ,k) );
8666 a2 =
set( A(i+1UL,k) );
8667 a3 =
set( A(i+2UL,k) );
8668 a4 =
set( A(i+3UL,k) );
8681 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8683 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8685 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8687 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
8692 for( ; (i+3UL) <= iend; i+=3UL )
8694 const size_t kbegin( ( IsUpper_v<MT4> )
8695 ?( ( IsLower_v<MT5> )
8696 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8697 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8698 :( IsLower_v<MT5> ? j : 0UL ) );
8699 const size_t kend( ( IsLower_v<MT4> )
8700 ?( ( IsUpper_v<MT5> )
8701 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
8702 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8703 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8709 SIMDType a1(
set( A(i ,k) ) );
8710 SIMDType a2(
set( A(i+1UL,k) ) );
8711 SIMDType a3(
set( A(i+2UL,k) ) );
8712 SIMDType b1( B.load(k,j ) );
8713 SIMDType b2( B.load(k,j+
SIMDSIZE) );
8714 SIMDType xmm1( a1 * b1 );
8715 SIMDType xmm2( a1 * b2 );
8716 SIMDType xmm3( a2 * b1 );
8717 SIMDType xmm4( a2 * b2 );
8718 SIMDType xmm5( a3 * b1 );
8719 SIMDType xmm6( a3 * b2 );
8721 for( ++k; k<kend; ++k ) {
8722 a1 =
set( A(i ,k) );
8723 a2 =
set( A(i+1UL,k) );
8724 a3 =
set( A(i+2UL,k) );
8735 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8737 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8739 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8744 for( ; (i+2UL) <= iend; i+=2UL )
8746 const size_t kbegin( ( IsUpper_v<MT4> )
8747 ?( ( IsLower_v<MT5> )
8748 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8749 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8750 :( IsLower_v<MT5> ? j : 0UL ) );
8751 const size_t kend( ( IsLower_v<MT4> )
8752 ?( ( IsUpper_v<MT5> )
8753 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
8754 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8755 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8761 SIMDType a1(
set( A(i ,k) ) );
8762 SIMDType a2(
set( A(i+1UL,k) ) );
8763 SIMDType b1( B.load(k,j ) );
8764 SIMDType b2( B.load(k,j+
SIMDSIZE) );
8765 SIMDType xmm1( a1 * b1 );
8766 SIMDType xmm2( a1 * b2 );
8767 SIMDType xmm3( a2 * b1 );
8768 SIMDType xmm4( a2 * b2 );
8770 for( ++k; k<kend; ++k ) {
8771 a1 =
set( A(i ,k) );
8772 a2 =
set( A(i+1UL,k) );
8781 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8783 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8790 const size_t kbegin( ( IsUpper_v<MT4> )
8791 ?( ( IsLower_v<MT5> )
8792 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8793 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8794 :( IsLower_v<MT5> ? j : 0UL ) );
8795 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
8801 SIMDType a1(
set( A(i,k) ) );
8802 SIMDType xmm1( a1 * B.load(k,j ) );
8803 SIMDType xmm2( a1 * B.load(k,j+
SIMDSIZE) );
8805 for( ++k; k<kend; ++k ) {
8807 xmm1 += a1 * B.load(k,j );
8811 C.store( i, j , C.load(i,j ) - xmm1 * factor );
8819 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
8820 size_t i( LOW ? j : 0UL );
8822 for( ; (i+4UL) <= iend; i+=4UL )
8824 const size_t kbegin( ( IsUpper_v<MT4> )
8825 ?( ( IsLower_v<MT5> )
8826 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8827 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8828 :( IsLower_v<MT5> ? j : 0UL ) );
8829 const size_t kend( ( IsLower_v<MT4> )
8830 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8837 SIMDType b1( B.load(k,j) );
8838 SIMDType xmm1(
set( A(i ,k) ) * b1 );
8839 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
8840 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
8841 SIMDType xmm4(
set( A(i+3UL,k) ) * b1 );
8843 for( ++k; k<kend; ++k ) {
8845 xmm1 +=
set( A(i ,k) ) * b1;
8846 xmm2 +=
set( A(i+1UL,k) ) * b1;
8847 xmm3 +=
set( A(i+2UL,k) ) * b1;
8848 xmm4 +=
set( A(i+3UL,k) ) * b1;
8851 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8852 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
8853 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
8854 C.store( i+3UL, j, C.load(i+3UL,j) - xmm4 * factor );
8858 for( ; (i+3UL) <= iend; i+=3UL )
8860 const size_t kbegin( ( IsUpper_v<MT4> )
8861 ?( ( IsLower_v<MT5> )
8862 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8863 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8864 :( IsLower_v<MT5> ? j : 0UL ) );
8865 const size_t kend( ( IsLower_v<MT4> )
8866 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8873 SIMDType b1( B.load(k,j) );
8874 SIMDType xmm1(
set( A(i ,k) ) * b1 );
8875 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
8876 SIMDType xmm3(
set( A(i+2UL,k) ) * b1 );
8878 for( ++k; k<kend; ++k ) {
8880 xmm1 +=
set( A(i ,k) ) * b1;
8881 xmm2 +=
set( A(i+1UL,k) ) * b1;
8882 xmm3 +=
set( A(i+2UL,k) ) * b1;
8885 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8886 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
8887 C.store( i+2UL, j, C.load(i+2UL,j) - xmm3 * factor );
8891 for( ; (i+2UL) <= iend; i+=2UL )
8893 const size_t kbegin( ( IsUpper_v<MT4> )
8894 ?( ( IsLower_v<MT5> )
8895 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8896 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8897 :( IsLower_v<MT5> ? j : 0UL ) );
8898 const size_t kend( ( IsLower_v<MT4> )
8899 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8906 SIMDType b1( B.load(k,j) );
8907 SIMDType xmm1(
set( A(i ,k) ) * b1 );
8908 SIMDType xmm2(
set( A(i+1UL,k) ) * b1 );
8910 for( ++k; k<kend; ++k ) {
8912 xmm1 +=
set( A(i ,k) ) * b1;
8913 xmm2 +=
set( A(i+1UL,k) ) * b1;
8916 C.store( i , j, C.load(i ,j) - xmm1 * factor );
8917 C.store( i+1UL, j, C.load(i+1UL,j) - xmm2 * factor );
8923 const size_t kbegin( ( IsUpper_v<MT4> )
8924 ?( ( IsLower_v<MT5> )
8925 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8926 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8927 :( IsLower_v<MT5> ? j : 0UL ) );
8933 SIMDType xmm1(
set( A(i,k) ) * B.load(k,j) );
8935 for( ++k; k<K; ++k ) {
8936 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8939 C.store( i, j, C.load(i,j) - xmm1 * factor );
8944 for( ; remainder && j<N; ++j )
8946 const size_t iend( UPP ? j+1UL : M );
8947 size_t i( LOW ? j : 0UL );
8949 for( ; (i+2UL) <= iend; i+=2UL )
8951 const size_t kbegin( ( IsUpper_v<MT4> )
8952 ?( ( IsLower_v<MT5> )
8953 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8954 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8955 :( IsLower_v<MT5> ? j : 0UL ) );
8956 const size_t kend( ( IsLower_v<MT4> )
8957 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8967 for( ++k; k<kend; ++k ) {
8968 value1 += A(i ,k) * B(k,j);
8969 value2 += A(i+1UL,k) * B(k,j);
8972 C(i ,j) -= value1 * scalar;
8973 C(i+1UL,j) -= value2 * scalar;
8979 const size_t kbegin( ( IsUpper_v<MT4> )
8980 ?( ( IsLower_v<MT5> )
8981 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8982 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8983 :( IsLower_v<MT5> ? j : 0UL ) );
8991 for( ++k; k<K; ++k ) {
8992 value += A(i,k) * B(k,j);
8995 C(i,j) -= value * scalar;
9017 template<
typename MT3
9021 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9022 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9029 const ForwardFunctor fwd;
9031 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
9032 const OppositeType_t<MT4> tmp(
serial( A ) );
9033 subAssign( C, fwd( tmp * B ) * scalar );
9035 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
9036 const OppositeType_t<MT5> tmp(
serial( B ) );
9037 subAssign( C, fwd( A * tmp ) * scalar );
9039 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
9040 const OppositeType_t<MT4> tmp(
serial( A ) );
9041 subAssign( C, fwd( tmp * B ) * scalar );
9044 const OppositeType_t<MT5> tmp(
serial( B ) );
9045 subAssign( C, fwd( A * tmp ) * scalar );
9064 template<
typename MT3
9068 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9069 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9071 selectDefaultSubAssignKernel( C, A, B, scalar );
9090 template<
typename MT3
9094 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9095 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
9098 lmmm( C, A, B, -scalar, ST2(1) );
9100 ummm( C, A, B, -scalar, ST2(1) );
9102 mmm( C, A, B, -scalar, ST2(1) );
9120 template<
typename MT3
9124 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9125 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9127 selectLargeSubAssignKernel( C, A, B, scalar );
9132#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
9146 template<
typename MT3
9150 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
9151 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
9153 using ET = ElementType_t<MT3>;
9155 if( IsTriangular_v<MT4> ) {
9156 ResultType_t<MT3> tmp(
serial( B ) );
9157 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
9158 subAssign( C, tmp );
9160 else if( IsTriangular_v<MT5> ) {
9161 ResultType_t<MT3> tmp(
serial( A ) );
9162 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
9163 subAssign( C, tmp );
9166 gemm( C, A, B,
ET(-scalar),
ET(1) );
9186 template<
typename MT >
9188 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9197 const ForwardFunctor fwd;
9199 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9200 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9202 subAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9222 template<
typename MT
9224 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
9236 schurAssign( *lhs, tmp );
9267 template<
typename MT
9270 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9277 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9278 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9280 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL ) {
9283 else if( left.columns() == 0UL ) {
9317 template<
typename MT
9320 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9324 using TmpType = If_t< SO, OppositeType, ResultType >;
9336 const ForwardFunctor fwd;
9338 const TmpType tmp( rhs );
9357 template<
typename MT >
9359 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9368 const ForwardFunctor fwd;
9370 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9371 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9373 smpAssign( *lhs, fwd( A * B ) * rhs.scalar_ );
9392 template<
typename MT
9395 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9402 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9403 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9405 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9437 template<
typename MT >
9439 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9448 const ForwardFunctor fwd;
9450 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9451 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9476 template<
typename MT
9479 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
9486 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
9487 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
9489 if( (*lhs).rows() == 0UL || (*lhs).columns() == 0UL || left.columns() == 0UL ) {
9521 template<
typename MT >
9523 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
9532 const ForwardFunctor fwd;
9534 decltype(
auto) A(
transIf< IsSymmetric_v<MT1> >( rhs.matrix_.leftOperand() ) );
9535 decltype(
auto) B(
transIf< IsSymmetric_v<MT2> >( rhs.matrix_.rightOperand() ) );
9557 template<
typename MT
9637template<
typename MT1
9639inline decltype(
auto)
9644 if( (*lhs).columns() != (*rhs).rows() ) {
9649 return ReturnType( *lhs, *rhs );
9685template<
typename MT1
9691inline decltype(
auto)
declsym(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9699 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9700 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9729template<
typename MT1
9735inline decltype(
auto)
declherm(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9743 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9744 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9773template<
typename MT1
9779inline decltype(
auto)
decllow(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9787 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9788 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9817template<
typename MT1
9822inline decltype(
auto)
declunilow(
const DMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
9859template<
typename MT1
9864inline decltype(
auto)
declstrlow(
const DMatDMatMultExpr<MT1,MT2,SF,HF,false,UF>& dm )
9901template<
typename MT1
9907inline decltype(
auto)
declupp(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9915 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9916 return ReturnType( dm.leftOperand(), dm.rightOperand() );
9945template<
typename MT1
9950inline decltype(
auto)
decluniupp(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
9987template<
typename MT1
9992inline decltype(
auto)
declstrupp(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,false>& dm )
10029template<
typename MT1
10035inline decltype(
auto)
decldiag(
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
10043 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
10044 return ReturnType( dm.leftOperand(), dm.rightOperand() );
10060template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10061struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
10062 :
public Size<MT1,0UL>
10065template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10066struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
10067 :
public Size<MT2,1UL>
10083template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
10084struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
10085 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions.
Definition: Aliases.h:310
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for kernel specific block sizes.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the conjugate shim.
Header file for the decldiag trait.
Header file for the DeclDiag functor.
Header file for the declherm trait.
Header file for the DeclHerm functor.
Header file for the decllow trait.
Header file for the DeclLow functor.
Header file for the declsym trait.
Header file for the DeclSym functor.
Header file for the declupp trait.
Header file for the DeclUpp functor.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsColumnMajorMatrix type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsIntegral type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsResizable type trait.
Header file for the IsRowMajorMatrix type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Header file for the dense matrix multiplication kernels.
Header file for the multiplication trait.
Header file for the Noop functor.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Constraint on the data type.
Constraint on the data type.
Expression object for dense matrix-dense matrix multiplications.
Definition: DMatDMatMultExpr.h:154
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:346
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:291
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDMatMultExpr.h:309
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDMatMultExpr.h:322
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:331
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:497
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:304
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:453
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:485
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:395
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:179
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:176
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:288
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:177
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:290
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:172
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:298
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:295
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:167
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:287
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:178
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:292
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:441
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:411
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:421
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDMatMultExpr.h:316
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:285
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:301
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:475
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:465
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:431
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:498
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:289
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:592
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:548
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:170
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:108
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:602
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:167
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:176
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:159
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:570
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:538
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:106
MatScalarMultExpr< DenseMatrix< This, SO > > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:162
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:179
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:173
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:611
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:427
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:558
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:437
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:446
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:582
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:164
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:528
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:459
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:166
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:610
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:432
Base class for dense matrices.
Definition: DenseMatrix.h:82
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseMatrix base class.
Header file for the DenseVector base class.
Header file for the MatMatMultExpr base class.
Header file for the MatScalarMultExpr base class.
Header file for the SparseVector base class.
Header file for BLAS general matrix/matrix multiplication functions (gemm).
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) transIf(const DenseMatrix< MT, SO > &dm)
Conditional calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:832
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) declstrupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly upper.
Definition: DMatDeclStrUppExpr.h:1003
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1464
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:978
decltype(auto) declstrlow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as strictly lower.
Definition: DMatDeclStrLowExpr.h:1003
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1004
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1004
decltype(auto) decluniupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as uniupper.
Definition: DMatDeclUniUppExpr.h:1005
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1005
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1005
decltype(auto) declunilow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as unilower.
Definition: DMatDeclUniLowExpr.h:1004
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type.
Definition: StorageOrder.h:84
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatMatMultExpr.h:103
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:584
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:1383
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
typename EnableIf<!Condition, T >::Type DisableIf_t
Auxiliary alias template for the EnableIf class template with a negated condition.
Definition: EnableIf.h:175
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
constexpr decltype(auto) zero(size_t m, size_t n) noexcept
Creating a zero matrix.
Definition: ZeroMatrix.h:1356
Header file for the exception macros of the math module.
Constraints on the storage order of matrix types.
Header file for all forward declarations for expression class templates.
Header file for the Size type trait.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:127
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:61
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:126
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:61
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:126
Generic wrapper for the decllow() function.
Definition: DeclLow.h:61
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:126
Generic wrapper for the declsym() function.
Definition: DeclSym.h:61
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:126
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:61
Base class for all matrix/matrix multiplication expression templates.
Definition: MatMatMultExpr.h:71
Base template for the MultTrait class.
Definition: MultTrait.h:130
Generic wrapper for the null function.
Definition: Noop.h:62
System settings for the BLAS mode.
System settings for the debugging policy of the Blaze library.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/matrix multiplication functions (trmm).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.