35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDMATMULTEXPR_H_ 145 template<
typename MT1
152 :
public MatMatMultExpr< DenseMatrix< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, false > >
// Compile-time evaluation switches for the two operands: each flag is set when
// the operand type is reported by IsComputation_v or RequiresEvaluation_v.
167 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
172 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Effective structure flags derived from the SF/HF/LF/UF template flags.
// SYM: SF is set and none of HF, LF, UF is set.
176 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
// HERM: HF is set and neither LF nor UF is set.
177 static constexpr
bool HERM = ( HF && !( LF || UF ) );
// LOW: LF is set, or UF is combined with SF or HF.
178 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
// UPP: UF is set, or LF is combined with SF or HF.
179 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// Helper variable template: true when the target type T1 is a column-major
// matrix and at least one of the operand types T2, T3 is symmetric.
189 template<
typename T1,
typename T2,
typename T3 >
190 static constexpr
bool CanExploitSymmetry_v =
191 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
201 template<
typename T1,
typename T2,
typename T3 >
202 static constexpr
bool IsEvaluationRequired_v =
212 template<
typename T1,
typename T2,
typename T3 >
213 static constexpr
bool UseBlasKernel_v =
216 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
217 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
218 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
219 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
220 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
221 IsBLASCompatible_v< ElementType_t<T1> > &&
222 IsBLASCompatible_v< ElementType_t<T2> > &&
223 IsBLASCompatible_v< ElementType_t<T3> > &&
234 template<
typename T1,
typename T2,
typename T3 >
235 static constexpr
bool UseVectorizedDefaultKernel_v =
236 ( useOptimizedKernels &&
237 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
238 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
239 IsSIMDCombinable_v< ElementType_t<T1>
310 ( !IsDiagonal_v<MT2> &&
311 MT1::simdEnabled && MT2::simdEnabled &&
312 HasSIMDAdd_v<ET1,ET2> &&
313 HasSIMDMult_v<ET1,ET2> );
350 if( IsDiagonal_v<MT1> ) {
353 else if( IsDiagonal_v<MT2> ) {
356 else if( IsTriangular_v<MT1> || IsTriangular_v<MT2> ) {
357 const size_t begin( ( IsUpper_v<MT1> )
358 ?( ( IsLower_v<MT2> )
359 ?(
max( ( IsStrictlyUpper_v<MT1> ? i+1UL : i )
360 , ( IsStrictlyLower_v<MT2> ? j+1UL : j ) ) )
361 :( IsStrictlyUpper_v<MT1> ? i+1UL : i ) )
362 :( ( IsLower_v<MT2> )
363 ?( IsStrictlyLower_v<MT2> ? j+1UL : j )
365 const size_t end( ( IsLower_v<MT1> )
366 ?( ( IsUpper_v<MT2> )
367 ?(
min( ( IsStrictlyLower_v<MT1> ? i : i+1UL )
368 , ( IsStrictlyUpper_v<MT2> ? j : j+1UL ) ) )
369 :( IsStrictlyLower_v<MT1> ? i : i+1UL ) )
370 :( ( IsUpper_v<MT2> )
371 ?( IsStrictlyUpper_v<MT2> ? j : j+1UL )
372 :(
lhs_.columns() ) ) );
396 if( i >=
lhs_.rows() ) {
399 if( j >=
rhs_.columns() ) {
411 inline size_t rows() const noexcept {
422 return rhs_.columns();
// Returns whether the expression can alias with the given address: true as
// soon as either the left or the right operand reports canAlias( alias ).
452 template<
typename T >
453 inline bool canAlias(
const T* alias )
const noexcept {
454 return (
lhs_.canAlias( alias ) ||
rhs_.canAlias( alias ) );
// Returns whether the expression is aliased with the given address: true as
// soon as either the left or the right operand reports isAliased( alias ).
464 template<
typename T >
465 inline bool isAliased(
const T* alias )
const noexcept {
466 return (
lhs_.isAliased( alias ) ||
rhs_.isAliased( alias ) );
476 return lhs_.isAligned() &&
rhs_.isAligned();
487 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
489 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
490 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD ) &&
491 !IsDiagonal_v<MT1> && !IsDiagonal_v<MT2>;
514 template<
typename MT
524 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
527 else if( rhs.lhs_.columns() == 0UL ) {
542 DMatDMatMultExpr::selectAssignKernel( ~lhs, A, B );
// Kernel dispatch for the assignment C = A * B.
// NOTE(review): the remaining template parameters (MT4, MT5) and the `else`
// keyword between the two calls are not visible in this excerpt.
558 template<
typename MT3
561 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is chosen when B is diagonal, when (outside debug mode) B has
// at most SIMDSIZE*10 columns, or when the number of elements of C is below
// DMATDMATMULT_THRESHOLD.
563 if( ( IsDiagonal_v<MT5> ) ||
564 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
565 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
566 selectSmallAssignKernel( C, A, B );
// Alternative path: hand over to the BLAS kernel selection.
568 selectBlasAssignKernel( C, A, B );
// Default (scalar) assignment kernel for C = A * B; this overload is enabled
// only when neither A nor B is a diagonal matrix.
// NOTE(review): several lines of the original body (fallback branches of the
// index expressions, loop bodies that only clear elements, closing braces) are
// not visible in this excerpt.
587 template<
typename MT3
590 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
591 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N: M rows of A, N columns of B, K is the inner dimension.
593 const size_t M( A.rows() );
594 const size_t N( B.columns() );
595 const size_t K( A.columns() );
599 for(
size_t i=0UL; i<M; ++i )
// Range of k for row i, narrowed when A is (strictly) upper or lower.
601 const size_t kbegin( ( IsUpper_v<MT4> )
602 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
604 const size_t kend( ( IsLower_v<MT4> )
605 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty k range: the whole row i is visited
// column by column (loop body not visible here — presumably clears C(i,j)).
609 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
610 for(
size_t j=0UL; j<N; ++j ) {
// Column range for the first k (k == kbegin), narrowed by the structure of B
// and by the UPP/LOW flags of the result.
617 const size_t jbegin( ( IsUpper_v<MT5> )
618 ?( ( IsStrictlyUpper_v<MT5> )
619 ?(
UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
620 :(
UPP ?
max(i,kbegin) : kbegin ) )
621 :(
UPP ? i : 0UL ) );
622 const size_t jend( ( IsLower_v<MT5> )
623 ?( ( IsStrictlyLower_v<MT5> )
624 ?(
LOW ?
min(i+1UL,kbegin) : kbegin )
625 :(
LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
626 :(
LOW ? i+1UL : N ) );
// Columns in front of jbegin are handled separately (bodies not visible).
628 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) ||
UPP ) {
629 for(
size_t j=0UL; j<jbegin; ++j ) {
633 else if( IsStrictlyUpper_v<MT5> ) {
// First contribution is written with plain assignment, which initializes C(i,j).
636 for(
size_t j=jbegin; j<jend; ++j ) {
637 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards are handled separately (bodies not visible).
639 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) ||
LOW ) {
640 for(
size_t j=jend; j<N; ++j ) {
644 else if( IsStrictlyLower_v<MT5> ) {
// Remaining k values are accumulated onto the already initialized row.
649 for(
size_t k=kbegin+1UL; k<kend; ++k )
651 const size_t jbegin( ( IsUpper_v<MT5> )
652 ?( ( IsStrictlyUpper_v<MT5> )
656 const size_t jend( ( IsLower_v<MT5> )
657 ?( ( IsStrictlyLower_v<MT5> )
658 ?(
LOW ?
min(i+1UL,k-1UL) : k-1UL )
659 :(
LOW ?
min(i+1UL,k) : k ) )
660 :(
LOW ? i+1UL : N ) );
// With any structure flag set the column range may be empty: skip this k.
662 if( (
SYM ||
HERM ||
LOW ||
UPP ) && ( jbegin > jend ) )
continue;
665 for(
size_t j=jbegin; j<jend; ++j ) {
666 C(i,j) += A(i,k) * B(k,j);
// For a lower B the element at column jend is written with plain assignment
// instead of being accumulated.
668 if( IsLower_v<MT5> ) {
669 C(i,jend) = A(i,k) * B(k,jend);
// Mirror pass: every element below the diagonal is copied from its transposed
// position, conjugated when HERM is set. The guarding condition of this pass is
// not visible in this excerpt.
675 for(
size_t i=1UL; i<M; ++i ) {
676 for(
size_t j=0UL; j<i; ++j ) {
677 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
// Default assignment kernel for C = A * B; this overload is enabled when A is
// not diagonal and B is diagonal.
// NOTE(review): fallback branches of the index expressions and the bodies of
// the leading/trailing column loops are not visible in this excerpt.
699 template<
typename MT3
702 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
703 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
707 const size_t M( A.rows() );
708 const size_t N( B.columns() );
710 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when A is (strictly) upper or lower.
712 const size_t jbegin( ( IsUpper_v<MT4> )
713 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
715 const size_t jend( ( IsLower_v<MT4> )
716 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Upper A: columns in front of jbegin are visited separately.
720 if( IsUpper_v<MT4> ) {
721 for(
size_t j=0UL; j<jbegin; ++j ) {
// Each element of A is scaled by the diagonal element of B in the same column.
725 for(
size_t j=jbegin; j<jend; ++j ) {
726 C(i,j) = A(i,j) * B(j,j);
// Lower A: columns from jend onwards are visited separately.
728 if( IsLower_v<MT4> ) {
729 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for C = A * B; this overload is enabled when A is
// diagonal and B is not diagonal.
// NOTE(review): fallback branches of the index expressions and the bodies of
// the leading/trailing column loops are not visible in this excerpt.
752 template<
typename MT3
755 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
756 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
760 const size_t M( A.rows() );
761 const size_t N( B.columns() );
763 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when B is (strictly) upper or lower.
765 const size_t jbegin( ( IsUpper_v<MT5> )
766 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
768 const size_t jend( ( IsLower_v<MT5> )
769 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Upper B: columns in front of jbegin are visited separately.
773 if( IsUpper_v<MT5> ) {
774 for(
size_t j=0UL; j<jbegin; ++j ) {
// Each element of row i of B is scaled by the diagonal element A(i,i).
778 for(
size_t j=jbegin; j<jend; ++j ) {
779 C(i,j) = A(i,i) * B(i,j);
// Lower B: columns from jend onwards are visited separately.
781 if( IsLower_v<MT5> ) {
782 for(
size_t j=jend; j<N; ++j ) {
// Default assignment kernel for C = A * B; this overload is enabled when both A
// and B are diagonal. Only the diagonal of C is written here, as the product of
// the matching diagonal elements.
// NOTE(review): any preparation of the off-diagonal elements of C is not
// visible in this excerpt.
805 template<
typename MT3
808 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
809 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
815 for(
size_t i=0UL; i<A.rows(); ++i ) {
816 C(i,i) = A(i,i) * B(i,i);
// Small-matrix assignment kernel, fallback overload: active whenever
// UseVectorizedDefaultKernel_v is false for the three matrix types, in which
// case the work is forwarded unchanged to the default assignment kernel.
835 template<
typename MT3
838 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
839 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
841 selectDefaultAssignKernel( C, A, B );
861 template<
typename MT3
864 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
865 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
867 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
869 const size_t M( A.rows() );
870 const size_t N( B.columns() );
871 const size_t K( A.columns() );
875 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
880 if( IsIntegral_v<ElementType> )
883 for(
size_t i=0UL; i<M; ++i )
885 const size_t kbegin( ( IsUpper_v<MT4> )
886 ?( ( IsLower_v<MT5> )
887 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
888 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
889 :( IsLower_v<MT5> ? j : 0UL ) );
890 const size_t kend( ( IsLower_v<MT4> )
891 ?( ( IsUpper_v<MT5> )
892 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
893 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
894 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
896 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
898 for(
size_t k=kbegin; k<kend; ++k ) {
900 xmm1 += a1 * B.load(k,j );
902 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
903 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
904 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
905 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
906 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
907 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
910 C.store( i, j , xmm1 );
926 for( ; (i+2UL) <= M; i+=2UL )
928 const size_t kbegin( ( IsUpper_v<MT4> )
929 ?( ( IsLower_v<MT5> )
930 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
931 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
932 :( IsLower_v<MT5> ? j : 0UL ) );
933 const size_t kend( ( IsLower_v<MT4> )
934 ?( ( IsUpper_v<MT5> )
935 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
936 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
937 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
939 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
941 for(
size_t k=kbegin; k<kend; ++k ) {
961 C.store( i , j , xmm1 );
963 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
964 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
965 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
966 C.store( i+1UL, j , xmm6 );
967 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
968 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
969 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
970 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
975 const size_t kbegin( ( IsUpper_v<MT4> )
976 ?( ( IsLower_v<MT5> )
977 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
978 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
979 :( IsLower_v<MT5> ? j : 0UL ) );
980 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
982 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
984 for(
size_t k=kbegin; k<kend; ++k ) {
986 xmm1 += a1 * B.load(k,j );
988 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
989 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
990 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
993 C.store( i, j , xmm1 );
1009 for(
size_t jj=j; jj<jjend; ++jj ) {
1010 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1017 for(
size_t jj=j; jj<jjend; ++jj ) {
1023 for( ; (i+2UL) <= iend; i+=2UL )
1025 const size_t kbegin( ( IsUpper_v<MT4> )
1026 ?( ( IsLower_v<MT5> )
1027 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1028 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1029 :( IsLower_v<MT5> ? j : 0UL ) );
1030 const size_t kend( ( IsLower_v<MT4> )
1031 ?( ( IsUpper_v<MT5> )
1032 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
1033 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1034 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
1036 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1038 for(
size_t k=kbegin; k<kend; ++k ) {
1055 C.store( i , j , xmm1 );
1057 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1058 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
1059 C.store( i+1UL, j , xmm5 );
1060 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
1061 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
1062 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
1067 const size_t kbegin( ( IsUpper_v<MT4> )
1068 ?( ( IsLower_v<MT5> )
1069 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1070 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1071 :( IsLower_v<MT5> ? j : 0UL ) );
1072 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
1076 for(
size_t k=kbegin; k<kend; ++k ) {
1078 xmm1 += a1 * B.load(k,j );
1079 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1080 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1081 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
1084 C.store( i, j , xmm1 );
1086 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1087 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
1095 for(
size_t jj=j; jj<jjend; ++jj ) {
1110 for(
size_t jj=j; jj<jjend; ++jj ) {
1111 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1118 for(
size_t jj=j; jj<jjend; ++jj ) {
1124 for( ; (i+2UL) <= iend; i+=2UL )
1126 const size_t kbegin( ( IsUpper_v<MT4> )
1127 ?( ( IsLower_v<MT5> )
1128 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1129 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1130 :( IsLower_v<MT5> ? j : 0UL ) );
1131 const size_t kend( ( IsLower_v<MT4> )
1132 ?( ( IsUpper_v<MT5> )
1133 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
1134 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1135 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
1137 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1139 for(
size_t k=kbegin; k<kend; ++k ) {
1153 C.store( i , j , xmm1 );
1155 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
1156 C.store( i+1UL, j , xmm4 );
1157 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
1158 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
1163 const size_t kbegin( ( IsUpper_v<MT4> )
1164 ?( ( IsLower_v<MT5> )
1165 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1166 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1167 :( IsLower_v<MT5> ? j : 0UL ) );
1168 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
1172 for(
size_t k=kbegin; k<kend; ++k ) {
1174 xmm1 += a1 * B.load(k,j );
1175 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
1176 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
1179 C.store( i, j , xmm1 );
1181 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
1189 for(
size_t jj=j; jj<jjend; ++jj ) {
1204 for(
size_t jj=j; jj<jjend; ++jj ) {
1205 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1212 for(
size_t jj=j; jj<jjend; ++jj ) {
1218 for( ; (i+4UL) <= iend; i+=4UL )
1220 const size_t kbegin( ( IsUpper_v<MT4> )
1221 ?( ( IsLower_v<MT5> )
1222 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1223 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1224 :( IsLower_v<MT5> ? j : 0UL ) );
1225 const size_t kend( ( IsLower_v<MT4> )
1226 ?( ( IsUpper_v<MT5> )
1227 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
1228 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
1229 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1231 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1233 for(
size_t k=kbegin; k<kend; ++k ) {
1250 C.store( i , j , xmm1 );
1252 C.store( i+1UL, j , xmm3 );
1253 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1254 C.store( i+2UL, j , xmm5 );
1255 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1256 C.store( i+3UL, j , xmm7 );
1257 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
1260 for( ; (i+3UL) <= iend; i+=3UL )
1262 const size_t kbegin( ( IsUpper_v<MT4> )
1263 ?( ( IsLower_v<MT5> )
1264 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1265 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1266 :( IsLower_v<MT5> ? j : 0UL ) );
1267 const size_t kend( ( IsLower_v<MT4> )
1268 ?( ( IsUpper_v<MT5> )
1269 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
1270 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
1271 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1273 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1275 for(
size_t k=kbegin; k<kend; ++k ) {
1289 C.store( i , j , xmm1 );
1291 C.store( i+1UL, j , xmm3 );
1292 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
1293 C.store( i+2UL, j , xmm5 );
1294 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
1297 for( ; (i+2UL) <= iend; i+=2UL )
1299 const size_t kbegin( ( IsUpper_v<MT4> )
1300 ?( ( IsLower_v<MT5> )
1301 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1302 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1303 :( IsLower_v<MT5> ? j : 0UL ) );
1304 const size_t kend( ( IsLower_v<MT4> )
1305 ?( ( IsUpper_v<MT5> )
1306 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
1307 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
1308 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
1310 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1313 for( ; (k+2UL) <= kend; k+=2UL ) {
1318 const SIMDType b1( B.load(k ,j ) );
1320 const SIMDType b3( B.load(k+1UL,j ) );
1332 for( ; k<kend; ++k ) {
1343 C.store( i , j , xmm1+xmm5 );
1344 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
1345 C.store( i+1UL, j , xmm3+xmm7 );
1346 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
1351 const size_t kbegin( ( IsUpper_v<MT4> )
1352 ?( ( IsLower_v<MT5> )
1353 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1354 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1355 :( IsLower_v<MT5> ? j : 0UL ) );
1356 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
1361 for( ; (k+2UL) <= kend; k+=2UL ) {
1364 xmm1 += a1 * B.load(k ,j );
1365 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
1366 xmm3 += a2 * B.load(k+1UL,j );
1367 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
1370 for( ; k<kend; ++k ) {
1372 xmm1 += a1 * B.load(k,j );
1376 C.store( i, j , xmm1+xmm3 );
1377 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
1385 for(
size_t jj=j; jj<jjend; ++jj ) {
1400 for(
size_t jj=j; jj<jjend; ++jj ) {
1401 C(i,jj) =
HERM ?
conj( C(jj,i) ) : C(jj,i);
1408 for(
size_t jj=j; jj<jjend; ++jj ) {
1414 for( ; (i+4UL) <= iend; i+=4UL )
1416 const size_t kbegin( ( IsUpper_v<MT4> )
1417 ?( ( IsLower_v<MT5> )
1418 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1419 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1420 :( IsLower_v<MT5> ? j : 0UL ) );
1421 const size_t kend( ( IsLower_v<MT4> )
1422 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
1425 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1428 for( ; (k+2UL) <= kend; k+=2UL ) {
1430 const SIMDType b2( B.load(k+1UL,j) );
1431 xmm1 +=
set( A(i ,k ) ) * b1;
1432 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1433 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1434 xmm4 +=
set( A(i+3UL,k ) ) * b1;
1435 xmm5 +=
set( A(i ,k+1UL) ) * b2;
1436 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
1437 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
1438 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
1441 for( ; k<kend; ++k ) {
1443 xmm1 +=
set( A(i ,k) ) * b1;
1444 xmm2 +=
set( A(i+1UL,k) ) * b1;
1445 xmm3 +=
set( A(i+2UL,k) ) * b1;
1446 xmm4 +=
set( A(i+3UL,k) ) * b1;
1449 C.store( i , j, xmm1+xmm5 );
1450 C.store( i+1UL, j, xmm2+xmm6 );
1451 C.store( i+2UL, j, xmm3+xmm7 );
1452 C.store( i+3UL, j, xmm4+xmm8 );
1455 for( ; (i+3UL) <= iend; i+=3UL )
1457 const size_t kbegin( ( IsUpper_v<MT4> )
1458 ?( ( IsLower_v<MT5> )
1459 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1460 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1461 :( IsLower_v<MT5> ? j : 0UL ) );
1462 const size_t kend( ( IsLower_v<MT4> )
1463 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
1466 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1469 for( ; (k+2UL) <= kend; k+=2UL ) {
1471 const SIMDType b2( B.load(k+1UL,j) );
1472 xmm1 +=
set( A(i ,k ) ) * b1;
1473 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1474 xmm3 +=
set( A(i+2UL,k ) ) * b1;
1475 xmm4 +=
set( A(i ,k+1UL) ) * b2;
1476 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
1477 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
1480 for( ; k<kend; ++k ) {
1482 xmm1 +=
set( A(i ,k) ) * b1;
1483 xmm2 +=
set( A(i+1UL,k) ) * b1;
1484 xmm3 +=
set( A(i+2UL,k) ) * b1;
1487 C.store( i , j, xmm1+xmm4 );
1488 C.store( i+1UL, j, xmm2+xmm5 );
1489 C.store( i+2UL, j, xmm3+xmm6 );
1492 for( ; (i+2UL) <= iend; i+=2UL )
1494 const size_t kbegin( ( IsUpper_v<MT4> )
1495 ?( ( IsLower_v<MT5> )
1496 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1497 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1498 :( IsLower_v<MT5> ? j : 0UL ) );
1499 const size_t kend( ( IsLower_v<MT4> )
1500 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1506 for( ; (k+2UL) <= kend; k+=2UL ) {
1508 const SIMDType b2( B.load(k+1UL,j) );
1509 xmm1 +=
set( A(i ,k ) ) * b1;
1510 xmm2 +=
set( A(i+1UL,k ) ) * b1;
1511 xmm3 +=
set( A(i ,k+1UL) ) * b2;
1512 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
1515 for( ; k<kend; ++k ) {
1517 xmm1 +=
set( A(i ,k) ) * b1;
1518 xmm2 +=
set( A(i+1UL,k) ) * b1;
1521 C.store( i , j, xmm1+xmm3 );
1522 C.store( i+1UL, j, xmm2+xmm4 );
1527 const size_t kbegin( ( IsUpper_v<MT4> )
1528 ?( ( IsLower_v<MT5> )
1529 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1530 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1531 :( IsLower_v<MT5> ? j : 0UL ) );
1536 for( ; (k+2UL) <= K; k+=2UL ) {
1537 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
1538 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
1542 xmm1 +=
set( A(i,k) ) * B.load(k,j);
1545 C.store( i, j, xmm1+xmm2 );
1553 for(
size_t jj=j; jj<jjend; ++jj ) {
1560 for( ; remainder && j<N; ++j )
1566 C(i,j) =
HERM ?
conj( C(j,i) ) : C(j,i);
1575 for( ; (i+2UL) <= M; i+=2UL )
1577 const size_t kbegin( ( IsUpper_v<MT4> )
1578 ?( ( IsLower_v<MT5> )
1579 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1580 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1581 :( IsLower_v<MT5> ? j : 0UL ) );
1582 const size_t kend( ( IsLower_v<MT4> )
1583 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
1589 for(
size_t k=kbegin; k<kend; ++k ) {
1590 value1 += A(i ,k) * B(k,j);
1591 value2 += A(i+1UL,k) * B(k,j);
1595 C(i+1UL,j) = value2;
1600 const size_t kbegin( ( IsUpper_v<MT4> )
1601 ?( ( IsLower_v<MT5> )
1602 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
1603 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
1604 :( IsLower_v<MT5> ? j : 0UL ) );
1608 for(
size_t k=kbegin; k<K; ++k ) {
1609 value += A(i,k) * B(k,j);
// Small-matrix assignment kernel for a column-major target when the vectorized
// default kernel is usable. One operand is converted to its OppositeType
// (evaluated via serial) and the re-formed product is passed back to assign().
// NOTE(review): the final `else` keyword is not visible in this excerpt.
1634 template<
typename MT3
1637 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1638 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1645 const ForwardFunctor fwd;
// A is not resizable but B is: convert A.
1647 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
1648 const OppositeType_t<MT4> tmp(
serial( A ) );
1649 assign( C, fwd( tmp * B ) );
// A is resizable but B is not: convert B.
1651 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
1652 const OppositeType_t<MT5> tmp(
serial( B ) );
1653 assign( C, fwd( A * tmp ) );
// Otherwise convert the operand with fewer elements; A wins on a tie.
1655 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
1656 const OppositeType_t<MT4> tmp(
serial( A ) );
1657 assign( C, fwd( tmp * B ) );
1660 const OppositeType_t<MT5> tmp(
serial( B ) );
1661 assign( C, fwd( A * tmp ) );
// Large-matrix assignment kernel, fallback overload: active whenever
// UseVectorizedDefaultKernel_v is false for the three matrix types, in which
// case the work is forwarded unchanged to the default assignment kernel.
1680 template<
typename MT3
1683 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1684 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
1686 selectDefaultAssignKernel( C, A, B );
1705 template<
typename MT3
1708 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1709 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS assignment kernel, fallback overload: active whenever UseBlasKernel_v is
// false for the three matrix types, in which case the work is forwarded to the
// large-matrix kernel selection.
1738 template<
typename MT3
1741 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1742 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1744 selectLargeAssignKernel( C, A, B );
// BLAS-based assignment kernel; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and only selected
// when UseBlasKernel_v holds for the three matrix types.
// NOTE(review): the statements that prepare C ahead of each trmm call and the
// final `else` keyword are not visible in this excerpt.
1750 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 1763 template<
typename MT3
1766 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1767 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
1769 using ET = ElementType_t<MT3>;
// Triangular left operand: trmm is applied from the left with A, using the
// lower or upper variant according to IsLower_v<MT4>, and scaling factor 1.
1771 if( IsTriangular_v<MT4> ) {
1773 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
// Triangular right operand: trmm is applied from the right with B.
1775 else if( IsTriangular_v<MT5> ) {
1777 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
// General case: gemm is called with the scalar arguments ET(1) and ET(0).
1780 gemm( C, A, B, ET(1), ET(0) );
// Assignment of the product to a sparse matrix; disabled when
// CanExploitSymmetry_v holds for the target and operand types. The expression
// is first evaluated (via serial) into a temporary whose type is OppositeType
// when SO is set and ResultType otherwise; the temporary, wrapped by the
// forward functor, is then assigned to the target.
1800 template<
typename MT
1802 friend inline auto assign( SparseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
1803 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1807 using TmpType = If_t< SO, OppositeType, ResultType >;
1819 const ForwardFunctor fwd;
1821 const TmpType tmp(
serial( rhs ) );
1822 assign( ~lhs, fwd( tmp ) );
// Assignment of the product to a column-major matrix (storage flag `true`),
// enabled only when CanExploitSymmetry_v holds. Every operand that is symmetric
// is replaced by its transpose before the product is re-formed and assigned.
// NOTE(review): the final `else` keyword is not visible in this excerpt.
1842 template<
typename MT >
1843 friend inline auto assign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
1844 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
1853 const ForwardFunctor fwd;
// Both operands symmetric: transpose both.
1855 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
1856 assign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand symmetric: transpose the left operand.
1857 else if( IsSymmetric_v<MT1> )
1858 assign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: transpose the right operand.
1860 assign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Addition assignment of the product to a dense matrix; disabled when
// CanExploitSymmetry_v holds for the target and operand types.
// NOTE(review): the body of the guard below and the definitions of the operands
// A and B passed to the kernel are not visible in this excerpt.
1878 template<
typename MT
1880 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
1881 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Guard for an empty target or a zero inner dimension.
1888 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
1902 DMatDMatMultExpr::selectAddAssignKernel( ~lhs, A, B );
// Kernel dispatch for the addition assignment C += A * B.
// NOTE(review): the remaining template parameters (MT4, MT5) and the `else`
// keyword between the two calls are not visible in this excerpt.
1918 template<
typename MT3
1921 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
// The small kernel is chosen when B is diagonal, when (outside debug mode) B has
// at most SIMDSIZE*10 columns, or when the number of elements of C is below
// DMATDMATMULT_THRESHOLD.
1923 if( ( IsDiagonal_v<MT5> ) ||
1924 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
1925 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
1926 selectSmallAddAssignKernel( C, A, B );
// Alternative path: hand over to the BLAS kernel selection.
1928 selectBlasAddAssignKernel( C, A, B );
// Default (scalar) addition assignment kernel for C += A * B; this overload is
// enabled only when neither A nor B is a diagonal matrix.
// NOTE(review): fallback branches of the k range expressions and the guard in
// front of the final remainder statement are not visible in this excerpt.
1947 template<
typename MT3
1950 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
1951 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N: M rows of A, N columns of B, K is the inner dimension.
1953 const size_t M( A.rows() );
1954 const size_t N( B.columns() );
1955 const size_t K( A.columns() );
1959 for(
size_t i=0UL; i<M; ++i )
// Range of k for row i, narrowed when A is (strictly) upper or lower.
1961 const size_t kbegin( ( IsUpper_v<MT4> )
1962 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
1964 const size_t kend( ( IsLower_v<MT4> )
1965 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
1969 for(
size_t k=kbegin; k<kend; ++k )
// Column range for this k, narrowed by the structure of B and by the UPP/LOW
// flags of the result.
1971 const size_t jbegin( ( IsUpper_v<MT5> )
1972 ?( ( IsStrictlyUpper_v<MT5> )
1973 ?(
UPP ?
max(i,k+1UL) : k+1UL )
1974 :(
UPP ?
max(i,k) : k ) )
1975 :(
UPP ? i : 0UL ) );
1976 const size_t jend( ( IsLower_v<MT5> )
1977 ?( ( IsStrictlyLower_v<MT5> )
1978 ?(
LOW ?
min(i+1UL,k) : k )
1979 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
1980 :(
LOW ? i+1UL : N ) );
// With LOW or UPP set the column range may be empty: skip this k.
1982 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos marks the end of the part of the range that is processed two columns at
// a time: the element count is rounded down to an even number.
1985 const size_t jnum( jend - jbegin );
1986 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1988 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1989 C(i,j ) += A(i,k) * B(k,j );
1990 C(i,j+1UL) += A(i,k) * B(k,j+1UL);
// Remainder column at jpos.
1993 C(i,jpos) += A(i,k) * B(k,jpos);
// Default addition assignment kernel for C += A * B; this overload is enabled
// when A is not diagonal and B is diagonal.
// NOTE(review): fallback branches of the index expressions and the guard in
// front of the final remainder statement are not visible in this excerpt.
2015 template<
typename MT3
2018 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2019 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2023 const size_t M( A.rows() );
2024 const size_t N( B.columns() );
2026 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when A is (strictly) upper or lower.
2028 const size_t jbegin( ( IsUpper_v<MT4> )
2029 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
2031 const size_t jend( ( IsLower_v<MT4> )
2032 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos marks the end of the part of the range processed two columns at a time.
2036 const size_t jnum( jend - jbegin );
2037 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Each element of A is scaled by the diagonal element of B in the same column.
2039 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2040 C(i,j ) += A(i,j ) * B(j ,j );
2041 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL);
// Remainder column at jpos.
2044 C(i,jpos) += A(i,jpos) * B(jpos,jpos);
// Default addition assignment kernel for C += A * B; this overload is enabled
// when A is diagonal and B is not diagonal.
// NOTE(review): fallback branches of the index expressions and the guard in
// front of the final remainder statement are not visible in this excerpt.
2065 template<
typename MT3
2068 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2069 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
2073 const size_t M( A.rows() );
2074 const size_t N( B.columns() );
2076 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed when B is (strictly) upper or lower.
2078 const size_t jbegin( ( IsUpper_v<MT5> )
2079 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
2081 const size_t jend( ( IsLower_v<MT5> )
2082 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos marks the end of the part of the range processed two columns at a time.
2086 const size_t jnum( jend - jbegin );
2087 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Each element of row i of B is scaled by the diagonal element A(i,i).
2089 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2090 C(i,j ) += A(i,i) * B(i,j );
2091 C(i,j+1UL) += A(i,i) * B(i,j+1UL);
// Remainder column at jpos.
2094 C(i,jpos) += A(i,i) * B(i,jpos);
// Default addition assignment kernel for C += A * B; this overload is enabled
// when both A and B are diagonal. Only the diagonal of C is updated, with the
// product of the matching diagonal elements.
2115 template<
typename MT3
2118 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2119 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
2123 for(
size_t i=0UL; i<A.rows(); ++i ) {
2124 C(i,i) += A(i,i) * B(i,i);
// Small-matrix addition assignment kernel, fallback overload: active whenever
// UseVectorizedDefaultKernel_v is false for the three matrix types, in which
// case the work is forwarded unchanged to the default addition assignment kernel.
2144 template<
typename MT3
2147 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2148 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2150 selectDefaultAddAssignKernel( C, A, B );
// Vectorized small-matrix addition-assignment kernel (C += A * B); this overload
// is enabled for a row-major target MT3 when UseVectorizedDefaultKernel_v holds.
// The visible code works on tiles of C: SIMD vectors are loaded from C and B,
// products with elements of A are accumulated, and the result is stored back to C.
// In every tile, kbegin/kend narrow the summation index k using the compile-time
// upper/lower flags of MT4 and MT5 (presumably to skip structural zeros).
// NOTE(review): the enclosing column loops, several accumulator declarations,
// some stores and all braces are not visible here.
2170 template<
typename MT3
2173 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2174 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass is needed unless both C and B are padded.
2176 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
2178 const size_t M( A.rows() );
2179 const size_t N( B.columns() );
2180 const size_t K( A.columns() );
// jpos: N with its low bits masked off by size_t(-SIMDSIZE) when a remainder pass
// exists (a round-down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of
// two); otherwise N.
2184 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Integral element types only: tile of one row by eight SIMD vectors
// (columns j through j+SIMDSIZE*7UL).
2189 if( IsIntegral_v<ElementType> )
2192 for(
size_t i=0UL; i<M; ++i )
2194 const size_t kbegin( ( IsUpper_v<MT4> )
2195 ?( ( IsLower_v<MT5> )
2196 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2197 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2198 :( IsLower_v<MT5> ? j : 0UL ) );
2199 const size_t kend( ( IsLower_v<MT4> )
2200 ?( ( IsUpper_v<MT5> )
2201 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
2202 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
2203 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
2214 for(
size_t k=kbegin; k<kend; ++k ) {
2216 xmm1 += a1 * B.load(k,j );
2217 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2218 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2219 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2220 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
2221 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
2222 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
2223 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
2226 C.store( i, j , xmm1 );
2228 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2229 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
2230 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
2231 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
2232 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
2233 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Tile of two rows by five SIMD vectors: xmm1..xmm5 belong to row i,
// xmm6..xmm10 to row i+1 (see the stores below).
2242 for( ; (i+2UL) <= M; i+=2UL )
2244 const size_t kbegin( ( IsUpper_v<MT4> )
2245 ?( ( IsLower_v<MT5> )
2246 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2247 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2248 :( IsLower_v<MT5> ? j : 0UL ) );
2249 const size_t kend( ( IsLower_v<MT4> )
2250 ?( ( IsUpper_v<MT5> )
2251 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
2252 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2253 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
2260 SIMDType xmm6 ( C.load(i+1UL,j ) );
2266 for(
size_t k=kbegin; k<kend; ++k ) {
2286 C.store( i , j , xmm1 );
2288 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2289 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
2290 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
2291 C.store( i+1UL, j , xmm6 );
2292 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
2293 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
2294 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
2295 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Single-row variant of the five-vector tile (kend has no row-dependent bound).
2300 const size_t kbegin( ( IsUpper_v<MT4> )
2301 ?( ( IsLower_v<MT5> )
2302 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2303 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2304 :( IsLower_v<MT5> ? j : 0UL ) );
2305 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
2313 for(
size_t k=kbegin; k<kend; ++k ) {
2315 xmm1 += a1 * B.load(k,j );
2316 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2317 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2318 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2319 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
2322 C.store( i, j , xmm1 );
2324 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2325 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
2326 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Tile of two rows by four SIMD vectors.
2334 for( ; (i+2UL) <= M; i+=2UL )
2336 const size_t kbegin( ( IsUpper_v<MT4> )
2337 ?( ( IsLower_v<MT5> )
2338 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2339 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2340 :( IsLower_v<MT5> ? j : 0UL ) );
2341 const size_t kend( ( IsLower_v<MT4> )
2342 ?( ( IsUpper_v<MT5> )
2343 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
2344 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2345 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
2356 for(
size_t k=kbegin; k<kend; ++k ) {
2373 C.store( i , j , xmm1 );
2375 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2376 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
2377 C.store( i+1UL, j , xmm5 );
2378 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
2379 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
2380 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single-row variant of the four-vector tile.
2385 const size_t kbegin( ( IsUpper_v<MT4> )
2386 ?( ( IsLower_v<MT5> )
2387 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2388 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2389 :( IsLower_v<MT5> ? j : 0UL ) );
2390 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
2397 for(
size_t k=kbegin; k<kend; ++k ) {
2399 xmm1 += a1 * B.load(k,j );
2400 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2401 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2402 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
2405 C.store( i, j , xmm1 );
2407 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
2408 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Tile of two rows by three SIMD vectors.
2416 for( ; (i+2UL) <= M; i+=2UL )
2418 const size_t kbegin( ( IsUpper_v<MT4> )
2419 ?( ( IsLower_v<MT5> )
2420 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2421 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2422 :( IsLower_v<MT5> ? j : 0UL ) );
2423 const size_t kend( ( IsLower_v<MT4> )
2424 ?( ( IsUpper_v<MT5> )
2425 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
2426 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2427 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
2436 for(
size_t k=kbegin; k<kend; ++k ) {
2450 C.store( i , j , xmm1 );
2452 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
2453 C.store( i+1UL, j , xmm4 );
2454 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
2455 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single-row variant of the three-vector tile.
2460 const size_t kbegin( ( IsUpper_v<MT4> )
2461 ?( ( IsLower_v<MT5> )
2462 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2463 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2464 :( IsLower_v<MT5> ? j : 0UL ) );
2465 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
2471 for(
size_t k=kbegin; k<kend; ++k ) {
2473 xmm1 += a1 * B.load(k,j );
2474 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
2475 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
2478 C.store( i, j , xmm1 );
2480 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Two-vector-wide column block: the row index starts at j when LOW is set,
// otherwise at 0.
2487 size_t i(
LOW ? j : 0UL );
// Tile of four rows by two SIMD vectors.
2489 for( ; (i+4UL) <= iend; i+=4UL )
2491 const size_t kbegin( ( IsUpper_v<MT4> )
2492 ?( ( IsLower_v<MT5> )
2493 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2494 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2495 :( IsLower_v<MT5> ? j : 0UL ) );
2496 const size_t kend( ( IsLower_v<MT4> )
2497 ?( ( IsUpper_v<MT5> )
2498 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
2499 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
2500 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2511 for(
size_t k=kbegin; k<kend; ++k ) {
2528 C.store( i , j , xmm1 );
2530 C.store( i+1UL, j , xmm3 );
2531 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
2532 C.store( i+2UL, j , xmm5 );
2533 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
2534 C.store( i+3UL, j , xmm7 );
2535 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Tile of three rows by two SIMD vectors.
2538 for( ; (i+3UL) <= iend; i+=3UL )
2540 const size_t kbegin( ( IsUpper_v<MT4> )
2541 ?( ( IsLower_v<MT5> )
2542 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2543 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2544 :( IsLower_v<MT5> ? j : 0UL ) );
2545 const size_t kend( ( IsLower_v<MT4> )
2546 ?( ( IsUpper_v<MT5> )
2547 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
2548 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
2549 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2558 for(
size_t k=kbegin; k<kend; ++k ) {
2572 C.store( i , j , xmm1 );
2574 C.store( i+1UL, j , xmm3 );
2575 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
2576 C.store( i+2UL, j , xmm5 );
2577 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Tile of two rows by two SIMD vectors; k advances two steps per iteration and
// the two sets of accumulators are summed when storing (xmm1+xmm5, ...).
2580 for( ; (i+2UL) <= iend; i+=2UL )
2582 const size_t kbegin( ( IsUpper_v<MT4> )
2583 ?( ( IsLower_v<MT5> )
2584 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2585 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2586 :( IsLower_v<MT5> ? j : 0UL ) );
2587 const size_t kend( ( IsLower_v<MT4> )
2588 ?( ( IsUpper_v<MT5> )
2589 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
2590 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
2591 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
2600 for( ; (k+2UL) <= kend; k+=2UL ) {
2605 const SIMDType b1( B.load(k ,j ) );
2607 const SIMDType b3( B.load(k+1UL,j ) );
2619 for( ; k<kend; ++k ) {
2630 C.store( i , j , xmm1+xmm5 );
2631 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
2632 C.store( i+1UL, j , xmm3+xmm7 );
2633 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Single-row variant of the two-vector tile, k advanced two steps per iteration.
2638 const size_t kbegin( ( IsUpper_v<MT4> )
2639 ?( ( IsLower_v<MT5> )
2640 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2641 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2642 :( IsLower_v<MT5> ? j : 0UL ) );
2643 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
2650 for( ; (k+2UL) <= kend; k+=2UL ) {
2653 xmm1 += a1 * B.load(k ,j );
2654 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
2655 xmm3 += a2 * B.load(k+1UL,j );
2656 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
2659 for( ; k<kend; ++k ) {
2661 xmm1 += a1 * B.load(k,j );
2665 C.store( i, j , xmm1+xmm3 );
2666 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Single-vector-wide column block: the row index starts at j when LOW is set,
// otherwise at 0.
2673 size_t i(
LOW ? j : 0UL );
// Tile of four rows by one SIMD vector, k advanced two steps per iteration.
2675 for( ; (i+4UL) <= iend; i+=4UL )
2677 const size_t kbegin( ( IsUpper_v<MT4> )
2678 ?( ( IsLower_v<MT5> )
2679 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2680 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2681 :( IsLower_v<MT5> ? j : 0UL ) );
2682 const size_t kend( ( IsLower_v<MT4> )
2683 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
2693 for( ; (k+2UL) <= kend; k+=2UL ) {
2695 const SIMDType b2( B.load(k+1UL,j) );
2696 xmm1 +=
set( A(i ,k ) ) * b1;
2697 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2698 xmm3 +=
set( A(i+2UL,k ) ) * b1;
2699 xmm4 +=
set( A(i+3UL,k ) ) * b1;
2700 xmm5 +=
set( A(i ,k+1UL) ) * b2;
2701 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
2702 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
2703 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
2706 for( ; k<kend; ++k ) {
2708 xmm1 +=
set( A(i ,k) ) * b1;
2709 xmm2 +=
set( A(i+1UL,k) ) * b1;
2710 xmm3 +=
set( A(i+2UL,k) ) * b1;
2711 xmm4 +=
set( A(i+3UL,k) ) * b1;
2714 C.store( i , j, xmm1+xmm5 );
2715 C.store( i+1UL, j, xmm2+xmm6 );
2716 C.store( i+2UL, j, xmm3+xmm7 );
2717 C.store( i+3UL, j, xmm4+xmm8 );
// Tile of three rows by one SIMD vector.
2720 for( ; (i+3UL) <= iend; i+=3UL )
2722 const size_t kbegin( ( IsUpper_v<MT4> )
2723 ?( ( IsLower_v<MT5> )
2724 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2725 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2726 :( IsLower_v<MT5> ? j : 0UL ) );
2727 const size_t kend( ( IsLower_v<MT4> )
2728 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
2737 for( ; (k+2UL) <= kend; k+=2UL ) {
2739 const SIMDType b2( B.load(k+1UL,j) );
2740 xmm1 +=
set( A(i ,k ) ) * b1;
2741 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2742 xmm3 +=
set( A(i+2UL,k ) ) * b1;
2743 xmm4 +=
set( A(i ,k+1UL) ) * b2;
2744 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
2745 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
2748 for( ; k<kend; ++k ) {
2750 xmm1 +=
set( A(i ,k) ) * b1;
2751 xmm2 +=
set( A(i+1UL,k) ) * b1;
2752 xmm3 +=
set( A(i+2UL,k) ) * b1;
2755 C.store( i , j, xmm1+xmm4 );
2756 C.store( i+1UL, j, xmm2+xmm5 );
2757 C.store( i+2UL, j, xmm3+xmm6 );
// Tile of two rows by one SIMD vector.
2760 for( ; (i+2UL) <= iend; i+=2UL )
2762 const size_t kbegin( ( IsUpper_v<MT4> )
2763 ?( ( IsLower_v<MT5> )
2764 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2765 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2766 :( IsLower_v<MT5> ? j : 0UL ) );
2767 const size_t kend( ( IsLower_v<MT4> )
2768 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2776 for( ; (k+2UL) <= kend; k+=2UL ) {
2778 const SIMDType b2( B.load(k+1UL,j) );
2779 xmm1 +=
set( A(i ,k ) ) * b1;
2780 xmm2 +=
set( A(i+1UL,k ) ) * b1;
2781 xmm3 +=
set( A(i ,k+1UL) ) * b2;
2782 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
2785 for( ; k<kend; ++k ) {
2787 xmm1 +=
set( A(i ,k) ) * b1;
2788 xmm2 +=
set( A(i+1UL,k) ) * b1;
2791 C.store( i , j, xmm1+xmm3 );
2792 C.store( i+1UL, j, xmm2+xmm4 );
// Single row, single SIMD vector; the k loop here runs up to K.
2797 const size_t kbegin( ( IsUpper_v<MT4> )
2798 ?( ( IsLower_v<MT5> )
2799 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2800 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2801 :( IsLower_v<MT5> ? j : 0UL ) );
2807 for( ; (k+2UL) <= K; k+=2UL ) {
2808 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
2809 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
2813 xmm1 +=
set( A(i,k) ) * B.load(k,j);
2816 C.store( i, j, xmm1+xmm2 );
// Scalar pass over the remaining columns (only when remainder is true): rows are
// limited to [j, ...) when LOW is set and to [..., j+1) when UPP is set.
2820 for( ; remainder && j<N; ++j )
2822 const size_t iend(
UPP ? j+1UL : M );
2823 size_t i(
LOW ? j : 0UL );
// Two rows per iteration with scalar accumulators value1/value2.
2825 for( ; (i+2UL) <= iend; i+=2UL )
2827 const size_t kbegin( ( IsUpper_v<MT4> )
2828 ?( ( IsLower_v<MT5> )
2829 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2830 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2831 :( IsLower_v<MT5> ? j : 0UL ) );
2832 const size_t kend( ( IsLower_v<MT4> )
2833 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
2839 for(
size_t k=kbegin; k<kend; ++k ) {
2840 value1 += A(i ,k) * B(k,j);
2841 value2 += A(i+1UL,k) * B(k,j);
2845 C(i+1UL,j) = value2;
// Single remaining row of the scalar pass.
2850 const size_t kbegin( ( IsUpper_v<MT4> )
2851 ?( ( IsLower_v<MT5> )
2852 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
2853 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
2854 :( IsLower_v<MT5> ? j : 0UL ) );
2858 for(
size_t k=kbegin; k<K; ++k ) {
2859 value += A(i,k) * B(k,j);
// Small-matrix addition-assignment kernel; this overload is enabled for a
// column-major target MT3 when UseVectorizedDefaultKernel_v holds. It does not
// compute the product itself: one operand is copied into a temporary of the
// opposite storage order (OppositeType_t) and addAssign is re-issued on the
// resulting product, passed through the ForwardFunctor fwd.
// Which operand is converted:
//  - only MT5 resizable       -> convert A
//  - only MT4 resizable       -> convert B
//  - otherwise                -> convert A if it has no more elements than B;
//                                the last visible statements convert B
//                                (presumably the else branch, not visible here).
2884 template<
typename MT3
2887 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2888 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2895 const ForwardFunctor fwd;
2897 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
2898 const OppositeType_t<MT4> tmp(
serial( A ) );
2899 addAssign( C, fwd( tmp * B ) );
2901 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
2902 const OppositeType_t<MT5> tmp(
serial( B ) );
2903 addAssign( C, fwd( A * tmp ) );
2905 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
2906 const OppositeType_t<MT4> tmp(
serial( A ) );
2907 addAssign( C, fwd( tmp * B ) );
2910 const OppositeType_t<MT5> tmp(
serial( B ) );
2911 addAssign( C, fwd( A * tmp ) );
// Large-matrix addition-assignment kernel; this overload is selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false and simply forwards to
// the default (non-vectorized) addition-assignment kernel.
2931 template<
typename MT3
2934 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2935 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
2937 selectDefaultAddAssignKernel( C, A, B );
// Large-matrix addition-assignment kernel; this overload is enabled when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> holds.
// NOTE(review): the function body is not visible here; confirm its behavior
// against the full file before relying on this description.
2957 template<
typename MT3
2960 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2961 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS-path addition-assignment kernel; this overload is selected when
// UseBlasKernel_v<MT3,MT4,MT5> is false and falls back to the large-matrix
// addition-assignment kernel.
2987 template<
typename MT3
2990 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
2991 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
2993 selectLargeAddAssignKernel( C, A, B );
// BLAS-based addition-assignment kernel (C += A * B); compiled only when
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are set, and
// enabled when UseBlasKernel_v<MT3,MT4,MT5> holds.
//  - MT4 triangular: copy B into a temporary, apply trmm with A from the left
//    (lower/upper chosen by IsLower_v<MT4>), then add the temporary to C.
//  - MT5 triangular: copy A into a temporary, apply trmm with B from the right
//    (lower/upper chosen by IsLower_v<MT5>), then add the temporary to C.
//  - the final visible statement calls gemm with the scalars ET(1), ET(1)
//    (presumably the else branch, with the scalars acting as alpha and beta,
//    i.e. C = A*B + C; confirm against the gemm wrapper).
2999 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 3013 template<
typename MT3
3016 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3017 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
3019 using ET = ElementType_t<MT3>;
3021 if( IsTriangular_v<MT4> ) {
3022 ResultType_t<MT3> tmp(
serial( B ) );
3023 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
3024 addAssign( C, tmp );
3026 else if( IsTriangular_v<MT5> ) {
3027 ResultType_t<MT3> tmp(
serial( A ) );
3028 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
3029 addAssign( C, tmp );
3032 gemm( C, A, B, ET(1), ET(1) );
// Addition assignment of the multiplication expression to a column-major
// matrix (Matrix<MT,true>); enabled when CanExploitSymmetry_v<MT,MT1,MT2>
// holds, i.e. the target is column-major and at least one operand type is
// symmetric. The product is rebuilt with trans() applied to each symmetric
// operand and addAssign is re-issued on it, passed through the ForwardFunctor fwd:
//  - both symmetric      -> trans(lhs_) * trans(rhs_)
//  - only MT1 symmetric  -> trans(lhs_) * rhs_
//  - last visible form   -> lhs_ * trans(rhs_) (presumably the else branch).
3054 template<
typename MT >
3055 friend inline auto addAssign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
3056 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
3065 const ForwardFunctor fwd;
3067 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
3068 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
3069 else if( IsSymmetric_v<MT1> )
3070 addAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
3072 addAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
// Subtraction assignment of the multiplication expression to a dense matrix;
// disabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
// NOTE(review): the body of the empty-size branch and the setup of the
// operands A and B are not visible here.
3094 template<
typename MT
3096 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
3097 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Special case: the target has no rows or no columns, or the left operand has
// no columns (empty inner dimension).
3104 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
// Dispatch to the kernel selection with the target and the operands A and B.
3118 DMatDMatMultExpr::selectSubAssignKernel( ~lhs, A, B );
// Kernel selection for the subtraction assignment (C -= A * B).
// The small kernel is chosen when any of the following holds:
//  - MT5 (type of B) is diagonal,
//  - BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10UL columns,
//  - C has fewer than DMATDMATMULT_THRESHOLD elements (rows * columns).
// The last visible statement calls the BLAS kernel selection (presumably the
// else branch, which is not visible here).
3134 template<
typename MT3
3137 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3139 if( ( IsDiagonal_v<MT5> ) ||
3140 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
3141 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
3142 selectSmallSubAssignKernel( C, A, B );
3144 selectBlasSubAssignKernel( C, A, B );
// Default subtraction-assignment kernel; this overload is enabled when neither
// MT4 (type of A) nor MT5 (type of B) is diagonal. For each row i and each
// summation index k, A(i,k) * B(k,j) is subtracted from C(i,j) over a column
// range [jbegin, jend), two columns per iteration.
// NOTE(review): the full template parameter list, the braces and the
// alternative branches of the kbegin/kend conditionals are not visible here.
3163 template<
typename MT3
3166 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3167 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3169 const size_t M( A.rows() );
3170 const size_t N( B.columns() );
3171 const size_t K( A.columns() );
3175 for(
size_t i=0UL; i<M; ++i )
// k range of row i: for an upper A it starts at i (i+1 if strictly upper),
// for a lower A it ends at i+1 (i if strictly lower).
3177 const size_t kbegin( ( IsUpper_v<MT4> )
3178 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3180 const size_t kend( ( IsLower_v<MT4> )
3181 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
3185 for(
size_t k=kbegin; k<kend; ++k )
// Column range for (i,k): bounded by the upper/lower flags of B and, when the
// expression flags UPP/LOW are set, additionally clamped against row i.
3187 const size_t jbegin( ( IsUpper_v<MT5> )
3188 ?( ( IsStrictlyUpper_v<MT5> )
3189 ?(
UPP ?
max(i,k+1UL) : k+1UL )
3190 :(
UPP ?
max(i,k) : k ) )
3191 :(
UPP ? i : 0UL ) );
3192 const size_t jend( ( IsLower_v<MT5> )
3193 ?( ( IsStrictlyLower_v<MT5> )
3194 ?(
LOW ?
min(i+1UL,k) : k )
3195 :(
LOW ?
min(i,k)+1UL : k+1UL ) )
3196 :(
LOW ? i+1UL : N ) );
// With LOW or UPP set the clamped range can be empty; skip this k in that case.
3198 if( (
LOW ||
UPP ) && ( jbegin >= jend ) )
continue;
// jpos = jbegin + (column count rounded down to an even number).
3201 const size_t jnum( jend - jbegin );
3202 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3204 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3205 C(i,j ) -= A(i,k) * B(k,j );
3206 C(i,j+1UL) -= A(i,k) * B(k,j+1UL);
// Update for the single column at jpos left over by the two-column loop.
// NOTE(review): presumably guarded by a jpos < jend check that is not visible here.
3209 C(i,jpos) -= A(i,k) * B(k,jpos);
// Default subtraction-assignment kernel; this overload is enabled when MT4
// (type of A) is not diagonal and MT5 (type of B) is diagonal. Only the
// diagonal element B(j,j) is read from B, so C(i,j) -= A(i,j) * B(j,j).
// NOTE(review): the full template parameter list, the braces and the
// alternative branches of the jbegin/jend conditionals are not visible here.
3231 template<
typename MT3
3234 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3235 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3239 const size_t M( A.rows() );
3240 const size_t N( B.columns() );
3242 for(
size_t i=0UL; i<M; ++i )
// Column range of row i: for an upper A it starts at i (i+1 if strictly upper),
// for a lower A it ends at i+1 (i if strictly lower).
3244 const size_t jbegin( ( IsUpper_v<MT4> )
3245 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
3247 const size_t jend( ( IsLower_v<MT4> )
3248 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos = jbegin + (column count rounded down to an even number).
3252 const size_t jnum( jend - jbegin );
3253 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3255 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3256 C(i,j ) -= A(i,j ) * B(j ,j );
3257 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL);
// Update for the single column at jpos left over by the two-column loop.
// NOTE(review): presumably guarded by a jpos < jend check that is not visible here.
3260 C(i,jpos) -= A(i,jpos) * B(jpos,jpos);
// Default subtraction-assignment kernel; this overload is enabled when MT4
// (type of A) is diagonal and MT5 (type of B) is not. Only the diagonal
// element A(i,i) is read from A, so C(i,j) -= A(i,i) * B(i,j).
// NOTE(review): the full template parameter list, the braces and the
// alternative branches of the jbegin/jend conditionals are not visible here.
3281 template<
typename MT3
3284 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3285 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
3289 const size_t M( A.rows() );
3290 const size_t N( B.columns() );
3292 for(
size_t i=0UL; i<M; ++i )
// Column range of row i: for an upper B it starts at i (i+1 if strictly upper),
// for a lower B it ends at i+1 (i if strictly lower).
3294 const size_t jbegin( ( IsUpper_v<MT5> )
3295 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
3297 const size_t jend( ( IsLower_v<MT5> )
3298 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos = jbegin + (column count rounded down to an even number).
3302 const size_t jnum( jend - jbegin );
3303 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
3305 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
3306 C(i,j ) -= A(i,i) * B(i,j );
3307 C(i,j+1UL) -= A(i,i) * B(i,j+1UL);
// Update for the single column at jpos left over by the two-column loop.
// NOTE(review): presumably guarded by a jpos < jend check that is not visible here.
3310 C(i,jpos) -= A(i,i) * B(i,jpos);
// Default subtraction-assignment kernel; this overload is enabled when both MT4
// (type of A) and MT5 (type of B) are diagonal. Only the diagonal of C is
// updated: C(i,i) -= A(i,i) * B(i,i) for every row index of A.
3331 template<
typename MT3
3334 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3335 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
3339 for(
size_t i=0UL; i<A.rows(); ++i ) {
3340 C(i,i) -= A(i,i) * B(i,i);
// Small-matrix subtraction-assignment kernel; this overload is selected when
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false and simply forwards to
// the default (non-vectorized) subtraction-assignment kernel.
3360 template<
typename MT3
3363 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3364 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
3366 selectDefaultSubAssignKernel( C, A, B );
// Vectorized small-matrix subtraction-assignment kernel (C -= A * B); this
// overload is enabled for a row-major target MT3 when
// UseVectorizedDefaultKernel_v holds. The visible code works on tiles of C:
// SIMD vectors are loaded from C and B, products with elements of A are
// subtracted from the accumulators, and the result is stored back to C.
// In every tile, kbegin/kend narrow the summation index k using the compile-time
// upper/lower flags of MT4 and MT5 (presumably to skip structural zeros).
// NOTE(review): the enclosing column loops, several accumulator declarations,
// some stores and all braces are not visible here.
3386 template<
typename MT3
3389 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
3390 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// A scalar remainder pass is needed unless both C and B are padded.
3392 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
3394 const size_t M( A.rows() );
3395 const size_t N( B.columns() );
3396 const size_t K( A.columns() );
// jpos: N with its low bits masked off by size_t(-SIMDSIZE) when a remainder pass
// exists (a round-down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of
// two); otherwise N.
3400 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Integral element types only: tile of one row by eight SIMD vectors
// (columns j through j+SIMDSIZE*7UL).
3405 if( IsIntegral_v<ElementType> )
3408 for(
size_t i=0UL; i<M; ++i )
3410 const size_t kbegin( ( IsUpper_v<MT4> )
3411 ?( ( IsLower_v<MT5> )
3412 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3413 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3414 :( IsLower_v<MT5> ? j : 0UL ) );
3415 const size_t kend( ( IsLower_v<MT4> )
3416 ?( ( IsUpper_v<MT5> )
3417 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
3418 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
3419 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
3430 for(
size_t k=kbegin; k<kend; ++k ) {
3432 xmm1 -= a1 * B.load(k,j );
3433 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3434 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3435 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3436 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
3437 xmm6 -= a1 * B.load(k,j+
SIMDSIZE*5UL);
3438 xmm7 -= a1 * B.load(k,j+
SIMDSIZE*6UL);
3439 xmm8 -= a1 * B.load(k,j+
SIMDSIZE*7UL);
3442 C.store( i, j , xmm1 );
3444 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3445 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3446 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
3447 C.store( i, j+
SIMDSIZE*5UL, xmm6 );
3448 C.store( i, j+
SIMDSIZE*6UL, xmm7 );
3449 C.store( i, j+
SIMDSIZE*7UL, xmm8 );
// Tile of two rows by five SIMD vectors: xmm1..xmm5 belong to row i,
// xmm6..xmm10 to row i+1 (see the stores below).
3458 for( ; (i+2UL) <= M; i+=2UL )
3460 const size_t kbegin( ( IsUpper_v<MT4> )
3461 ?( ( IsLower_v<MT5> )
3462 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3463 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3464 :( IsLower_v<MT5> ? j : 0UL ) );
3465 const size_t kend( ( IsLower_v<MT4> )
3466 ?( ( IsUpper_v<MT5> )
3467 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
3468 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3469 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
3476 SIMDType xmm6 ( C.load(i+1UL,j ) );
3482 for(
size_t k=kbegin; k<kend; ++k ) {
3502 C.store( i , j , xmm1 );
3504 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3505 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3506 C.store( i , j+
SIMDSIZE*4UL, xmm5 );
3507 C.store( i+1UL, j , xmm6 );
3508 C.store( i+1UL, j+
SIMDSIZE , xmm7 );
3509 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 );
3510 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 );
3511 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 );
// Single-row variant of the five-vector tile (kend has no row-dependent bound).
3516 const size_t kbegin( ( IsUpper_v<MT4> )
3517 ?( ( IsLower_v<MT5> )
3518 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3519 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3520 :( IsLower_v<MT5> ? j : 0UL ) );
3521 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
3529 for(
size_t k=kbegin; k<kend; ++k ) {
3531 xmm1 -= a1 * B.load(k,j );
3532 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3533 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3534 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3535 xmm5 -= a1 * B.load(k,j+
SIMDSIZE*4UL);
3538 C.store( i, j , xmm1 );
3540 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3541 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
3542 C.store( i, j+
SIMDSIZE*4UL, xmm5 );
// Tile of two rows by four SIMD vectors.
3550 for( ; (i+2UL) <= M; i+=2UL )
3552 const size_t kbegin( ( IsUpper_v<MT4> )
3553 ?( ( IsLower_v<MT5> )
3554 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3555 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3556 :( IsLower_v<MT5> ? j : 0UL ) );
3557 const size_t kend( ( IsLower_v<MT4> )
3558 ?( ( IsUpper_v<MT5> )
3559 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
3560 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3561 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
3572 for(
size_t k=kbegin; k<kend; ++k ) {
3589 C.store( i , j , xmm1 );
3591 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3592 C.store( i , j+
SIMDSIZE*3UL, xmm4 );
3593 C.store( i+1UL, j , xmm5 );
3594 C.store( i+1UL, j+
SIMDSIZE , xmm6 );
3595 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 );
3596 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 );
// Single-row variant of the four-vector tile.
3601 const size_t kbegin( ( IsUpper_v<MT4> )
3602 ?( ( IsLower_v<MT5> )
3603 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3604 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3605 :( IsLower_v<MT5> ? j : 0UL ) );
3606 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
3613 for(
size_t k=kbegin; k<kend; ++k ) {
3615 xmm1 -= a1 * B.load(k,j );
3616 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3617 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3618 xmm4 -= a1 * B.load(k,j+
SIMDSIZE*3UL);
3621 C.store( i, j , xmm1 );
3623 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
3624 C.store( i, j+
SIMDSIZE*3UL, xmm4 );
// Tile of two rows by three SIMD vectors.
3632 for( ; (i+2UL) <= M; i+=2UL )
3634 const size_t kbegin( ( IsUpper_v<MT4> )
3635 ?( ( IsLower_v<MT5> )
3636 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3637 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3638 :( IsLower_v<MT5> ? j : 0UL ) );
3639 const size_t kend( ( IsLower_v<MT4> )
3640 ?( ( IsUpper_v<MT5> )
3641 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
3642 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3643 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
3652 for(
size_t k=kbegin; k<kend; ++k ) {
3666 C.store( i , j , xmm1 );
3668 C.store( i , j+
SIMDSIZE*2UL, xmm3 );
3669 C.store( i+1UL, j , xmm4 );
3670 C.store( i+1UL, j+
SIMDSIZE , xmm5 );
3671 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 );
// Single-row variant of the three-vector tile.
3676 const size_t kbegin( ( IsUpper_v<MT4> )
3677 ?( ( IsLower_v<MT5> )
3678 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3679 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3680 :( IsLower_v<MT5> ? j : 0UL ) );
3681 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
3687 for(
size_t k=kbegin; k<kend; ++k ) {
3689 xmm1 -= a1 * B.load(k,j );
3690 xmm2 -= a1 * B.load(k,j+
SIMDSIZE );
3691 xmm3 -= a1 * B.load(k,j+
SIMDSIZE*2UL);
3694 C.store( i, j , xmm1 );
3696 C.store( i, j+
SIMDSIZE*2UL, xmm3 );
// Two-vector-wide column block: the row index starts at j when LOW is set,
// otherwise at 0.
3703 size_t i(
LOW ? j : 0UL );
// Tile of four rows by two SIMD vectors.
3705 for( ; (i+4UL) <= iend; i+=4UL )
3707 const size_t kbegin( ( IsUpper_v<MT4> )
3708 ?( ( IsLower_v<MT5> )
3709 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3710 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3711 :( IsLower_v<MT5> ? j : 0UL ) );
3712 const size_t kend( ( IsLower_v<MT4> )
3713 ?( ( IsUpper_v<MT5> )
3714 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
3715 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
3716 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3727 for(
size_t k=kbegin; k<kend; ++k ) {
3744 C.store( i , j , xmm1 );
3746 C.store( i+1UL, j , xmm3 );
3747 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3748 C.store( i+2UL, j , xmm5 );
3749 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
3750 C.store( i+3UL, j , xmm7 );
3751 C.store( i+3UL, j+
SIMDSIZE, xmm8 );
// Tile of three rows by two SIMD vectors.
3754 for( ; (i+3UL) <= iend; i+=3UL )
3756 const size_t kbegin( ( IsUpper_v<MT4> )
3757 ?( ( IsLower_v<MT5> )
3758 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3759 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3760 :( IsLower_v<MT5> ? j : 0UL ) );
3761 const size_t kend( ( IsLower_v<MT4> )
3762 ?( ( IsUpper_v<MT5> )
3763 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
3764 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
3765 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3774 for(
size_t k=kbegin; k<kend; ++k ) {
3788 C.store( i , j , xmm1 );
3790 C.store( i+1UL, j , xmm3 );
3791 C.store( i+1UL, j+
SIMDSIZE, xmm4 );
3792 C.store( i+2UL, j , xmm5 );
3793 C.store( i+2UL, j+
SIMDSIZE, xmm6 );
// Tile of two rows by two SIMD vectors; k advances two steps per iteration and
// the two sets of accumulators are summed when storing (xmm1+xmm5, ...).
3796 for( ; (i+2UL) <= iend; i+=2UL )
3798 const size_t kbegin( ( IsUpper_v<MT4> )
3799 ?( ( IsLower_v<MT5> )
3800 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3801 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3802 :( IsLower_v<MT5> ? j : 0UL ) );
3803 const size_t kend( ( IsLower_v<MT4> )
3804 ?( ( IsUpper_v<MT5> )
3805 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
3806 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
3807 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
3816 for( ; (k+2UL) <= kend; k+=2UL ) {
3821 const SIMDType b1( B.load(k ,j ) );
3823 const SIMDType b3( B.load(k+1UL,j ) );
3835 for( ; k<kend; ++k ) {
3846 C.store( i , j , xmm1+xmm5 );
3847 C.store( i , j+
SIMDSIZE, xmm2+xmm6 );
3848 C.store( i+1UL, j , xmm3+xmm7 );
3849 C.store( i+1UL, j+
SIMDSIZE, xmm4+xmm8 );
// Single-row variant of the two-vector tile, k advanced two steps per iteration.
3854 const size_t kbegin( ( IsUpper_v<MT4> )
3855 ?( ( IsLower_v<MT5> )
3856 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3857 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3858 :( IsLower_v<MT5> ? j : 0UL ) );
3859 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
3866 for( ; (k+2UL) <= kend; k+=2UL ) {
3869 xmm1 -= a1 * B.load(k ,j );
3870 xmm2 -= a1 * B.load(k ,j+
SIMDSIZE);
3871 xmm3 -= a2 * B.load(k+1UL,j );
3872 xmm4 -= a2 * B.load(k+1UL,j+
SIMDSIZE);
3875 for( ; k<kend; ++k ) {
3877 xmm1 -= a1 * B.load(k,j );
3881 C.store( i, j , xmm1+xmm3 );
3882 C.store( i, j+
SIMDSIZE, xmm2+xmm4 );
// Single-vector-wide column block: the row index starts at j when LOW is set,
// otherwise at 0.
3889 size_t i(
LOW ? j : 0UL );
// Tile of four rows by one SIMD vector, k advanced two steps per iteration.
3891 for( ; (i+4UL) <= iend; i+=4UL )
3893 const size_t kbegin( ( IsUpper_v<MT4> )
3894 ?( ( IsLower_v<MT5> )
3895 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3896 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3897 :( IsLower_v<MT5> ? j : 0UL ) );
3898 const size_t kend( ( IsLower_v<MT4> )
3899 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
3909 for( ; (k+2UL) <= kend; k+=2UL ) {
3911 const SIMDType b2( B.load(k+1UL,j) );
3912 xmm1 -=
set( A(i ,k ) ) * b1;
3913 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3914 xmm3 -=
set( A(i+2UL,k ) ) * b1;
3915 xmm4 -=
set( A(i+3UL,k ) ) * b1;
3916 xmm5 -=
set( A(i ,k+1UL) ) * b2;
3917 xmm6 -=
set( A(i+1UL,k+1UL) ) * b2;
3918 xmm7 -=
set( A(i+2UL,k+1UL) ) * b2;
3919 xmm8 -=
set( A(i+3UL,k+1UL) ) * b2;
3922 for( ; k<kend; ++k ) {
3924 xmm1 -=
set( A(i ,k) ) * b1;
3925 xmm2 -=
set( A(i+1UL,k) ) * b1;
3926 xmm3 -=
set( A(i+2UL,k) ) * b1;
3927 xmm4 -=
set( A(i+3UL,k) ) * b1;
3930 C.store( i , j, xmm1+xmm5 );
3931 C.store( i+1UL, j, xmm2+xmm6 );
3932 C.store( i+2UL, j, xmm3+xmm7 );
3933 C.store( i+3UL, j, xmm4+xmm8 );
// Tile of three rows by one SIMD vector.
3936 for( ; (i+3UL) <= iend; i+=3UL )
3938 const size_t kbegin( ( IsUpper_v<MT4> )
3939 ?( ( IsLower_v<MT5> )
3940 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3941 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3942 :( IsLower_v<MT5> ? j : 0UL ) );
3943 const size_t kend( ( IsLower_v<MT4> )
3944 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
3953 for( ; (k+2UL) <= kend; k+=2UL ) {
3955 const SIMDType b2( B.load(k+1UL,j) );
3956 xmm1 -=
set( A(i ,k ) ) * b1;
3957 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3958 xmm3 -=
set( A(i+2UL,k ) ) * b1;
3959 xmm4 -=
set( A(i ,k+1UL) ) * b2;
3960 xmm5 -=
set( A(i+1UL,k+1UL) ) * b2;
3961 xmm6 -=
set( A(i+2UL,k+1UL) ) * b2;
3964 for( ; k<kend; ++k ) {
3966 xmm1 -=
set( A(i ,k) ) * b1;
3967 xmm2 -=
set( A(i+1UL,k) ) * b1;
3968 xmm3 -=
set( A(i+2UL,k) ) * b1;
3971 C.store( i , j, xmm1+xmm4 );
3972 C.store( i+1UL, j, xmm2+xmm5 );
3973 C.store( i+2UL, j, xmm3+xmm6 );
// Tile of two rows by one SIMD vector.
3976 for( ; (i+2UL) <= iend; i+=2UL )
3978 const size_t kbegin( ( IsUpper_v<MT4> )
3979 ?( ( IsLower_v<MT5> )
3980 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
3981 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
3982 :( IsLower_v<MT5> ? j : 0UL ) );
3983 const size_t kend( ( IsLower_v<MT4> )
3984 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
3992 for( ; (k+2UL) <= kend; k+=2UL ) {
3994 const SIMDType b2( B.load(k+1UL,j) );
3995 xmm1 -=
set( A(i ,k ) ) * b1;
3996 xmm2 -=
set( A(i+1UL,k ) ) * b1;
3997 xmm3 -=
set( A(i ,k+1UL) ) * b2;
3998 xmm4 -=
set( A(i+1UL,k+1UL) ) * b2;
4001 for( ; k<kend; ++k ) {
4003 xmm1 -=
set( A(i ,k) ) * b1;
4004 xmm2 -=
set( A(i+1UL,k) ) * b1;
4007 C.store( i , j, xmm1+xmm3 );
4008 C.store( i+1UL, j, xmm2+xmm4 );
// Single row, single SIMD vector; the k loop here runs up to K.
4013 const size_t kbegin( ( IsUpper_v<MT4> )
4014 ?( ( IsLower_v<MT5> )
4015 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4016 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4017 :( IsLower_v<MT5> ? j : 0UL ) );
4023 for( ; (k+2UL) <= K; k+=2UL ) {
4024 xmm1 -=
set( A(i,k ) ) * B.load(k ,j);
4025 xmm2 -=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
4029 xmm1 -=
set( A(i,k) ) * B.load(k,j);
4032 C.store( i, j, xmm1+xmm2 );
// Scalar pass over the remaining columns (only when remainder is true): rows are
// limited to [j, ...) when LOW is set and to [..., j+1) when UPP is set.
4036 for( ; remainder && j<N; ++j )
4038 const size_t iend(
UPP ? j+1UL : M );
4039 size_t i(
LOW ? j : 0UL );
// Two rows per iteration with scalar accumulators value1/value2.
4041 for( ; (i+2UL) <= iend; i+=2UL )
4043 const size_t kbegin( ( IsUpper_v<MT4> )
4044 ?( ( IsLower_v<MT5> )
4045 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4046 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4047 :( IsLower_v<MT5> ? j : 0UL ) );
4048 const size_t kend( ( IsLower_v<MT4> )
4049 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
4055 for(
size_t k=kbegin; k<kend; ++k ) {
4056 value1 -= A(i ,k) * B(k,j);
4057 value2 -= A(i+1UL,k) * B(k,j);
4061 C(i+1UL,j) = value2;
// Single remaining row of the scalar pass.
4066 const size_t kbegin( ( IsUpper_v<MT4> )
4067 ?( ( IsLower_v<MT5> )
4068 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
4069 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
4070 :( IsLower_v<MT5> ? j : 0UL ) );
4074 for(
size_t k=kbegin; k<K; ++k ) {
4075 value -= A(i,k) * B(k,j);
// Small-matrix subtraction-assignment kernel; this overload is enabled for a
// column-major target MT3 when UseVectorizedDefaultKernel_v holds. It does not
// compute the product itself: one operand is copied into a temporary of the
// opposite storage order (OppositeType_t) and subAssign is re-issued on the
// resulting product, passed through the ForwardFunctor fwd.
// Which operand is converted:
//  - only MT5 resizable       -> convert A
//  - only MT4 resizable       -> convert B
//  - otherwise                -> convert A if it has no more elements than B;
//                                the last visible statements convert B
//                                (presumably the else branch, not visible here).
4100 template<
typename MT3
4103 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4104 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4111 const ForwardFunctor fwd;
4113 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
4114 const OppositeType_t<MT4> tmp(
serial( A ) );
4115 subAssign( C, fwd( tmp * B ) );
4117 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
4118 const OppositeType_t<MT5> tmp(
serial( B ) );
4119 subAssign( C, fwd( A * tmp ) );
4121 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
4122 const OppositeType_t<MT4> tmp(
serial( A ) );
4123 subAssign( C, fwd( tmp * B ) );
4126 const OppositeType_t<MT5> tmp(
serial( B ) );
4127 subAssign( C, fwd( A * tmp ) );
// Large-matrix subtraction-assignment kernel, non-vectorized overload.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5> is false; it simply
// forwards C, A and B to selectDefaultSubAssignKernel().
4147 template<
typename MT3
4150 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4151 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
4153 selectDefaultSubAssignKernel( C, A, B );
4173 template<
typename MT3
4176 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4177 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5> >
// BLAS subtraction-assignment kernel, fallback overload.
// Enabled when UseBlasKernel_v<MT3,MT4,MT5> is false, i.e. the operand types
// do not qualify for a BLAS call; forwards to selectLargeSubAssignKernel().
4203 template<
typename MT3
4206 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4207 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4209 selectLargeSubAssignKernel( C, A, B );
// BLAS subtraction-assignment kernel, BLAS-backed overload.
// Only compiled when both BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION
// are set, and only enabled when UseBlasKernel_v<MT3,MT4,MT5> holds.
//   C  target matrix the product is subtracted from
//   A  left-hand side operand
//   B  right-hand side operand
4215 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 4229 template<
typename MT3
4232 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B )
4233 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5> >
4235 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, let trmm() combine it with A from the
// left (lower or upper according to IsLower_v<MT4>), then subtract the temporary from C.
4237 if( IsTriangular_v<MT4> ) {
4238 ResultType_t<MT3> tmp(
serial( B ) );
4239 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ), ET(1) );
4240 subAssign( C, tmp );
// Triangular B: same scheme with A copied and B applied from the right.
4242 else if( IsTriangular_v<MT5> ) {
4243 ResultType_t<MT3> tmp(
serial( A ) );
4244 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ), ET(1) );
4245 subAssign( C, tmp );
// General case: one gemm() call directly on C.
// NOTE(review): ET(-1)/ET(1) are presumably the alpha/beta factors (C = -A*B + C) - confirm against the gemm wrapper.
4248 gemm( C, A, B, ET(-1), ET(1) );
// Subtraction assignment of the product to a column-major matrix, restructured
// to exploit operand symmetry. Enabled when CanExploitSymmetry_v<MT,MT1,MT2> holds.
// The product is rebuilt with the symmetric operand(s) wrapped in trans() and
// passed back to subAssign() on the unwrapped target.
//   lhs  target column-major matrix
//   rhs  the multiplication expression to subtract
4270 template<
typename MT >
4271 friend inline auto subAssign( Matrix<MT,true>& lhs,
const DMatDMatMultExpr& rhs )
4272 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4281 const ForwardFunctor fwd;
// Both operands symmetric: transpose both.
4283 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4284 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) *
trans( rhs.rhs_ ) ) );
// Only the left operand symmetric: transpose the left operand.
4285 else if( IsSymmetric_v<MT1> )
4286 subAssign( ~lhs, fwd(
trans( rhs.lhs_ ) * rhs.rhs_ ) );
// Remaining case: transpose the right operand.
4288 subAssign( ~lhs, fwd( rhs.lhs_ *
trans( rhs.rhs_ ) ) );
4310 template<
typename MT
4312 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatDMatMultExpr& rhs )
4324 schurAssign( ~lhs, tmp );
4356 template<
typename MT
4359 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4366 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
4369 else if( rhs.lhs_.columns() == 0UL ) {
4404 template<
typename MT
4407 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4411 using TmpType = If_t< SO, OppositeType, ResultType >;
4423 const ForwardFunctor fwd;
4425 const TmpType tmp( rhs );
4446 template<
typename MT >
4448 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4457 const ForwardFunctor fwd;
4459 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4461 else if( IsSymmetric_v<MT1> )
4485 template<
typename MT
4488 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4495 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4529 template<
typename MT >
4531 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4540 const ForwardFunctor fwd;
4542 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4544 else if( IsSymmetric_v<MT1> )
4572 template<
typename MT
4575 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
4582 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || rhs.lhs_.columns() == 0UL ) {
4616 template<
typename MT >
4618 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
4627 const ForwardFunctor fwd;
4629 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
4631 else if( IsSymmetric_v<MT1> )
4656 template<
typename MT
// Partial specialization of DMatScalarMultExpr for the case that the scaled
// matrix is itself a DMatDMatMultExpr (third template argument false).
// Derives publicly from MatScalarMultExpr< DenseMatrix<...> > and privately from Computation.
4716 template<
typename MT1
4723 class DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >
4724 :
public MatScalarMultExpr< DenseMatrix< DMatScalarMultExpr< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, ST, false >, false > >
4725 ,
private Computation
// Shorthands: MMM is the wrapped multiplication expression; RES/RT1/RT2 are the
// result types of the product and of its two operands; ET1/ET2 their element
// types; CT1/CT2 the operands' composite types.
4730 using MMM = DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4732 using RES = ResultType_t<MMM>;
4733 using RT1 = ResultType_t<MT1>;
4734 using RT2 = ResultType_t<MT2>;
4735 using ET1 = ElementType_t<RT1>;
4736 using ET2 = ElementType_t<RT2>;
4737 using CT1 = CompositeType_t<MT1>;
4738 using CT2 = CompositeType_t<MT2>;
// An operand has to be evaluated up front if it is a computation or reports
// that it requires evaluation.
4743 static constexpr
bool evaluateLeft = ( IsComputation_v<MT1> || RequiresEvaluation_v<MT1> );
4748 static constexpr
bool evaluateRight = ( IsComputation_v<MT2> || RequiresEvaluation_v<MT2> );
// Normalized structure flags derived from the SF/HF/LF/UF template flags:
//   SYM  - SF set and none of HF/LF/UF
//   HERM - HF set and neither LF nor UF
//   LOW  - LF set, or SF/HF combined with UF
//   UPP  - UF set, or SF/HF combined with LF
4752 static constexpr
bool SYM = ( SF && !( HF || LF || UF ) );
4753 static constexpr
bool HERM = ( HF && !( LF || UF ) );
4754 static constexpr
bool LOW = ( LF || ( ( SF || HF ) && UF ) );
4755 static constexpr
bool UPP = ( UF || ( ( SF || HF ) && LF ) );
// True when the target type T1 is column-major and at least one of the operand
// types T2, T3 is symmetric.
4764 template<
typename T1,
typename T2,
typename T3 >
4765 static constexpr
bool CanExploitSymmetry_v =
4766 ( IsColumnMajorMatrix_v<T1> && ( IsSymmetric_v<T2> || IsSymmetric_v<T3> ) );
// True when at least one operand must be evaluated (evaluateLeft / evaluateRight)
// and the symmetry shortcut described by CanExploitSymmetry_v<T1,T2,T3> does not apply.
4774 template<
typename T1,
typename T2,
typename T3 >
4775 static constexpr
bool IsEvaluationRequired_v =
4776 ( ( evaluateLeft || evaluateRight ) && !CanExploitSymmetry_v<T1,T2,T3> );
// Compile-time switch for the BLAS kernels (T1 = target, T2/T3 = operands, T4 = scalar).
// The visible conditions require:
//   - no SYM/HERM/LOW/UPP declaration on the product,
//   - contiguous storage with mutable (T1) resp. const (T2, T3) data access,
//   - neither operand diagonal,
//   - simdEnabled set on all three matrix types,
//   - BLAS-compatible element types that are identical across T1, T2 and T3,
//   - not the combination of a builtin target element type with a complex scalar.
4783 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4784 static constexpr
bool UseBlasKernel_v =
4786 !SYM && !HERM && !LOW && !UPP &&
4787 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
4788 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
4789 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
4790 !IsDiagonal_v<T2> && !IsDiagonal_v<T3> &&
4791 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4792 IsBLASCompatible_v< ElementType_t<T1> > &&
4793 IsBLASCompatible_v< ElementType_t<T2> > &&
4794 IsBLASCompatible_v< ElementType_t<T3> > &&
4795 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
4796 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
4797 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch for the vectorized default kernels (T1 = target,
// T2/T3 = operands, T4 = scalar). The visible conditions require
// useOptimizedKernels, a non-diagonal right operand, simdEnabled on all three
// matrix types, SIMD-combinable element types, and SIMD addition and
// multiplication support for the two operand element types.
4804 template<
typename T1,
typename T2,
typename T3,
typename T4 >
4805 static constexpr
bool UseVectorizedDefaultKernel_v =
4806 ( useOptimizedKernels &&
4807 !IsDiagonal_v<T3> &&
4808 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
4809 IsSIMDCombinable_v< ElementType_t<T1>
4813 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
4814 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
4821 using ForwardFunctor =
If_t< HERM
4837 using This = DMatScalarMultExpr<MMM,ST,false>;
4840 using BaseType = DenseMatrix<This,false>;
4844 , DeclHermTrait< MultTrait_t<RES,ST> >
4846 , DeclSymTrait< MultTrait_t<RES,ST> >
4849 , DeclDiagTrait< MultTrait_t<RES,ST> >
4850 , DeclLowTrait< MultTrait_t<RES,ST> > >
4852 , DeclUppTrait< MultTrait_t<RES,ST> >
4853 , MultTrait<RES,ST> > > > >::Type;
4858 using SIMDType = SIMDTrait_t<ElementType>;
4863 using LeftOperand =
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>;
4869 using LT = If_t< evaluateLeft, const RT1, CT1 >;
4872 using RT = If_t< evaluateRight, const RT2, CT2 >;
4878 ( !IsDiagonal_v<MT2> &&
4879 MT1::simdEnabled && MT2::simdEnabled &&
4880 IsSIMDCombinable_v<ET1,ET2,ST> &&
4881 HasSIMDAdd_v<ET1,ET2> &&
4882 HasSIMDMult_v<ET1,ET2> );
4886 ( !evaluateLeft && MT1::smpAssignable && !evaluateRight && MT2::smpAssignable );
4932 if( j >=
matrix_.columns() ) {
4935 return (*
this)(i,j);
4944 inline size_t rows()
const {
4954 inline size_t columns()
const {
// Reports whether the expression can alias with the given address.
// The query is forwarded unchanged to the wrapped matrix expression matrix_.
//   alias  address to check
4985 template<
typename T >
4986 inline bool canAlias(
const T* alias )
const {
4987 return matrix_.canAlias( alias );
// Reports whether the expression is aliased with the given address.
// The query is forwarded unchanged to the wrapped matrix expression matrix_.
//   alias  address to check
4997 template<
typename T >
4998 inline bool isAliased(
const T* alias )
const {
4999 return matrix_.isAliased( alias );
5020 !BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION ||
5022 (
rows() *
columns() < DMATDMATMULT_THRESHOLD ) ) &&
5023 (
rows() *
columns() >= SMP_DMATDMATMULT_THRESHOLD );
5045 template<
typename MT
5048 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
5055 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
5056 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
5058 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
5061 else if( left.columns() == 0UL ) {
5076 DMatScalarMultExpr::selectAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Entry point that picks the kernel for the scaled assignment of A*B to C.
// The small kernel is used when any of the following holds:
//   - B is a diagonal matrix,
//   - BLAZE_DEBUG_MODE is off and B has at most SIMDSIZE*10 columns,
//   - C has fewer than DMATDMATMULT_THRESHOLD elements.
// The other visible path goes to selectBlasAssignKernel().
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
5091 template<
typename MT3
5095 static inline void selectAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5097 if( ( IsDiagonal_v<MT5> ) ||
5098 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
5099 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
5100 selectSmallAssignKernel( C, A, B, scalar );
5102 selectBlasAssignKernel( C, A, B, scalar );
// Default (scalar, non-SIMD) kernel for the scaled assignment of A*B to C when
// neither operand is diagonal. Works row by row and narrows the k- and j-ranges
// according to the triangular structure of A (MT4) and B (MT5) and the
// SYM/HERM/LOW/UPP flags of the product.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
5120 template<
typename MT3
5124 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5125 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
// M x K times K x N.
5127 const size_t M( A.rows() );
5128 const size_t N( B.columns() );
5129 const size_t K( A.columns() );
5133 for(
size_t i=0UL; i<M; ++i )
// [kbegin,kend) is the part of row i of A that the triangular structure of A
// allows to be non-zero.
5135 const size_t kbegin( ( IsUpper_v<MT4> )
5136 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5138 const size_t kend( ( IsLower_v<MT4> )
5139 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Strictly triangular A with an empty k-range: the whole row of C is visited.
// NOTE(review): the loop body is not visible here; presumably it resets C(i,j).
5143 if( IsStrictlyTriangular_v<MT4> && kbegin == kend ) {
5144 for(
size_t j=0UL; j<N; ++j ) {
// First contribution (k == kbegin): the row is initialized by assignment, so
// the j-range is computed for k = kbegin.
5151 const size_t jbegin( ( IsUpper_v<MT5> )
5152 ?( ( IsStrictlyUpper_v<MT5> )
5153 ?( UPP ?
max(i,kbegin+1UL) : kbegin+1UL )
5154 :( UPP ?
max(i,kbegin) : kbegin ) )
5155 :( UPP ? i : 0UL ) );
5156 const size_t jend( ( IsLower_v<MT5> )
5157 ?( ( IsStrictlyLower_v<MT5> )
5158 ?( LOW ?
min(i+1UL,kbegin) : kbegin )
5159 :( LOW ?
min(i,kbegin)+1UL : kbegin+1UL ) )
5160 :( LOW ? i+1UL : N ) );
// Columns in front of jbegin.
5162 if( ( IsUpper_v<MT4> && IsUpper_v<MT5> ) || UPP ) {
5163 for(
size_t j=0UL; j<jbegin; ++j ) {
5167 else if( IsStrictlyUpper_v<MT5> ) {
5170 for(
size_t j=jbegin; j<jend; ++j ) {
5171 C(i,j) = A(i,kbegin) * B(kbegin,j);
// Columns from jend onwards.
5173 if( ( IsLower_v<MT4> && IsLower_v<MT5> ) || LOW ) {
5174 for(
size_t j=jend; j<N; ++j ) {
// For a strictly lower B the last column is reset.
5178 else if( IsStrictlyLower_v<MT5> ) {
5179 reset( C(i,N-1UL) );
// Remaining contributions (k > kbegin) are accumulated onto the initialized row.
5183 for(
size_t k=kbegin+1UL; k<kend; ++k )
5185 const size_t jbegin( ( IsUpper_v<MT5> )
5186 ?( ( IsStrictlyUpper_v<MT5> )
5187 ?( SYM || HERM || UPP ?
max( i, k+1UL ) : k+1UL )
5188 :( SYM || HERM || UPP ?
max( i, k ) : k ) )
5189 :( SYM || HERM || UPP ? i : 0UL ) );
5190 const size_t jend( ( IsLower_v<MT5> )
5191 ?( ( IsStrictlyLower_v<MT5> )
5192 ?( LOW ?
min(i+1UL,k-1UL) : k-1UL )
5193 :( LOW ?
min(i+1UL,k) : k ) )
5194 :( LOW ? i+1UL : N ) );
// With a structured product the clamped range can be inverted; skip this k then.
5196 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5199 for(
size_t j=jbegin; j<jend; ++j ) {
5200 C(i,j) += A(i,k) * B(k,j);
// For a lower B, column jend is written by assignment rather than accumulation.
5202 if( IsLower_v<MT5> ) {
5203 C(i,jend) = A(i,k) * B(k,jend);
// Final pass over the part of row i that can be non-zero.
// NOTE(review): the loop body is not visible here; presumably it applies the scalar factor.
5208 const size_t jbegin( ( IsUpper_v<MT4> && IsUpper_v<MT5> )
5209 ?( IsStrictlyUpper_v<MT4> || IsStrictlyUpper_v<MT5> ? i+1UL : i )
5210 :( SYM || HERM || UPP ? i : 0UL ) );
5211 const size_t jend( ( IsLower_v<MT4> && IsLower_v<MT5> )
5212 ?( IsStrictlyLower_v<MT4> || IsStrictlyLower_v<MT5> ? i : i+1UL )
5213 :( LOW ? i+1UL : N ) );
5215 if( ( SYM || HERM || LOW || UPP ) && ( jbegin > jend ) )
continue;
5218 for(
size_t j=jbegin; j<jend; ++j ) {
// Mirror step: every element below the diagonal is copied from its transposed
// position, conjugated when HERM is set.
5225 for(
size_t i=1UL; i<M; ++i ) {
5226 for(
size_t j=0UL; j<i; ++j ) {
5227 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
// Default kernel for the scaled assignment of A*B to C when A is not diagonal
// and B is diagonal. Each element in the non-zero range of row i is computed
// from a single term: A(i,j) * B(j,j) * scalar.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
5248 template<
typename MT3
5252 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5253 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5257 const size_t M( A.rows() );
5258 const size_t N( B.columns() );
5260 for(
size_t i=0UL; i<M; ++i )
// [jbegin,jend) follows the triangular structure of A.
5262 const size_t jbegin( ( IsUpper_v<MT4> )
5263 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
5265 const size_t jend( ( IsLower_v<MT4> )
5266 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// Columns in front of the range (upper A).
// NOTE(review): loop body not visible here; presumably resets C(i,j).
5270 if( IsUpper_v<MT4> ) {
5271 for(
size_t j=0UL; j<jbegin; ++j ) {
5275 for(
size_t j=jbegin; j<jend; ++j ) {
5276 C(i,j) = A(i,j) * B(j,j) * scalar;
// Columns behind the range (lower A).
5278 if( IsLower_v<MT4> ) {
5279 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for the scaled assignment of A*B to C when A is diagonal and
// B is not. Each element in the non-zero range of row i is computed from a
// single term: A(i,i) * B(i,j) * scalar.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
5301 template<
typename MT3
5305 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5306 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
5310 const size_t M( A.rows() );
5311 const size_t N( B.columns() );
5313 for(
size_t i=0UL; i<M; ++i )
// [jbegin,jend) follows the triangular structure of B.
5315 const size_t jbegin( ( IsUpper_v<MT5> )
5316 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
5318 const size_t jend( ( IsLower_v<MT5> )
5319 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// Columns in front of the range (upper B).
// NOTE(review): loop body not visible here; presumably resets C(i,j).
5323 if( IsUpper_v<MT5> ) {
5324 for(
size_t j=0UL; j<jbegin; ++j ) {
5328 for(
size_t j=jbegin; j<jend; ++j ) {
5329 C(i,j) = A(i,i) * B(i,j) * scalar;
// Columns behind the range (lower B).
5331 if( IsLower_v<MT5> ) {
5332 for(
size_t j=jend; j<N; ++j ) {
// Default kernel for the scaled assignment of A*B to C when both operands are
// diagonal: the visible loop writes only the diagonal, C(i,i) = A(i,i) * B(i,i) * scalar.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
5354 template<
typename MT3
5358 static inline auto selectDefaultAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5359 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
5365 for(
size_t i=0UL; i<A.rows(); ++i ) {
5366 C(i,i) = A(i,i) * B(i,i) * scalar;
// Small-matrix kernel for the scaled assignment, non-vectorized overload.
// Enabled when UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> is false; forwards
// all arguments to selectDefaultAssignKernel().
5385 template<
typename MT3
5389 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5390 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
5392 selectDefaultAssignKernel( C, A, B, scalar );
// Small-matrix kernel for the scaled assignment, vectorized overload for
// row-major targets. Enabled when MT3 is row-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds.
//
// C is processed in column panels of decreasing width (8, 5, 4, 3, 2 and 1 SIMD
// vectors). Within a panel, rows are processed in small groups; for each group
// the k-range [kbegin,kend) is narrowed according to the triangular structure
// of A (MT4) and B (MT5). Products are accumulated in SIMD registers and
// multiplied by the scaling factor when stored. Columns that do not fill a
// whole SIMD vector are handled by a scalar loop at the end.
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
5411 template<
typename MT3
5415 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
5416 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder loop is needed unless both C and B are padded.
5418 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
5420 const size_t M( A.rows() );
5421 const size_t N( B.columns() );
5422 const size_t K( A.columns() );
// End of the SIMD-processed columns: N with the low bits masked off via
// size_t(-SIMDSIZE) when a remainder loop exists (a round-down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two), otherwise N.
5426 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// SIMD value built from the scalar; every accumulator is multiplied by it on store.
5429 const SIMDType factor(
set( scalar ) );
5433 if( IsIntegral_v<ElementType> )
// Panel of 8 SIMD vectors, one row at a time. Only used when the product
// carries no SYM/HERM/LOW/UPP declaration.
5435 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*7UL) < jpos; j+=
SIMDSIZE*8UL ) {
5436 for(
size_t i=0UL; i<M; ++i )
5438 const size_t kbegin( ( IsUpper_v<MT4> )
5439 ?( ( IsLower_v<MT5> )
5440 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5441 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5442 :( IsLower_v<MT5> ? j : 0UL ) );
5443 const size_t kend( ( IsLower_v<MT4> )
5444 ?( ( IsUpper_v<MT5> )
5445 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
5446 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
5447 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
5449 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5451 for(
size_t k=kbegin; k<kend; ++k ) {
5452 const SIMDType a1(
set( A(i,k) ) );
5453 xmm1 += a1 * B.load(k,j );
5454 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5455 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5456 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5457 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
5458 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
5459 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
5460 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
5463 C.store( i, j , xmm1 * factor );
5464 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5465 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5466 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5467 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
5468 C.store( i, j+
SIMDSIZE*5UL, xmm6 * factor );
5469 C.store( i, j+
SIMDSIZE*6UL, xmm7 * factor );
5470 C.store( i, j+
SIMDSIZE*7UL, xmm8 * factor );
// Panel of 5 SIMD vectors, again only for products without a structure
// declaration. Rows are taken in pairs, followed by a single-row variant.
5475 for( ; !SYM && !HERM && !LOW && !UPP && (j+
SIMDSIZE*4UL) < jpos; j+=
SIMDSIZE*5UL )
5479 for( ; (i+2UL) <= M; i+=2UL )
5481 const size_t kbegin( ( IsUpper_v<MT4> )
5482 ?( ( IsLower_v<MT5> )
5483 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5484 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5485 :( IsLower_v<MT5> ? j : 0UL ) );
5486 const size_t kend( ( IsLower_v<MT4> )
5487 ?( ( IsUpper_v<MT5> )
5488 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
5489 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5490 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
5492 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
5494 for(
size_t k=kbegin; k<kend; ++k ) {
5495 const SIMDType a1(
set( A(i ,k) ) );
5496 const SIMDType a2(
set( A(i+1UL,k) ) );
5497 const SIMDType b1( B.load(k,j ) );
5498 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5499 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5500 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
5501 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
// xmm1..xmm5 hold row i, xmm6..xmm10 hold row i+1.
5514 C.store( i , j , xmm1 * factor );
5515 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5516 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5517 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
5518 C.store( i , j+
SIMDSIZE*4UL, xmm5 * factor );
5519 C.store( i+1UL, j , xmm6 * factor );
5520 C.store( i+1UL, j+
SIMDSIZE , xmm7 * factor );
5521 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm8 * factor );
5522 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm9 * factor );
5523 C.store( i+1UL, j+
SIMDSIZE*4UL, xmm10 * factor );
// Single-row variant of the 5-vector panel.
5528 const size_t kbegin( ( IsUpper_v<MT4> )
5529 ?( ( IsLower_v<MT5> )
5530 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5531 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5532 :( IsLower_v<MT5> ? j : 0UL ) );
5533 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
5535 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
5537 for(
size_t k=kbegin; k<kend; ++k ) {
5538 const SIMDType a1(
set( A(i,k) ) );
5539 xmm1 += a1 * B.load(k,j );
5540 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5541 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5542 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5543 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
5546 C.store( i, j , xmm1 * factor );
5547 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5548 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5549 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5550 C.store( i, j+
SIMDSIZE*4UL, xmm5 * factor );
// Panel of 4 SIMD vectors. For an UPP product only the rows up to the end of
// the panel (clamped to M) are computed.
5556 const size_t iend( UPP ?
min(j+
SIMDSIZE*4UL,M) : M );
// Elements taken from the transposed position, conjugated when HERM is set.
// NOTE(review): the enclosing condition is not visible; presumably SYM || HERM.
5562 for(
size_t jj=j; jj<jjend; ++jj ) {
5563 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
5570 for(
size_t jj=j; jj<jjend; ++jj ) {
5576 for( ; (i+2UL) <= iend; i+=2UL )
5578 const size_t kbegin( ( IsUpper_v<MT4> )
5579 ?( ( IsLower_v<MT5> )
5580 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5581 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5582 :( IsLower_v<MT5> ? j : 0UL ) );
5583 const size_t kend( ( IsLower_v<MT4> )
5584 ?( ( IsUpper_v<MT5> )
5585 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
5586 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5587 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
5589 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5591 for(
size_t k=kbegin; k<kend; ++k ) {
5592 const SIMDType a1(
set( A(i ,k) ) );
5593 const SIMDType a2(
set( A(i+1UL,k) ) );
5594 const SIMDType b1( B.load(k,j ) );
5595 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5596 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5597 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
5608 C.store( i , j , xmm1 * factor );
5609 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5610 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5611 C.store( i , j+
SIMDSIZE*3UL, xmm4 * factor );
5612 C.store( i+1UL, j , xmm5 * factor );
5613 C.store( i+1UL, j+
SIMDSIZE , xmm6 * factor );
5614 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm7 * factor );
5615 C.store( i+1UL, j+
SIMDSIZE*3UL, xmm8 * factor );
// Single-row variant of the 4-vector panel.
5620 const size_t kbegin( ( IsUpper_v<MT4> )
5621 ?( ( IsLower_v<MT5> )
5622 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5623 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5624 :( IsLower_v<MT5> ? j : 0UL ) );
5625 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
5627 SIMDType xmm1, xmm2, xmm3, xmm4;
5629 for(
size_t k=kbegin; k<kend; ++k ) {
5630 const SIMDType a1(
set( A(i,k) ) );
5631 xmm1 += a1 * B.load(k,j );
5632 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5633 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5634 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
5637 C.store( i, j , xmm1 * factor );
5638 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5639 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5640 C.store( i, j+
SIMDSIZE*3UL, xmm4 * factor );
5648 for(
size_t jj=j; jj<jjend; ++jj ) {
// Panel of 3 SIMD vectors; same scheme as the 4-vector panel.
5657 const size_t iend( UPP ?
min(j+
SIMDSIZE*3UL,M) : M );
5663 for(
size_t jj=j; jj<jjend; ++jj ) {
5664 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
5671 for(
size_t jj=j; jj<jjend; ++jj ) {
5677 for( ; (i+2UL) <= iend; i+=2UL )
5679 const size_t kbegin( ( IsUpper_v<MT4> )
5680 ?( ( IsLower_v<MT5> )
5681 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5682 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5683 :( IsLower_v<MT5> ? j : 0UL ) );
5684 const size_t kend( ( IsLower_v<MT4> )
5685 ?( ( IsUpper_v<MT5> )
5686 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
5687 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5688 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
5690 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5692 for(
size_t k=kbegin; k<kend; ++k ) {
5693 const SIMDType a1(
set( A(i ,k) ) );
5694 const SIMDType a2(
set( A(i+1UL,k) ) );
5695 const SIMDType b1( B.load(k,j ) );
5696 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
5697 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
5706 C.store( i , j , xmm1 * factor );
5707 C.store( i , j+
SIMDSIZE , xmm2 * factor );
5708 C.store( i , j+
SIMDSIZE*2UL, xmm3 * factor );
5709 C.store( i+1UL, j , xmm4 * factor );
5710 C.store( i+1UL, j+
SIMDSIZE , xmm5 * factor );
5711 C.store( i+1UL, j+
SIMDSIZE*2UL, xmm6 * factor );
// Single-row variant of the 3-vector panel.
5716 const size_t kbegin( ( IsUpper_v<MT4> )
5717 ?( ( IsLower_v<MT5> )
5718 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5719 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5720 :( IsLower_v<MT5> ? j : 0UL ) );
5721 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
5723 SIMDType xmm1, xmm2, xmm3;
5725 for(
size_t k=kbegin; k<kend; ++k ) {
5726 const SIMDType a1(
set( A(i,k) ) );
5727 xmm1 += a1 * B.load(k,j );
5728 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
5729 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
5732 C.store( i, j , xmm1 * factor );
5733 C.store( i, j+
SIMDSIZE , xmm2 * factor );
5734 C.store( i, j+
SIMDSIZE*2UL, xmm3 * factor );
5742 for(
size_t jj=j; jj<jjend; ++jj ) {
// Panel of 2 SIMD vectors. Rows are taken in groups of four, three, two and
// finally one.
5751 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
5757 for(
size_t jj=j; jj<jjend; ++jj ) {
5758 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
5765 for(
size_t jj=j; jj<jjend; ++jj ) {
5771 for( ; (i+4UL) <= iend; i+=4UL )
5773 const size_t kbegin( ( IsUpper_v<MT4> )
5774 ?( ( IsLower_v<MT5> )
5775 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5776 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5777 :( IsLower_v<MT5> ? j : 0UL ) );
5778 const size_t kend( ( IsLower_v<MT4> )
5779 ?( ( IsUpper_v<MT5> )
5780 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
5781 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
5782 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5784 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5786 for(
size_t k=kbegin; k<kend; ++k ) {
5787 const SIMDType a1(
set( A(i ,k) ) );
5788 const SIMDType a2(
set( A(i+1UL,k) ) );
5789 const SIMDType a3(
set( A(i+2UL,k) ) );
5790 const SIMDType a4(
set( A(i+3UL,k) ) );
5791 const SIMDType b1( B.load(k,j ) );
5792 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
5803 C.store( i , j , xmm1 * factor );
5804 C.store( i , j+
SIMDSIZE, xmm2 * factor );
5805 C.store( i+1UL, j , xmm3 * factor );
5806 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
5807 C.store( i+2UL, j , xmm5 * factor );
5808 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
5809 C.store( i+3UL, j , xmm7 * factor );
5810 C.store( i+3UL, j+
SIMDSIZE, xmm8 * factor );
// Three rows at a time.
5813 for( ; (i+3UL) <= iend; i+=3UL )
5815 const size_t kbegin( ( IsUpper_v<MT4> )
5816 ?( ( IsLower_v<MT5> )
5817 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5818 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5819 :( IsLower_v<MT5> ? j : 0UL ) );
5820 const size_t kend( ( IsLower_v<MT4> )
5821 ?( ( IsUpper_v<MT5> )
5822 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
5823 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
5824 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5826 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
5828 for(
size_t k=kbegin; k<kend; ++k ) {
5829 const SIMDType a1(
set( A(i ,k) ) );
5830 const SIMDType a2(
set( A(i+1UL,k) ) );
5831 const SIMDType a3(
set( A(i+2UL,k) ) );
5832 const SIMDType b1( B.load(k,j ) );
5833 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
5842 C.store( i , j , xmm1 * factor );
5843 C.store( i , j+
SIMDSIZE, xmm2 * factor );
5844 C.store( i+1UL, j , xmm3 * factor );
5845 C.store( i+1UL, j+
SIMDSIZE, xmm4 * factor );
5846 C.store( i+2UL, j , xmm5 * factor );
5847 C.store( i+2UL, j+
SIMDSIZE, xmm6 * factor );
// Two rows at a time; here the k loop is additionally unrolled by two, with a
// separate accumulator set for the second k so that two sums are added on store.
5850 for( ; (i+2UL) <= iend; i+=2UL )
5852 const size_t kbegin( ( IsUpper_v<MT4> )
5853 ?( ( IsLower_v<MT5> )
5854 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5855 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5856 :( IsLower_v<MT5> ? j : 0UL ) );
5857 const size_t kend( ( IsLower_v<MT4> )
5858 ?( ( IsUpper_v<MT5> )
5859 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
5860 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
5861 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
5863 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5866 for( ; (k+2UL) <= kend; k+=2UL ) {
5867 const SIMDType a1(
set( A(i ,k ) ) );
5868 const SIMDType a2(
set( A(i+1UL,k ) ) );
5869 const SIMDType a3(
set( A(i ,k+1UL) ) );
5870 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
5871 const SIMDType b1( B.load(k ,j ) );
5872 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
5873 const SIMDType b3( B.load(k+1UL,j ) );
5874 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k when the range has odd length.
5885 for( ; k<kend; ++k ) {
5886 const SIMDType a1(
set( A(i ,k) ) );
5887 const SIMDType a2(
set( A(i+1UL,k) ) );
5888 const SIMDType b1( B.load(k,j ) );
5889 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
5896 C.store( i , j , (xmm1+xmm5) * factor );
5897 C.store( i , j+
SIMDSIZE, (xmm2+xmm6) * factor );
5898 C.store( i+1UL, j , (xmm3+xmm7) * factor );
5899 C.store( i+1UL, j+
SIMDSIZE, (xmm4+xmm8) * factor );
// Single-row variant of the 2-vector panel, k unrolled by two.
5904 const size_t kbegin( ( IsUpper_v<MT4> )
5905 ?( ( IsLower_v<MT5> )
5906 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5907 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5908 :( IsLower_v<MT5> ? j : 0UL ) );
5909 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
5911 SIMDType xmm1, xmm2, xmm3, xmm4;
5914 for( ; (k+2UL) <= kend; k+=2UL ) {
5915 const SIMDType a1(
set( A(i,k ) ) );
5916 const SIMDType a2(
set( A(i,k+1UL) ) );
5917 xmm1 += a1 * B.load(k ,j );
5918 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
5919 xmm3 += a2 * B.load(k+1UL,j );
5920 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
5923 for( ; k<kend; ++k ) {
5924 const SIMDType a1(
set( A(i,k) ) );
5925 xmm1 += a1 * B.load(k,j );
5929 C.store( i, j , (xmm1+xmm3) * factor );
5930 C.store( i, j+
SIMDSIZE, (xmm2+xmm4) * factor );
5938 for(
size_t jj=j; jj<jjend; ++jj ) {
5953 for(
size_t jj=j; jj<jjend; ++jj ) {
5954 C(i,jj) = HERM ?
conj( C(jj,i) ) : C(jj,i);
5961 for(
size_t jj=j; jj<jjend; ++jj ) {
// Panel of a single SIMD vector (stores go to column j only). Rows are taken in
// groups of four, three, two and one; the k loop is unrolled by two throughout.
5967 for( ; (i+4UL) <= iend; i+=4UL )
5969 const size_t kbegin( ( IsUpper_v<MT4> )
5970 ?( ( IsLower_v<MT5> )
5971 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
5972 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
5973 :( IsLower_v<MT5> ? j : 0UL ) );
5974 const size_t kend( ( IsLower_v<MT4> )
5975 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
5978 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
5981 for( ; (k+2UL) <= kend; k+=2UL ) {
5982 const SIMDType b1( B.load(k ,j) );
5983 const SIMDType b2( B.load(k+1UL,j) );
5984 xmm1 +=
set( A(i ,k ) ) * b1;
5985 xmm2 +=
set( A(i+1UL,k ) ) * b1;
5986 xmm3 +=
set( A(i+2UL,k ) ) * b1;
5987 xmm4 +=
set( A(i+3UL,k ) ) * b1;
5988 xmm5 +=
set( A(i ,k+1UL) ) * b2;
5989 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
5990 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
5991 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
5994 for( ; k<kend; ++k ) {
5995 const SIMDType b1( B.load(k,j) );
5996 xmm1 +=
set( A(i ,k) ) * b1;
5997 xmm2 +=
set( A(i+1UL,k) ) * b1;
5998 xmm3 +=
set( A(i+2UL,k) ) * b1;
5999 xmm4 +=
set( A(i+3UL,k) ) * b1;
6002 C.store( i , j, (xmm1+xmm5) * factor );
6003 C.store( i+1UL, j, (xmm2+xmm6) * factor );
6004 C.store( i+2UL, j, (xmm3+xmm7) * factor );
6005 C.store( i+3UL, j, (xmm4+xmm8) * factor );
// Three rows at a time.
6008 for( ; (i+3UL) <= iend; i+=3UL )
6010 const size_t kbegin( ( IsUpper_v<MT4> )
6011 ?( ( IsLower_v<MT5> )
6012 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6013 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6014 :( IsLower_v<MT5> ? j : 0UL ) );
6015 const size_t kend( ( IsLower_v<MT4> )
6016 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
6019 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6022 for( ; (k+2UL) <= kend; k+=2UL ) {
6023 const SIMDType b1( B.load(k ,j) );
6024 const SIMDType b2( B.load(k+1UL,j) );
6025 xmm1 +=
set( A(i ,k ) ) * b1;
6026 xmm2 +=
set( A(i+1UL,k ) ) * b1;
6027 xmm3 +=
set( A(i+2UL,k ) ) * b1;
6028 xmm4 +=
set( A(i ,k+1UL) ) * b2;
6029 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
6030 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
6033 for( ; k<kend; ++k ) {
6034 const SIMDType b1( B.load(k,j) );
6035 xmm1 +=
set( A(i ,k) ) * b1;
6036 xmm2 +=
set( A(i+1UL,k) ) * b1;
6037 xmm3 +=
set( A(i+2UL,k) ) * b1;
6040 C.store( i , j, (xmm1+xmm4) * factor );
6041 C.store( i+1UL, j, (xmm2+xmm5) * factor );
6042 C.store( i+2UL, j, (xmm3+xmm6) * factor );
// Two rows at a time.
6045 for( ; (i+2UL) <= iend; i+=2UL )
6047 const size_t kbegin( ( IsUpper_v<MT4> )
6048 ?( ( IsLower_v<MT5> )
6049 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6050 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6051 :( IsLower_v<MT5> ? j : 0UL ) );
6052 const size_t kend( ( IsLower_v<MT4> )
6053 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6056 SIMDType xmm1, xmm2, xmm3, xmm4;
6059 for( ; (k+2UL) <= kend; k+=2UL ) {
6060 const SIMDType b1( B.load(k ,j) );
6061 const SIMDType b2( B.load(k+1UL,j) );
6062 xmm1 +=
set( A(i ,k ) ) * b1;
6063 xmm2 +=
set( A(i+1UL,k ) ) * b1;
6064 xmm3 +=
set( A(i ,k+1UL) ) * b2;
6065 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
6068 for( ; k<kend; ++k ) {
6069 const SIMDType b1( B.load(k,j) );
6070 xmm1 +=
set( A(i ,k) ) * b1;
6071 xmm2 +=
set( A(i+1UL,k) ) * b1;
6074 C.store( i , j, (xmm1+xmm3) * factor );
6075 C.store( i+1UL, j, (xmm2+xmm4) * factor );
// Single remaining row of the 1-vector panel; k runs up to K.
6080 const size_t kbegin( ( IsUpper_v<MT4> )
6081 ?( ( IsLower_v<MT5> )
6082 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6083 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6084 :( IsLower_v<MT5> ? j : 0UL ) );
6086 SIMDType xmm1, xmm2;
6089 for( ; (k+2UL) <= K; k+=2UL ) {
6090 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
6091 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
6095 xmm1 +=
set( A(i,k) ) * B.load(k,j);
6098 C.store( i, j, (xmm1+xmm2) * factor );
6106 for(
size_t jj=j; jj<jjend; ++jj ) {
// Scalar remainder: columns not covered by the SIMD panels are computed element
// by element, two rows at a time and then a single row, and scaled with the
// plain scalar instead of the SIMD factor.
6113 for( ; remainder && j<N; ++j )
6119 C(i,j) = HERM ?
conj( C(j,i) ) : C(j,i);
6128 for( ; (i+2UL) <= M; i+=2UL )
6130 const size_t kbegin( ( IsUpper_v<MT4> )
6131 ?( ( IsLower_v<MT5> )
6132 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6133 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6134 :( IsLower_v<MT5> ? j : 0UL ) );
6135 const size_t kend( ( IsLower_v<MT4> )
6136 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
6142 for(
size_t k=kbegin; k<kend; ++k ) {
6143 value1 += A(i ,k) * B(k,j);
6144 value2 += A(i+1UL,k) * B(k,j);
6147 C(i ,j) = value1 * scalar;
6148 C(i+1UL,j) = value2 * scalar;
6153 const size_t kbegin( ( IsUpper_v<MT4> )
6154 ?( ( IsLower_v<MT5> )
6155 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6156 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6157 :( IsLower_v<MT5> ? j : 0UL ) );
6161 for(
size_t k=kbegin; k<K; ++k ) {
6162 value += A(i,k) * B(k,j);
6165 C(i,j) = value * scalar;
// Small-matrix kernel for the scaled assignment, vectorized overload for
// column-major targets. Enabled when MT3 is column-major and
// UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> holds. One operand is copied
// into its OppositeType (evaluated through serial()) and the re-formed, scaled
// product is handed back to assign().
//   C       target matrix
//   A, B    left and right operand of the product
//   scalar  scaling factor
6186 template<
typename MT3
6190 static inline auto selectSmallAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6191 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// NOTE(review): fwd presumably re-applies this expression's structure
// declaration to the re-formed product - confirm against ForwardFunctor.
6198 const ForwardFunctor fwd;
// A is not resizable but B is: convert A.
6200 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
6201 const OppositeType_t<MT4> tmp(
serial( A ) );
6202 assign( C, fwd( tmp * B ) * scalar );
// A is resizable but B is not: convert B.
6204 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
6205 const OppositeType_t<MT5> tmp(
serial( B ) );
6206 assign( C, fwd( A * tmp ) * scalar );
// Otherwise convert the operand with fewer elements (A on a tie).
6208 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
6209 const OppositeType_t<MT4> tmp(
serial( A ) );
6210 assign( C, fwd( tmp * B ) * scalar );
// Remaining case: convert B.
6213 const OppositeType_t<MT5> tmp(
serial( B ) );
6214 assign( C, fwd( A * tmp ) * scalar );
// Large-matrix assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v is false: simply forwards to the default
// assignment kernel.
6233 template<
typename MT3
6237 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6238 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6240 selectDefaultAssignKernel( C, A, B, scalar );
// Large-matrix assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v holds. Dispatches to one of the
// smmm / hmmm / lmmm / ummm / mmm routines.
// NOTE(review): the conditions selecting among the five calls are not visible
// in this excerpt; presumably they test SYM, HERM, LOW and UPP in that order -
// confirm against the full file.
6259 template<
typename MT3
6263 static inline auto selectLargeAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6264 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6267 smmm( C, A, B, scalar );
6269 hmmm( C, A, B, scalar );
// lmmm / ummm / mmm take an additional trailing argument, ST2(0) here
// (presumably the factor applied to the existing content of C - TODO confirm).
6271 lmmm( C, A, B, scalar, ST2(0) );
6273 ummm( C, A, B, scalar, ST2(0) );
6275 mmm( C, A, B, scalar, ST2(0) );
// BLAS assignment kernel, overload selected when UseBlasKernel_v is false:
// falls back to the large-matrix assignment kernel.
6293 template<
typename MT3
6297 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6298 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6300 selectLargeAssignKernel( C, A, B, scalar );
// BLAS assignment kernel, overload selected when UseBlasKernel_v holds.
// Only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both set.
// The scalar is converted to the element type of C before it is passed on.
6305 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 6319 template<
typename MT3
6323 static inline auto selectBlasAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6324 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
6326 using ET = ElementType_t<MT3>;
// Triangular A: trmm with A on the left side, lower/upper chosen from MT4.
// NOTE(review): the statement that initializes C before trmm is not visible
// in this excerpt.
6328 if( IsTriangular_v<MT4> ) {
6330 trmm( C, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// Triangular B: trmm with B on the right side, lower/upper chosen from MT5.
6332 else if( IsTriangular_v<MT5> ) {
6334 trmm( C, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
// General case: gemm with ET(scalar) and ET(0) as the two scalar arguments.
6337 gemm( C, A, B,
ET(scalar),
ET(0) );
// Assignment overload that is disabled when CanExploitSymmetry_v holds.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt; the body uses lhs, rhs and SO and ends in a call to assign().
6355 template<
typename MT
6358 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
// Temporary type: OppositeType when SO is true, ResultType otherwise.
6362 using TmpType = If_t< SO, OppositeType, ResultType >;
6374 const ForwardFunctor fwd;
// Evaluate the whole expression serially into the temporary, then assign the
// forwarded temporary to the target.
6376 const TmpType tmp(
serial( rhs ) );
6377 assign( ~lhs, fwd( tmp ) );
// Assignment overload enabled when CanExploitSymmetry_v holds (column-major
// target and at least one symmetric operand, per the trait definition).
// The product is rewritten with trans() applied to the symmetric operand(s)
// and re-dispatched through assign().
// NOTE(review): the function name and parameter list are not visible in this
// excerpt.
6395 template<
typename MT >
6397 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6406 const ForwardFunctor fwd;
6408 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6409 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
6411 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
6412 assign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
// Only the left operand symmetric: transpose the left operand.
6413 else if( IsSymmetric_v<MT1> )
6414 assign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
// Remaining case (presumably the final else): transpose the right operand.
6416 assign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Addition assignment of the scaled matrix product to a dense matrix.
// Overload disabled when CanExploitSymmetry_v holds.
// NOTE(review): the body of the empty-size check and the definitions of A and
// B are not visible in this excerpt.
6432 template<
typename MT
6434 friend inline auto addAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
6435 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
6442 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
6443 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: target has no rows or columns, or the inner dimension is 0.
6445 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection with the scalar of this expression.
6459 DMatScalarMultExpr::selectAddAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the kernel for C += (A * B) * scalar.
// The small kernel is used when B is diagonal, or (outside debug mode) B has
// at most SIMDSIZE*10 columns, or C has fewer than DMATDMATMULT_THRESHOLD
// elements. Otherwise the BLAS kernel selection is used (the else keyword
// itself is not visible in this excerpt).
6474 template<
typename MT3
6478 static inline void selectAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6480 if( ( IsDiagonal_v<MT5> ) ||
6481 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
6482 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
6483 selectSmallAddAssignKernel( C, A, B, scalar );
6485 selectBlasAddAssignKernel( C, A, B, scalar );
// Default addition assignment kernel for the case that neither A nor B is
// diagonal. Adds a temporary named tmp to C.
// NOTE(review): the construction of tmp is not visible in this excerpt;
// presumably it holds the evaluated scaled product - confirm in the full file.
6503 template<
typename MT3
6507 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6508 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6511 addAssign( C, tmp );
// Default addition assignment kernel for a non-diagonal A times a diagonal B.
// Because B is diagonal, element (i,j) of the product is A(i,j) * B(j,j), so
// each row of C is updated element-wise.
// NOTE(review): the else-branches of jbegin/jend and the guard around the
// final single-element update are not visible in this excerpt.
6529 template<
typename MT3
6533 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6534 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6538 const size_t M( A.rows() );
6539 const size_t N( B.columns() );
6541 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower structure of A.
6543 const size_t jbegin( ( IsUpper_v<MT4> )
6544 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
6546 const size_t jend( ( IsLower_v<MT4> )
6547 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos = jbegin plus the largest even count not exceeding jnum: end of the
// part handled two columns at a time.
6551 const size_t jnum( jend - jbegin );
6552 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6554 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6555 C(i,j ) += A(i,j ) * B(j ,j ) * scalar;
6556 C(i,j+1UL) += A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the single column at jpos (left over when jnum is odd).
6559 C(i,jpos) += A(i,jpos) * B(jpos,jpos) * scalar;
// Default addition assignment kernel for a diagonal A times a non-diagonal B.
// Because A is diagonal, element (i,j) of the product is A(i,i) * B(i,j), so
// each row of C is updated element-wise.
// NOTE(review): the else-branches of jbegin/jend and the guard around the
// final single-element update are not visible in this excerpt.
6579 template<
typename MT3
6583 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6584 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
6588 const size_t M( A.rows() );
6589 const size_t N( B.columns() );
6591 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower structure of B.
6593 const size_t jbegin( ( IsUpper_v<MT5> )
6594 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
6596 const size_t jend( ( IsLower_v<MT5> )
6597 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos = jbegin plus the largest even count not exceeding jnum: end of the
// part handled two columns at a time.
6601 const size_t jnum( jend - jbegin );
6602 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
6604 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
6605 C(i,j ) += A(i,i) * B(i,j ) * scalar;
6606 C(i,j+1UL) += A(i,i) * B(i,j+1UL) * scalar;
// Update of the single column at jpos (left over when jnum is odd).
6609 C(i,jpos) += A(i,i) * B(i,jpos) * scalar;
// Default addition assignment kernel for two diagonal operands: only the
// diagonal elements of C are updated, each with A(i,i) * B(i,i) * scalar.
6629 template<
typename MT3
6633 static inline auto selectDefaultAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6634 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
6638 for(
size_t i=0UL; i<A.rows(); ++i ) {
6639 C(i,i) += A(i,i) * B(i,i) * scalar;
// Small-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v is false: forwards to the default kernel.
6658 template<
typename MT3
6662 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6663 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
6665 selectDefaultAddAssignKernel( C, A, B, scalar );
// Vectorized small-matrix kernel for C += (A * B) * scalar, enabled when the
// target C is row-major and UseVectorizedDefaultKernel_v holds.
// The visible code processes C in column blocks of 8, 5, 4, 3, 2 and 1 SIMD
// vectors, followed by a scalar loop over the remaining columns. Inside each
// block, rows are handled in groups (4, 3, 2 or 1 rows depending on the block)
// and the products are accumulated in xmm registers over k before being
// scaled by factor and added to C.
// NOTE(review): many lines (column loop headers, braces, some accumulate and
// store statements) are not visible in this excerpt.
6684 template<
typename MT3
6688 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
6689 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// remainder is true when C or B is not padded; it gates jpos and the scalar
// tail loop at the end of the function.
6691 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
6693 const size_t M( A.rows() );
6694 const size_t N( B.columns() );
6695 const size_t K( A.columns() );
// With remainder: N masked with size_t(-SIMDSIZE), i.e. N rounded down to a
// multiple of SIMDSIZE (assuming SIMDSIZE is a power of two). Otherwise N.
6699 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// The scalar as a SIMD value; multiplied into the accumulators at store time.
6702 const SIMDType factor(
set( scalar ) );
// Column block of 8 SIMD vectors, one row at a time, guarded by
// IsIntegral_v<ElementType> (the enclosing column loop is not visible here).
6706 if( IsIntegral_v<ElementType> )
6709 for(
size_t i=0UL; i<M; ++i )
// k-range narrowed by the triangular structure of A (MT4) and B (MT5).
6711 const size_t kbegin( ( IsUpper_v<MT4> )
6712 ?( ( IsLower_v<MT5> )
6713 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6714 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6715 :( IsLower_v<MT5> ? j : 0UL ) );
6716 const size_t kend( ( IsLower_v<MT4> )
6717 ?( ( IsUpper_v<MT5> )
6718 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
6719 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
6720 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
6722 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6724 for(
size_t k=kbegin; k<kend; ++k ) {
6725 const SIMDType a1(
set( A(i,k) ) );
6726 xmm1 += a1 * B.load(k,j );
6727 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6728 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6729 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
6730 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
6731 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
6732 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
6733 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Add the scaled accumulator to the existing content of C.
6736 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Column block of 5 SIMD vectors, two rows (i and i+1) at a time.
6752 for( ; (i+2UL) <= M; i+=2UL )
6754 const size_t kbegin( ( IsUpper_v<MT4> )
6755 ?( ( IsLower_v<MT5> )
6756 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6757 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6758 :( IsLower_v<MT5> ? j : 0UL ) );
6759 const size_t kend( ( IsLower_v<MT4> )
6760 ?( ( IsUpper_v<MT5> )
6761 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
6762 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6763 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
6765 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
6767 for(
size_t k=kbegin; k<kend; ++k ) {
6768 const SIMDType a1(
set( A(i ,k) ) );
6769 const SIMDType a2(
set( A(i+1UL,k) ) );
6770 const SIMDType b1( B.load(k,j ) );
6771 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
6772 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
6773 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
6774 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
6787 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6792 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm6 * factor );
6794 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm8 * factor );
6795 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm9 * factor );
6796 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) + xmm10 * factor );
// Single leftover row of the 5-vector column block.
6801 const size_t kbegin( ( IsUpper_v<MT4> )
6802 ?( ( IsLower_v<MT5> )
6803 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6804 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6805 :( IsLower_v<MT5> ? j : 0UL ) );
6806 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
6808 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
6810 for(
size_t k=kbegin; k<kend; ++k ) {
6811 const SIMDType a1(
set( A(i,k) ) );
6812 xmm1 += a1 * B.load(k,j );
6813 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6814 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6815 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
6816 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
6819 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Column block of 4 SIMD vectors, two rows (i and i+1) at a time.
6831 for( ; (i+2UL) <= M; i+=2UL )
6833 const size_t kbegin( ( IsUpper_v<MT4> )
6834 ?( ( IsLower_v<MT5> )
6835 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6836 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6837 :( IsLower_v<MT5> ? j : 0UL ) );
6838 const size_t kend( ( IsLower_v<MT4> )
6839 ?( ( IsUpper_v<MT5> )
6840 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
6841 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6842 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
6844 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6846 for(
size_t k=kbegin; k<kend; ++k ) {
6847 const SIMDType a1(
set( A(i ,k) ) );
6848 const SIMDType a2(
set( A(i+1UL,k) ) );
6849 const SIMDType b1( B.load(k,j ) );
6850 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
6851 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
6852 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
6863 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6867 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm5 * factor );
6869 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm7 * factor );
6870 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) + xmm8 * factor );
// Single leftover row of the 4-vector column block.
6875 const size_t kbegin( ( IsUpper_v<MT4> )
6876 ?( ( IsLower_v<MT5> )
6877 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6878 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6879 :( IsLower_v<MT5> ? j : 0UL ) );
6880 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
6882 SIMDType xmm1, xmm2, xmm3, xmm4;
6884 for(
size_t k=kbegin; k<kend; ++k ) {
6885 const SIMDType a1(
set( A(i,k) ) );
6886 xmm1 += a1 * B.load(k,j );
6887 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6888 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6889 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
6892 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Column block of 3 SIMD vectors, two rows (i and i+1) at a time.
6903 for( ; (i+2UL) <= M; i+=2UL )
6905 const size_t kbegin( ( IsUpper_v<MT4> )
6906 ?( ( IsLower_v<MT5> )
6907 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6908 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6909 :( IsLower_v<MT5> ? j : 0UL ) );
6910 const size_t kend( ( IsLower_v<MT4> )
6911 ?( ( IsUpper_v<MT5> )
6912 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
6913 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
6914 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
6916 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6918 for(
size_t k=kbegin; k<kend; ++k ) {
6919 const SIMDType a1(
set( A(i ,k) ) );
6920 const SIMDType a2(
set( A(i+1UL,k) ) );
6921 const SIMDType b1( B.load(k,j ) );
6922 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
6923 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
6932 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
6935 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm4 * factor );
6937 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) + xmm6 * factor );
// Single leftover row of the 3-vector column block.
6942 const size_t kbegin( ( IsUpper_v<MT4> )
6943 ?( ( IsLower_v<MT5> )
6944 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6945 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6946 :( IsLower_v<MT5> ? j : 0UL ) );
6947 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
6949 SIMDType xmm1, xmm2, xmm3;
6951 for(
size_t k=kbegin; k<kend; ++k ) {
6952 const SIMDType a1(
set( A(i,k) ) );
6953 xmm1 += a1 * B.load(k,j );
6954 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
6955 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
6958 C.store( i, j , C.load(i,j ) + xmm1 * factor );
// Column block of 2 SIMD vectors. The row range starts at j when LOW is set
// and ends at min(j+SIMDSIZE*2, M) when UPP is set.
6966 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
6967 size_t i( LOW ? j : 0UL );
// Four rows at a time.
6969 for( ; (i+4UL) <= iend; i+=4UL )
6971 const size_t kbegin( ( IsUpper_v<MT4> )
6972 ?( ( IsLower_v<MT5> )
6973 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
6974 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
6975 :( IsLower_v<MT5> ? j : 0UL ) );
6976 const size_t kend( ( IsLower_v<MT4> )
6977 ?( ( IsUpper_v<MT5> )
6978 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
6979 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
6980 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
6982 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
6984 for(
size_t k=kbegin; k<kend; ++k ) {
6985 const SIMDType a1(
set( A(i ,k) ) );
6986 const SIMDType a2(
set( A(i+1UL,k) ) );
6987 const SIMDType a3(
set( A(i+2UL,k) ) );
6988 const SIMDType a4(
set( A(i+3UL,k) ) );
6989 const SIMDType b1( B.load(k,j ) );
6990 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
7001 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7003 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7005 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
7007 C.store( i+3UL, j , C.load(i+3UL,j ) + xmm7 * factor );
// Three rows at a time.
7011 for( ; (i+3UL) <= iend; i+=3UL )
7013 const size_t kbegin( ( IsUpper_v<MT4> )
7014 ?( ( IsLower_v<MT5> )
7015 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7016 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7017 :( IsLower_v<MT5> ? j : 0UL ) );
7018 const size_t kend( ( IsLower_v<MT4> )
7019 ?( ( IsUpper_v<MT5> )
7020 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
7021 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
7022 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7024 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7026 for(
size_t k=kbegin; k<kend; ++k ) {
7027 const SIMDType a1(
set( A(i ,k) ) );
7028 const SIMDType a2(
set( A(i+1UL,k) ) );
7029 const SIMDType a3(
set( A(i+2UL,k) ) );
7030 const SIMDType b1( B.load(k,j ) );
7031 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
7040 C.store( i , j , C.load(i ,j ) + xmm1 * factor );
7042 C.store( i+1UL, j , C.load(i+1UL,j ) + xmm3 * factor );
7044 C.store( i+2UL, j , C.load(i+2UL,j ) + xmm5 * factor );
// Two rows at a time, with the k loop unrolled by two: xmm1-4 accumulate the
// k terms, xmm5-8 the k+1 terms; the pairs are summed at store time.
7048 for( ; (i+2UL) <= iend; i+=2UL )
7050 const size_t kbegin( ( IsUpper_v<MT4> )
7051 ?( ( IsLower_v<MT5> )
7052 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7053 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7054 :( IsLower_v<MT5> ? j : 0UL ) );
7055 const size_t kend( ( IsLower_v<MT4> )
7056 ?( ( IsUpper_v<MT5> )
7057 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
7058 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7059 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
7061 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7064 for( ; (k+2UL) <= kend; k+=2UL ) {
7065 const SIMDType a1(
set( A(i ,k ) ) );
7066 const SIMDType a2(
set( A(i+1UL,k ) ) );
7067 const SIMDType a3(
set( A(i ,k+1UL) ) );
7068 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
7069 const SIMDType b1( B.load(k ,j ) );
7070 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
7071 const SIMDType b3( B.load(k+1UL,j ) );
7072 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k iteration when the k-range has odd length.
7083 for( ; k<kend; ++k ) {
7084 const SIMDType a1(
set( A(i ,k) ) );
7085 const SIMDType a2(
set( A(i+1UL,k) ) );
7086 const SIMDType b1( B.load(k,j ) );
7087 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
7094 C.store( i , j , C.load(i ,j ) + (xmm1+xmm5) * factor );
7096 C.store( i+1UL, j , C.load(i+1UL,j ) + (xmm3+xmm7) * factor );
7097 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) + (xmm4+xmm8) * factor );
// Single leftover row of the 2-vector column block, k loop unrolled by two.
7102 const size_t kbegin( ( IsUpper_v<MT4> )
7103 ?( ( IsLower_v<MT5> )
7104 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7105 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7106 :( IsLower_v<MT5> ? j : 0UL ) );
7107 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
7109 SIMDType xmm1, xmm2, xmm3, xmm4;
7112 for( ; (k+2UL) <= kend; k+=2UL ) {
7113 const SIMDType a1(
set( A(i,k ) ) );
7114 const SIMDType a2(
set( A(i,k+1UL) ) );
7115 xmm1 += a1 * B.load(k ,j );
7116 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
7117 xmm3 += a2 * B.load(k+1UL,j );
7118 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
7121 for( ; k<kend; ++k ) {
7122 const SIMDType a1(
set( A(i,k) ) );
7123 xmm1 += a1 * B.load(k,j );
7127 C.store( i, j , C.load(i,j ) + (xmm1+xmm3) * factor );
// Column block of a single SIMD vector. The row range starts at j when LOW is
// set and is capped at min(j+SIMDSIZE, M) only when both LOW and UPP are set.
7134 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
7135 size_t i( LOW ? j : 0UL );
// Four rows at a time, k loop unrolled by two (xmm1-4: k, xmm5-8: k+1).
7137 for( ; (i+4UL) <= iend; i+=4UL )
7139 const size_t kbegin( ( IsUpper_v<MT4> )
7140 ?( ( IsLower_v<MT5> )
7141 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7142 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7143 :( IsLower_v<MT5> ? j : 0UL ) );
7144 const size_t kend( ( IsLower_v<MT4> )
7145 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
7148 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7151 for( ; (k+2UL) <= kend; k+=2UL ) {
7152 const SIMDType b1( B.load(k ,j) );
7153 const SIMDType b2( B.load(k+1UL,j) );
7154 xmm1 +=
set( A(i ,k ) ) * b1;
7155 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7156 xmm3 +=
set( A(i+2UL,k ) ) * b1;
7157 xmm4 +=
set( A(i+3UL,k ) ) * b1;
7158 xmm5 +=
set( A(i ,k+1UL) ) * b2;
7159 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
7160 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
7161 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
7164 for( ; k<kend; ++k ) {
7165 const SIMDType b1( B.load(k,j) );
7166 xmm1 +=
set( A(i ,k) ) * b1;
7167 xmm2 +=
set( A(i+1UL,k) ) * b1;
7168 xmm3 +=
set( A(i+2UL,k) ) * b1;
7169 xmm4 +=
set( A(i+3UL,k) ) * b1;
7172 C.store( i , j, C.load(i ,j) + (xmm1+xmm5) * factor );
7173 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm6) * factor );
7174 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm7) * factor );
7175 C.store( i+3UL, j, C.load(i+3UL,j) + (xmm4+xmm8) * factor );
// Three rows at a time, k loop unrolled by two (xmm1-3: k, xmm4-6: k+1).
7178 for( ; (i+3UL) <= iend; i+=3UL )
7180 const size_t kbegin( ( IsUpper_v<MT4> )
7181 ?( ( IsLower_v<MT5> )
7182 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7183 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7184 :( IsLower_v<MT5> ? j : 0UL ) );
7185 const size_t kend( ( IsLower_v<MT4> )
7186 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
7189 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
7192 for( ; (k+2UL) <= kend; k+=2UL ) {
7193 const SIMDType b1( B.load(k ,j) );
7194 const SIMDType b2( B.load(k+1UL,j) );
7195 xmm1 +=
set( A(i ,k ) ) * b1;
7196 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7197 xmm3 +=
set( A(i+2UL,k ) ) * b1;
7198 xmm4 +=
set( A(i ,k+1UL) ) * b2;
7199 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
7200 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
7203 for( ; k<kend; ++k ) {
7204 const SIMDType b1( B.load(k,j) );
7205 xmm1 +=
set( A(i ,k) ) * b1;
7206 xmm2 +=
set( A(i+1UL,k) ) * b1;
7207 xmm3 +=
set( A(i+2UL,k) ) * b1;
7210 C.store( i , j, C.load(i ,j) + (xmm1+xmm4) * factor );
7211 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm5) * factor );
7212 C.store( i+2UL, j, C.load(i+2UL,j) + (xmm3+xmm6) * factor );
// Two rows at a time, k loop unrolled by two (xmm1-2: k, xmm3-4: k+1).
7215 for( ; (i+2UL) <= iend; i+=2UL )
7217 const size_t kbegin( ( IsUpper_v<MT4> )
7218 ?( ( IsLower_v<MT5> )
7219 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7220 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7221 :( IsLower_v<MT5> ? j : 0UL ) );
7222 const size_t kend( ( IsLower_v<MT4> )
7223 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7226 SIMDType xmm1, xmm2, xmm3, xmm4;
7229 for( ; (k+2UL) <= kend; k+=2UL ) {
7230 const SIMDType b1( B.load(k ,j) );
7231 const SIMDType b2( B.load(k+1UL,j) );
7232 xmm1 +=
set( A(i ,k ) ) * b1;
7233 xmm2 +=
set( A(i+1UL,k ) ) * b1;
7234 xmm3 +=
set( A(i ,k+1UL) ) * b2;
7235 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
7238 for( ; k<kend; ++k ) {
7239 const SIMDType b1( B.load(k,j) );
7240 xmm1 +=
set( A(i ,k) ) * b1;
7241 xmm2 +=
set( A(i+1UL,k) ) * b1;
7244 C.store( i , j, C.load(i ,j) + (xmm1+xmm3) * factor );
7245 C.store( i+1UL, j, C.load(i+1UL,j) + (xmm2+xmm4) * factor );
// Single leftover row of the 1-vector column block; here k runs up to K.
7250 const size_t kbegin( ( IsUpper_v<MT4> )
7251 ?( ( IsLower_v<MT5> )
7252 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7253 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7254 :( IsLower_v<MT5> ? j : 0UL ) );
7256 SIMDType xmm1, xmm2;
7259 for( ; (k+2UL) <= K; k+=2UL ) {
7260 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
7261 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
7265 xmm1 +=
set( A(i,k) ) * B.load(k,j);
7268 C.store( i, j, C.load(i,j) + (xmm1+xmm2) * factor );
// Scalar tail: columns from the current j up to N, processed element-wise and
// only when remainder is true.
7272 for( ; remainder && j<N; ++j )
7274 const size_t iend( UPP ? j+1UL : M );
7275 size_t i( LOW ? j : 0UL );
// Two rows at a time.
7277 for( ; (i+2UL) <= iend; i+=2UL )
7279 const size_t kbegin( ( IsUpper_v<MT4> )
7280 ?( ( IsLower_v<MT5> )
7281 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7282 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7283 :( IsLower_v<MT5> ? j : 0UL ) );
7284 const size_t kend( ( IsLower_v<MT4> )
7285 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
7291 for(
size_t k=kbegin; k<kend; ++k ) {
7292 value1 += A(i ,k) * B(k,j);
7293 value2 += A(i+1UL,k) * B(k,j);
7296 C(i ,j) += value1 * scalar;
7297 C(i+1UL,j) += value2 * scalar;
// Single leftover row of the scalar tail.
7302 const size_t kbegin( ( IsUpper_v<MT4> )
7303 ?( ( IsLower_v<MT5> )
7304 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7305 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7306 :( IsLower_v<MT5> ? j : 0UL ) );
7310 for(
size_t k=kbegin; k<K; ++k ) {
7311 value += A(i,k) * B(k,j);
7314 C(i,j) += value * scalar;
// Small-matrix addition assignment kernel for C += (A * B) * scalar, enabled
// when the target C is column-major and UseVectorizedDefaultKernel_v holds.
// Exactly one operand is copied (serially evaluated) into a temporary of its
// opposite storage order and the scaled product is handed back to addAssign().
// NOTE(review): several lines of this function (braces, final else) are not
// visible in this excerpt.
7335 template<
typename MT3
7339 static inline auto selectSmallAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7340 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7347 const ForwardFunctor fwd;
// Only B is resizable: convert A.
7349 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
7350 const OppositeType_t<MT4> tmp(
serial( A ) );
7351 addAssign( C, fwd( tmp * B ) * scalar );
// Only A is resizable: convert B.
7353 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
7354 const OppositeType_t<MT5> tmp(
serial( B ) );
7355 addAssign( C, fwd( A * tmp ) * scalar );
// Otherwise convert the operand with fewer elements (A when the counts tie).
7357 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
7358 const OppositeType_t<MT4> tmp(
serial( A ) );
7359 addAssign( C, fwd( tmp * B ) * scalar );
// Remaining case (presumably the final else branch): convert B.
7362 const OppositeType_t<MT5> tmp(
serial( B ) );
7363 addAssign( C, fwd( A * tmp ) * scalar );
// Large-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v is false: forwards to the default addition
// assignment kernel.
7382 template<
typename MT3
7386 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7387 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7389 selectDefaultAddAssignKernel( C, A, B, scalar );
// Large-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v holds. Dispatches to lmmm, ummm or mmm, each
// with ST2(1) as the trailing argument (presumably the factor applied to the
// existing content of C, so that the product is added - TODO confirm).
// NOTE(review): the conditions selecting among the three calls are not visible
// in this excerpt; presumably they test LOW and UPP.
7408 template<
typename MT3
7412 static inline auto selectLargeAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7413 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7416 lmmm( C, A, B, scalar, ST2(1) );
7418 ummm( C, A, B, scalar, ST2(1) );
7420 mmm( C, A, B, scalar, ST2(1) );
// BLAS addition assignment kernel, overload selected when UseBlasKernel_v is
// false: falls back to the large-matrix addition assignment kernel.
7438 template<
typename MT3
7442 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7443 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7445 selectLargeAddAssignKernel( C, A, B, scalar );
// BLAS addition assignment kernel, overload selected when UseBlasKernel_v
// holds. Only compiled when BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are both set.
// The scalar is converted to the element type of C before it is passed on.
7450 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 7464 template<
typename MT3
7468 static inline auto selectBlasAddAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7469 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
7471 using ET = ElementType_t<MT3>;
// Triangular A: copy B into a temporary, run trmm on it with A on the left
// side, then add the temporary to C.
7473 if( IsTriangular_v<MT4> ) {
7474 ResultType_t<MT3> tmp(
serial( B ) );
7475 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7476 addAssign( C, tmp );
// Triangular B: copy A into a temporary, run trmm on it with B on the right
// side, then add the temporary to C.
7478 else if( IsTriangular_v<MT5> ) {
7479 ResultType_t<MT3> tmp(
serial( A ) );
7480 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
7481 addAssign( C, tmp );
// General case: gemm with ET(scalar) and ET(1) as the two scalar arguments.
7484 gemm( C, A, B,
ET(scalar),
ET(1) );
// Addition assignment overload enabled when CanExploitSymmetry_v holds
// (column-major target and at least one symmetric operand, per the trait
// definition). The product is rewritten with trans() applied to the symmetric
// operand(s) and re-dispatched through addAssign().
// NOTE(review): the function name and parameter list are not visible in this
// excerpt.
7504 template<
typename MT >
7506 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7515 const ForwardFunctor fwd;
7517 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7518 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Both operands symmetric: transpose both.
7520 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
7521 addAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
// Only the left operand symmetric: transpose the left operand.
7522 else if( IsSymmetric_v<MT1> )
7523 addAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
// Remaining case (presumably the final else): transpose the right operand.
7525 addAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Subtraction assignment of the scaled matrix product to a dense matrix.
// Overload disabled when CanExploitSymmetry_v holds.
// NOTE(review): the body of the empty-size check and the definitions of A and
// B are not visible in this excerpt.
7545 template<
typename MT
7547 friend inline auto subAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
7548 -> DisableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
7555 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
7556 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
// Degenerate sizes: target has no rows or columns, or the inner dimension is 0.
7558 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel selection with the scalar of this expression.
7572 DMatScalarMultExpr::selectSubAssignKernel( ~lhs, A, B, rhs.scalar_ );
// Chooses the kernel for C -= (A * B) * scalar.
// The small kernel is used when B is diagonal, or (outside debug mode) B has
// at most SIMDSIZE*10 columns, or C has fewer than DMATDMATMULT_THRESHOLD
// elements. Otherwise the BLAS kernel selection is used (the else keyword
// itself is not visible in this excerpt).
7587 template<
typename MT3
7591 static inline void selectSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7593 if( ( IsDiagonal_v<MT5> ) ||
7594 ( !BLAZE_DEBUG_MODE && B.columns() <=
SIMDSIZE*10UL ) ||
7595 ( C.rows() * C.columns() < DMATDMATMULT_THRESHOLD ) )
7596 selectSmallSubAssignKernel( C, A, B, scalar );
7598 selectBlasSubAssignKernel( C, A, B, scalar );
// Default subtraction assignment kernel for the case that neither A nor B is
// diagonal. Subtracts a temporary named tmp from C.
// NOTE(review): the construction of tmp is not visible in this excerpt;
// presumably it holds the evaluated scaled product - confirm in the full file.
7616 template<
typename MT3
7620 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7621 -> EnableIf_t< !IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7624 subAssign( C, tmp );
// Default subtraction assignment kernel for a non-diagonal A times a diagonal
// B. Because B is diagonal, element (i,j) of the product is A(i,j) * B(j,j),
// so each row of C is updated element-wise.
// NOTE(review): the else-branches of jbegin/jend and the guard around the
// final single-element update are not visible in this excerpt.
7642 template<
typename MT3
7646 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7647 -> EnableIf_t< !IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7651 const size_t M( A.rows() );
7652 const size_t N( B.columns() );
7654 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower structure of A.
7656 const size_t jbegin( ( IsUpper_v<MT4> )
7657 ?( IsStrictlyUpper_v<MT4> ? i+1UL : i )
7659 const size_t jend( ( IsLower_v<MT4> )
7660 ?( IsStrictlyLower_v<MT4> ? i : i+1UL )
// jpos = jbegin plus the largest even count not exceeding jnum: end of the
// part handled two columns at a time.
7664 const size_t jnum( jend - jbegin );
7665 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7667 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7668 C(i,j ) -= A(i,j ) * B(j ,j ) * scalar;
7669 C(i,j+1UL) -= A(i,j+1UL) * B(j+1UL,j+1UL) * scalar;
// Update of the single column at jpos (left over when jnum is odd).
7672 C(i,jpos) -= A(i,jpos) * B(jpos,jpos) * scalar;
// Default subtraction assignment kernel for a diagonal A times a non-diagonal
// B. Because A is diagonal, element (i,j) of the product is A(i,i) * B(i,j),
// so each row of C is updated element-wise.
// NOTE(review): the else-branches of jbegin/jend and the guard around the
// final single-element update are not visible in this excerpt.
7692 template<
typename MT3
7696 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7697 -> EnableIf_t< IsDiagonal_v<MT4> && !IsDiagonal_v<MT5> >
7701 const size_t M( A.rows() );
7702 const size_t N( B.columns() );
7704 for(
size_t i=0UL; i<M; ++i )
// Column range of row i, narrowed by the upper/lower structure of B.
7706 const size_t jbegin( ( IsUpper_v<MT5> )
7707 ?( IsStrictlyUpper_v<MT5> ? i+1UL : i )
7709 const size_t jend( ( IsLower_v<MT5> )
7710 ?( IsStrictlyLower_v<MT5> ? i : i+1UL )
// jpos = jbegin plus the largest even count not exceeding jnum: end of the
// part handled two columns at a time.
7714 const size_t jnum( jend - jbegin );
7715 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
7717 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
7718 C(i,j ) -= A(i,i) * B(i,j ) * scalar;
7719 C(i,j+1UL) -= A(i,i) * B(i,j+1UL) * scalar;
// Update of the single column at jpos (left over when jnum is odd).
7722 C(i,jpos) -= A(i,i) * B(i,jpos) * scalar;
// Default subtraction assignment kernel for two diagonal operands: only the
// diagonal elements of C are updated, each reduced by A(i,i) * B(i,i) * scalar.
7742 template<
typename MT3
7746 static inline auto selectDefaultSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7747 -> EnableIf_t< IsDiagonal_v<MT4> && IsDiagonal_v<MT5> >
7751 for(
size_t i=0UL; i<A.rows(); ++i ) {
7752 C(i,i) -= A(i,i) * B(i,i) * scalar;
// Small-matrix subtraction assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v is false: forwards to the default kernel.
7771 template<
typename MT3
7775 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7776 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
7778 selectDefaultSubAssignKernel( C, A, B, scalar );
// Vectorized small-matrix subtraction-assignment kernel for row-major targets
// (enabled for IsRowMajorMatrix_v<MT3> together with UseVectorizedDefaultKernel_v).
// Products A(i,k)*B(k,j..) are accumulated in SIMD registers (xmm*) and then
// subtracted from C after multiplication with the broadcast scalar ("factor").
// The code is organised in sections of decreasing column-panel width
// (8, 5, 4, 3, 2 and 1 times SIMDSIZE), followed by a scalar pass over leftover columns.
// NOTE(review): this listing omits many original lines, in particular the loops over
// the column index j that enclose each section, most store statements, and braces.
7797 template<
typename MT3
7801 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
7802 -> EnableIf_t< IsRowMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// A scalar remainder pass is required unless both C (MT3) and B (MT5) are padded.
7804 constexpr
bool remainder( !IsPadded_v<MT3> || !IsPadded_v<MT5> );
7806 const size_t M( A.rows() );
7807 const size_t N( B.columns() );
7808 const size_t K( A.columns() );
// End of the SIMD-processed column range: N masked with -SIMDSIZE when a remainder
// pass exists (a round-down to a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two).
7812 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Scalar broadcast into a SIMD register, applied once per store.
7815 const SIMDType factor(
set( scalar ) );
// ---- Panel of 8*SIMDSIZE columns, one row at a time; guarded by the integral-element check.
7819 if( IsIntegral_v<ElementType> )
7822 for(
size_t i=0UL; i<M; ++i )
// k-range for row i, narrowed according to the upper/lower structure of A (MT4) and B (MT5).
7824 const size_t kbegin( ( IsUpper_v<MT4> )
7825 ?( ( IsLower_v<MT5> )
7826 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7827 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7828 :( IsLower_v<MT5> ? j : 0UL ) );
7829 const size_t kend( ( IsLower_v<MT4> )
7830 ?( ( IsUpper_v<MT5> )
7831 ?(
min( ( IsStrictlyLower_v<MT4> ? i : i+1UL ), j+
SIMDSIZE*8UL, K ) )
7832 :( IsStrictlyLower_v<MT4> ? i : i+1UL ) )
7833 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*8UL, K ) : K ) );
// One accumulator per SIMD-wide column group of the panel.
7835 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7837 for(
size_t k=kbegin; k<kend; ++k ) {
7838 const SIMDType a1(
set( A(i,k) ) );
7839 xmm1 += a1 * B.load(k,j );
7840 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7841 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7842 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7843 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7844 xmm6 += a1 * B.load(k,j+
SIMDSIZE*5UL);
7845 xmm7 += a1 * B.load(k,j+
SIMDSIZE*6UL);
7846 xmm8 += a1 * B.load(k,j+
SIMDSIZE*7UL);
// Write-back: C -= accumulator * factor (only the first store of the group is visible here).
7849 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// ---- Panel of 5*SIMDSIZE columns, two rows per iteration.
7865 for( ; (i+2UL) <= M; i+=2UL )
7867 const size_t kbegin( ( IsUpper_v<MT4> )
7868 ?( ( IsLower_v<MT5> )
7869 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7870 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7871 :( IsLower_v<MT5> ? j : 0UL ) );
7872 const size_t kend( ( IsLower_v<MT4> )
7873 ?( ( IsUpper_v<MT5> )
7874 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*5UL, K ) )
7875 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7876 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*5UL, K ) : K ) );
// xmm1..xmm5 are stored to row i, xmm6..xmm10 to row i+1 (see the stores below).
7878 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10;
7880 for(
size_t k=kbegin; k<kend; ++k ) {
7881 const SIMDType a1(
set( A(i ,k) ) );
7882 const SIMDType a2(
set( A(i+1UL,k) ) );
7883 const SIMDType b1( B.load(k,j ) );
7884 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7885 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7886 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
7887 const SIMDType b5( B.load(k,j+
SIMDSIZE*4UL) );
7900 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7905 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm6 * factor );
7907 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm8 * factor );
7908 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm9 * factor );
7909 C.store( i+1UL, j+
SIMDSIZE*4UL, C.load(i+1UL,j+
SIMDSIZE*4UL) - xmm10 * factor );
// Single-row variant of the 5*SIMDSIZE panel (presumably the leftover row; its guard is not visible).
7914 const size_t kbegin( ( IsUpper_v<MT4> )
7915 ?( ( IsLower_v<MT5> )
7916 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7917 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7918 :( IsLower_v<MT5> ? j : 0UL ) );
7919 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*5UL, K ) ):( K ) );
7921 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5;
7923 for(
size_t k=kbegin; k<kend; ++k ) {
7924 const SIMDType a1(
set( A(i,k) ) );
7925 xmm1 += a1 * B.load(k,j );
7926 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
7927 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
7928 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
7929 xmm5 += a1 * B.load(k,j+
SIMDSIZE*4UL);
7932 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// ---- Panel of 4*SIMDSIZE columns, two rows per iteration.
7944 for( ; (i+2UL) <= M; i+=2UL )
7946 const size_t kbegin( ( IsUpper_v<MT4> )
7947 ?( ( IsLower_v<MT5> )
7948 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7949 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7950 :( IsLower_v<MT5> ? j : 0UL ) );
7951 const size_t kend( ( IsLower_v<MT4> )
7952 ?( ( IsUpper_v<MT5> )
7953 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*4UL, K ) )
7954 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
7955 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*4UL, K ) : K ) );
7957 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
7959 for(
size_t k=kbegin; k<kend; ++k ) {
7960 const SIMDType a1(
set( A(i ,k) ) );
7961 const SIMDType a2(
set( A(i+1UL,k) ) );
7962 const SIMDType b1( B.load(k,j ) );
7963 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
7964 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
7965 const SIMDType b4( B.load(k,j+
SIMDSIZE*3UL) );
7976 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
7980 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm5 * factor );
7982 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm7 * factor );
7983 C.store( i+1UL, j+
SIMDSIZE*3UL, C.load(i+1UL,j+
SIMDSIZE*3UL) - xmm8 * factor );
// Single-row variant of the 4*SIMDSIZE panel.
7988 const size_t kbegin( ( IsUpper_v<MT4> )
7989 ?( ( IsLower_v<MT5> )
7990 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
7991 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
7992 :( IsLower_v<MT5> ? j : 0UL ) );
7993 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*4UL, K ) ):( K ) );
7995 SIMDType xmm1, xmm2, xmm3, xmm4;
7997 for(
size_t k=kbegin; k<kend; ++k ) {
7998 const SIMDType a1(
set( A(i,k) ) );
7999 xmm1 += a1 * B.load(k,j );
8000 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8001 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8002 xmm4 += a1 * B.load(k,j+
SIMDSIZE*3UL);
8005 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// ---- Panel of 3*SIMDSIZE columns, two rows per iteration.
8016 for( ; (i+2UL) <= M; i+=2UL )
8018 const size_t kbegin( ( IsUpper_v<MT4> )
8019 ?( ( IsLower_v<MT5> )
8020 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8021 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8022 :( IsLower_v<MT5> ? j : 0UL ) );
8023 const size_t kend( ( IsLower_v<MT4> )
8024 ?( ( IsUpper_v<MT5> )
8025 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*3UL, K ) )
8026 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8027 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*3UL, K ) : K ) );
8029 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8031 for(
size_t k=kbegin; k<kend; ++k ) {
8032 const SIMDType a1(
set( A(i ,k) ) );
8033 const SIMDType a2(
set( A(i+1UL,k) ) );
8034 const SIMDType b1( B.load(k,j ) );
8035 const SIMDType b2( B.load(k,j+
SIMDSIZE ) );
8036 const SIMDType b3( B.load(k,j+
SIMDSIZE*2UL) );
8045 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8048 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm4 * factor );
8050 C.store( i+1UL, j+
SIMDSIZE*2UL, C.load(i+1UL,j+
SIMDSIZE*2UL) - xmm6 * factor );
// Single-row variant of the 3*SIMDSIZE panel.
8055 const size_t kbegin( ( IsUpper_v<MT4> )
8056 ?( ( IsLower_v<MT5> )
8057 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8058 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8059 :( IsLower_v<MT5> ? j : 0UL ) );
8060 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*3UL, K ) ):( K ) );
8062 SIMDType xmm1, xmm2, xmm3;
8064 for(
size_t k=kbegin; k<kend; ++k ) {
8065 const SIMDType a1(
set( A(i,k) ) );
8066 xmm1 += a1 * B.load(k,j );
8067 xmm2 += a1 * B.load(k,j+
SIMDSIZE );
8068 xmm3 += a1 * B.load(k,j+
SIMDSIZE*2UL);
8071 C.store( i, j , C.load(i,j ) - xmm1 * factor );
// ---- Panel of 2*SIMDSIZE columns. The row range is restricted by the declared
// structure of the result: it ends at min(j+2*SIMDSIZE,M) for UPP and starts at j for LOW.
8079 const size_t iend( UPP ?
min(j+
SIMDSIZE*2UL,M) : M );
8080 size_t i( LOW ? j : 0UL );
// Four rows per iteration.
8082 for( ; (i+4UL) <= iend; i+=4UL )
8084 const size_t kbegin( ( IsUpper_v<MT4> )
8085 ?( ( IsLower_v<MT5> )
8086 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8087 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8088 :( IsLower_v<MT5> ? j : 0UL ) );
8089 const size_t kend( ( IsLower_v<MT4> )
8090 ?( ( IsUpper_v<MT5> )
8091 ?(
min( ( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ), j+
SIMDSIZE*2UL, K ) )
8092 :( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL ) )
8093 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8095 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8097 for(
size_t k=kbegin; k<kend; ++k ) {
8098 const SIMDType a1(
set( A(i ,k) ) );
8099 const SIMDType a2(
set( A(i+1UL,k) ) );
8100 const SIMDType a3(
set( A(i+2UL,k) ) );
8101 const SIMDType a4(
set( A(i+3UL,k) ) );
8102 const SIMDType b1( B.load(k,j ) );
8103 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8114 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8116 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8118 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
8120 C.store( i+3UL, j , C.load(i+3UL,j ) - xmm7 * factor );
// Three rows per iteration.
8124 for( ; (i+3UL) <= iend; i+=3UL )
8126 const size_t kbegin( ( IsUpper_v<MT4> )
8127 ?( ( IsLower_v<MT5> )
8128 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8129 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8130 :( IsLower_v<MT5> ? j : 0UL ) );
8131 const size_t kend( ( IsLower_v<MT4> )
8132 ?( ( IsUpper_v<MT5> )
8133 ?(
min( ( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ), j+
SIMDSIZE*2UL, K ) )
8134 :( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL ) )
8135 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8137 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8139 for(
size_t k=kbegin; k<kend; ++k ) {
8140 const SIMDType a1(
set( A(i ,k) ) );
8141 const SIMDType a2(
set( A(i+1UL,k) ) );
8142 const SIMDType a3(
set( A(i+2UL,k) ) );
8143 const SIMDType b1( B.load(k,j ) );
8144 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8153 C.store( i , j , C.load(i ,j ) - xmm1 * factor );
8155 C.store( i+1UL, j , C.load(i+1UL,j ) - xmm3 * factor );
8157 C.store( i+2UL, j , C.load(i+2UL,j ) - xmm5 * factor );
// Two rows per iteration; the k-loop is additionally unrolled by two, using a second
// accumulator set (xmm5..xmm8) that is summed with the first set when storing.
8161 for( ; (i+2UL) <= iend; i+=2UL )
8163 const size_t kbegin( ( IsUpper_v<MT4> )
8164 ?( ( IsLower_v<MT5> )
8165 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8166 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8167 :( IsLower_v<MT5> ? j : 0UL ) );
8168 const size_t kend( ( IsLower_v<MT4> )
8169 ?( ( IsUpper_v<MT5> )
8170 ?(
min( ( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ), j+
SIMDSIZE*2UL, K ) )
8171 :( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL ) )
8172 :( IsUpper_v<MT5> ?
min( j+
SIMDSIZE*2UL, K ) : K ) );
8174 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8177 for( ; (k+2UL) <= kend; k+=2UL ) {
8178 const SIMDType a1(
set( A(i ,k ) ) );
8179 const SIMDType a2(
set( A(i+1UL,k ) ) );
8180 const SIMDType a3(
set( A(i ,k+1UL) ) );
8181 const SIMDType a4(
set( A(i+1UL,k+1UL) ) );
8182 const SIMDType b1( B.load(k ,j ) );
8183 const SIMDType b2( B.load(k ,j+
SIMDSIZE) );
8184 const SIMDType b3( B.load(k+1UL,j ) );
8185 const SIMDType b4( B.load(k+1UL,j+
SIMDSIZE) );
// Leftover k iteration(s) after the unrolled loop.
8196 for( ; k<kend; ++k ) {
8197 const SIMDType a1(
set( A(i ,k) ) );
8198 const SIMDType a2(
set( A(i+1UL,k) ) );
8199 const SIMDType b1( B.load(k,j ) );
8200 const SIMDType b2( B.load(k,j+
SIMDSIZE) );
8207 C.store( i , j , C.load(i ,j ) - (xmm1+xmm5) * factor );
8209 C.store( i+1UL, j , C.load(i+1UL,j ) - (xmm3+xmm7) * factor );
8210 C.store( i+1UL, j+
SIMDSIZE, C.load(i+1UL,j+
SIMDSIZE) - (xmm4+xmm8) * factor );
// Single-row variant of the 2*SIMDSIZE panel, k unrolled by two.
8215 const size_t kbegin( ( IsUpper_v<MT4> )
8216 ?( ( IsLower_v<MT5> )
8217 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8218 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8219 :( IsLower_v<MT5> ? j : 0UL ) );
8220 const size_t kend( ( IsUpper_v<MT5> )?(
min( j+
SIMDSIZE*2UL, K ) ):( K ) );
8222 SIMDType xmm1, xmm2, xmm3, xmm4;
8225 for( ; (k+2UL) <= kend; k+=2UL ) {
8226 const SIMDType a1(
set( A(i,k ) ) );
8227 const SIMDType a2(
set( A(i,k+1UL) ) );
8228 xmm1 += a1 * B.load(k ,j );
8229 xmm2 += a1 * B.load(k ,j+
SIMDSIZE);
8230 xmm3 += a2 * B.load(k+1UL,j );
8231 xmm4 += a2 * B.load(k+1UL,j+
SIMDSIZE);
8234 for( ; k<kend; ++k ) {
8235 const SIMDType a1(
set( A(i,k) ) );
8236 xmm1 += a1 * B.load(k,j );
8240 C.store( i, j , C.load(i,j ) - (xmm1+xmm3) * factor );
// ---- Panel of a single SIMD vector (SIMDSIZE columns). Note that the upper row bound
// is only tightened when the result is declared both LOW and UPP.
8247 const size_t iend( LOW && UPP ?
min(j+
SIMDSIZE,M) : M );
8248 size_t i( LOW ? j : 0UL );
// Four rows per iteration, k unrolled by two (xmm5..xmm8 hold the k+1 contributions).
8250 for( ; (i+4UL) <= iend; i+=4UL )
8252 const size_t kbegin( ( IsUpper_v<MT4> )
8253 ?( ( IsLower_v<MT5> )
8254 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8255 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8256 :( IsLower_v<MT5> ? j : 0UL ) );
8257 const size_t kend( ( IsLower_v<MT4> )
8258 ?( IsStrictlyLower_v<MT4> ? i+3UL : i+4UL )
8261 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
8264 for( ; (k+2UL) <= kend; k+=2UL ) {
8265 const SIMDType b1( B.load(k ,j) );
8266 const SIMDType b2( B.load(k+1UL,j) );
8267 xmm1 +=
set( A(i ,k ) ) * b1;
8268 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8269 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8270 xmm4 +=
set( A(i+3UL,k ) ) * b1;
8271 xmm5 +=
set( A(i ,k+1UL) ) * b2;
8272 xmm6 +=
set( A(i+1UL,k+1UL) ) * b2;
8273 xmm7 +=
set( A(i+2UL,k+1UL) ) * b2;
8274 xmm8 +=
set( A(i+3UL,k+1UL) ) * b2;
8277 for( ; k<kend; ++k ) {
8278 const SIMDType b1( B.load(k,j) );
8279 xmm1 +=
set( A(i ,k) ) * b1;
8280 xmm2 +=
set( A(i+1UL,k) ) * b1;
8281 xmm3 +=
set( A(i+2UL,k) ) * b1;
8282 xmm4 +=
set( A(i+3UL,k) ) * b1;
8285 C.store( i , j, C.load(i ,j) - (xmm1+xmm5) * factor );
8286 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm6) * factor );
8287 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm7) * factor );
8288 C.store( i+3UL, j, C.load(i+3UL,j) - (xmm4+xmm8) * factor );
// Three rows per iteration, k unrolled by two.
8291 for( ; (i+3UL) <= iend; i+=3UL )
8293 const size_t kbegin( ( IsUpper_v<MT4> )
8294 ?( ( IsLower_v<MT5> )
8295 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8296 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8297 :( IsLower_v<MT5> ? j : 0UL ) );
8298 const size_t kend( ( IsLower_v<MT4> )
8299 ?( IsStrictlyLower_v<MT4> ? i+2UL : i+3UL )
8302 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
8305 for( ; (k+2UL) <= kend; k+=2UL ) {
8306 const SIMDType b1( B.load(k ,j) );
8307 const SIMDType b2( B.load(k+1UL,j) );
8308 xmm1 +=
set( A(i ,k ) ) * b1;
8309 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8310 xmm3 +=
set( A(i+2UL,k ) ) * b1;
8311 xmm4 +=
set( A(i ,k+1UL) ) * b2;
8312 xmm5 +=
set( A(i+1UL,k+1UL) ) * b2;
8313 xmm6 +=
set( A(i+2UL,k+1UL) ) * b2;
8316 for( ; k<kend; ++k ) {
8317 const SIMDType b1( B.load(k,j) );
8318 xmm1 +=
set( A(i ,k) ) * b1;
8319 xmm2 +=
set( A(i+1UL,k) ) * b1;
8320 xmm3 +=
set( A(i+2UL,k) ) * b1;
8323 C.store( i , j, C.load(i ,j) - (xmm1+xmm4) * factor );
8324 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm5) * factor );
8325 C.store( i+2UL, j, C.load(i+2UL,j) - (xmm3+xmm6) * factor );
// Two rows per iteration, k unrolled by two.
8328 for( ; (i+2UL) <= iend; i+=2UL )
8330 const size_t kbegin( ( IsUpper_v<MT4> )
8331 ?( ( IsLower_v<MT5> )
8332 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8333 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8334 :( IsLower_v<MT5> ? j : 0UL ) );
8335 const size_t kend( ( IsLower_v<MT4> )
8336 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8339 SIMDType xmm1, xmm2, xmm3, xmm4;
8342 for( ; (k+2UL) <= kend; k+=2UL ) {
8343 const SIMDType b1( B.load(k ,j) );
8344 const SIMDType b2( B.load(k+1UL,j) );
8345 xmm1 +=
set( A(i ,k ) ) * b1;
8346 xmm2 +=
set( A(i+1UL,k ) ) * b1;
8347 xmm3 +=
set( A(i ,k+1UL) ) * b2;
8348 xmm4 +=
set( A(i+1UL,k+1UL) ) * b2;
8351 for( ; k<kend; ++k ) {
8352 const SIMDType b1( B.load(k,j) );
8353 xmm1 +=
set( A(i ,k) ) * b1;
8354 xmm2 +=
set( A(i+1UL,k) ) * b1;
8357 C.store( i , j, C.load(i ,j) - (xmm1+xmm3) * factor );
8358 C.store( i+1UL, j, C.load(i+1UL,j) - (xmm2+xmm4) * factor );
// Single-row variant of the SIMDSIZE panel; here the k-loop runs up to K directly.
8363 const size_t kbegin( ( IsUpper_v<MT4> )
8364 ?( ( IsLower_v<MT5> )
8365 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8366 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8367 :( IsLower_v<MT5> ? j : 0UL ) );
8369 SIMDType xmm1, xmm2;
8372 for( ; (k+2UL) <= K; k+=2UL ) {
8373 xmm1 +=
set( A(i,k ) ) * B.load(k ,j);
8374 xmm2 +=
set( A(i,k+1UL) ) * B.load(k+1UL,j);
8378 xmm1 +=
set( A(i,k) ) * B.load(k,j);
8381 C.store( i, j, C.load(i,j) - (xmm1+xmm2) * factor );
// ---- Scalar pass over the columns not covered by SIMD panels; runs only when
// "remainder" is true. Uses plain element access instead of load/store.
8385 for( ; remainder && j<N; ++j )
8387 const size_t iend( UPP ? j+1UL : M );
8388 size_t i( LOW ? j : 0UL );
// Two rows per iteration.
8390 for( ; (i+2UL) <= iend; i+=2UL )
8392 const size_t kbegin( ( IsUpper_v<MT4> )
8393 ?( ( IsLower_v<MT5> )
8394 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8395 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8396 :( IsLower_v<MT5> ? j : 0UL ) );
8397 const size_t kend( ( IsLower_v<MT4> )
8398 ?( IsStrictlyLower_v<MT4> ? i+1UL : i+2UL )
8404 for(
size_t k=kbegin; k<kend; ++k ) {
8405 value1 += A(i ,k) * B(k,j);
8406 value2 += A(i+1UL,k) * B(k,j);
8409 C(i ,j) -= value1 * scalar;
8410 C(i+1UL,j) -= value2 * scalar;
// Single-row variant of the scalar pass.
8415 const size_t kbegin( ( IsUpper_v<MT4> )
8416 ?( ( IsLower_v<MT5> )
8417 ?(
max( ( IsStrictlyUpper_v<MT4> ? i+1UL : i ), j ) )
8418 :( IsStrictlyUpper_v<MT4> ? i+1UL : i ) )
8419 :( IsLower_v<MT5> ? j : 0UL ) );
8423 for(
size_t k=kbegin; k<K; ++k ) {
8424 value += A(i,k) * B(k,j);
8427 C(i,j) -= value * scalar;
// Vectorized small-matrix subtraction-assignment kernel for column-major targets
// (enabled for IsColumnMajorMatrix_v<MT3> together with UseVectorizedDefaultKernel_v).
// No explicit kernel is run here: one operand is copied into a temporary of its
// opposite storage order and the scaled product is re-dispatched through subAssign.
// Choice of the operand to convert:
//   - A if A's type is not resizable and B's is,
//   - B if A's type is resizable and B's is not,
//   - otherwise A when it has no more elements than B, else B.
// NOTE(review): braces and the final "else" line are not visible in this listing.
8448 template<
typename MT3
8452 static inline auto selectSmallSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8453 -> EnableIf_t< IsColumnMajorMatrix_v<MT3> && UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
// Functor applied to the product expression before scaling.
8460 const ForwardFunctor fwd;
8462 if( !IsResizable_v<MT4> && IsResizable_v<MT5> ) {
8463 const OppositeType_t<MT4> tmp(
serial( A ) );
8464 subAssign( C, fwd( tmp * B ) * scalar );
8466 else if( IsResizable_v<MT4> && !IsResizable_v<MT5> ) {
8467 const OppositeType_t<MT5> tmp(
serial( B ) );
8468 subAssign( C, fwd( A * tmp ) * scalar );
// Neither rule above applies: convert the operand with fewer (or equal) elements.
8470 else if( A.rows() * A.columns() <= B.rows() * B.columns() ) {
8471 const OppositeType_t<MT4> tmp(
serial( A ) );
8472 subAssign( C, fwd( tmp * B ) * scalar );
8475 const OppositeType_t<MT5> tmp(
serial( B ) );
8476 subAssign( C, fwd( A * tmp ) * scalar );
// Large-matrix subtraction-assignment kernel used when the vectorized default kernel
// is NOT applicable (DisableIf_t on UseVectorizedDefaultKernel_v): it forwards to
// selectDefaultSubAssignKernel with the same arguments.
8495 template<
typename MT3
8499 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8500 -> DisableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8502 selectDefaultSubAssignKernel( C, A, B, scalar );
// Large-matrix subtraction-assignment kernel for the vectorized case. The subtraction
// is expressed through the lmmm/ummm/mmm kernels by passing the negated scalar
// together with ST2(1) as the trailing argument.
// NOTE(review): the conditions selecting between lmmm, ummm and mmm are not visible
// in this listing (presumably based on the LOW/UPP flags) - confirm against the header.
8521 template<
typename MT3
8525 static inline auto selectLargeSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8526 -> EnableIf_t< UseVectorizedDefaultKernel_v<MT3,MT4,MT5,ST2> >
8529 lmmm( C, A, B, -scalar, ST2(1) );
8531 ummm( C, A, B, -scalar, ST2(1) );
8533 mmm( C, A, B, -scalar, ST2(1) );
// BLAS-path entry used when the BLAS kernel is NOT applicable (DisableIf_t on
// UseBlasKernel_v): it falls back to selectLargeSubAssignKernel.
8551 template<
typename MT3
8555 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8556 -> DisableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
8558 selectLargeSubAssignKernel( C, A, B, scalar );
// BLAS-based subtraction-assignment kernel; compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION are enabled, and selected via UseBlasKernel_v.
//   - Triangular A: a copy of B is multiplied in place via trmm (left side, scaled by
//     "scalar") and the result is subtracted from C.
//   - Triangular B: a copy of A is multiplied in place via trmm (right side) and subtracted.
//   - Otherwise: gemm is called directly on C with the factors ET(-scalar) and ET(1).
// NOTE(review): braces, the final "else" and the matching #endif are not visible here.
8563 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_MATRIX_MULTIPLICATION 8577 template<
typename MT3
8581 static inline auto selectBlasSubAssignKernel( MT3& C,
const MT4& A,
const MT5& B, ST2 scalar )
8582 -> EnableIf_t< UseBlasKernel_v<MT3,MT4,MT5,ST2> >
// Element type of the target; the scalar is converted to it for the BLAS calls.
8584 using ET = ElementType_t<MT3>;
8586 if( IsTriangular_v<MT4> ) {
8587 ResultType_t<MT3> tmp(
serial( B ) );
8588 trmm( tmp, A, CblasLeft, ( IsLower_v<MT4> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8589 subAssign( C, tmp );
8591 else if( IsTriangular_v<MT5> ) {
8592 ResultType_t<MT3> tmp(
serial( A ) );
8593 trmm( tmp, B, CblasRight, ( IsLower_v<MT5> )?( CblasLower ):( CblasUpper ),
ET(scalar) );
8594 subAssign( C, tmp );
8597 gemm( C, A, B,
ET(-scalar),
ET(1) );
// Symmetry-exploiting overload (enabled via CanExploitSymmetry_v<MT,MT1,MT2>): the scaled
// product is re-expressed with transposed symmetric operand(s) and passed to subAssign.
//   - both operands symmetric: trans(left) * trans(right)
//   - only the left operand symmetric: trans(left) * right
//   - otherwise: left * trans(right)
// NOTE(review): the line declaring this function's name and parameters is missing from
// this listing; the body calls subAssign, so this is presumably the subAssign overload.
8617 template<
typename MT >
8619 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8628 const ForwardFunctor fwd;
// Operands of the wrapped matrix product.
8630 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8631 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8633 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8634 subAssign( ~lhs, fwd(
trans( left ) *
trans( right ) ) * rhs.scalar_ );
8635 else if( IsSymmetric_v<MT1> )
8636 subAssign( ~lhs, fwd(
trans( left ) * right ) * rhs.scalar_ );
8638 subAssign( ~lhs, fwd( left *
trans( right ) ) * rhs.scalar_ );
// Schur-product assignment of a scaled matrix product to a dense matrix: forwards a
// temporary "tmp" to schurAssign on the target.
// NOTE(review): the definition of "tmp" is not visible in this listing (presumably the
// evaluated right-hand side expression) - confirm against the original header.
8658 template<
typename MT
8660 friend inline void schurAssign( DenseMatrix<MT,SO>& lhs,
const DMatScalarMultExpr& rhs )
8672 schurAssign( ~lhs, tmp );
// Assignment overload enabled via IsEvaluationRequired_v<MT,MT1,MT2>.
// NOTE(review): the line declaring this function's name is missing from this listing; by
// position it appears to be the SMP assignment to a dense matrix - confirm.
// Visible logic: fetch both operands of the wrapped product, then treat an empty target
// and an empty inner dimension (left.columns() == 0) as two separate early cases.
8703 template<
typename MT
8706 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8713 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8714 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8716 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL ) {
8719 else if( left.columns() == 0UL ) {
// Assignment overload enabled via IsEvaluationRequired_v<MT,MT1,MT2> that first
// evaluates the expression into a temporary.
// NOTE(review): the line declaring this function's name and target type is missing from
// this listing - confirm against the original header.
8753 template<
typename MT
8756 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
// Temporary type depends on the storage-order flag SO: OppositeType if set, else ResultType.
8760 using TmpType = If_t< SO, OppositeType, ResultType >;
8772 const ForwardFunctor fwd;
// Evaluate the complete right-hand side expression into the temporary.
8774 const TmpType tmp( rhs );
// Symmetry-exploiting overload (enabled via CanExploitSymmetry_v<MT,MT1,MT2>); it
// distinguishes the cases "both operands symmetric" and "left operand symmetric".
// NOTE(review): the declaration line and the statements executed in each branch are
// missing from this listing - confirm which assignment function this belongs to.
8793 template<
typename MT >
8795 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8804 const ForwardFunctor fwd;
8806 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8807 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8809 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8811 else if( IsSymmetric_v<MT1> )
// Assignment overload enabled via IsEvaluationRequired_v<MT,MT1,MT2>.
// NOTE(review): the line declaring this function's name is missing from this listing; by
// position it appears to be the SMP addition assignment - confirm.
// Visible logic: an empty target or an empty inner dimension share one early-exit check.
8833 template<
typename MT
8836 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8843 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8844 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8846 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Symmetry-exploiting overload (enabled via CanExploitSymmetry_v<MT,MT1,MT2>); it
// distinguishes the cases "both operands symmetric" and "left operand symmetric".
// NOTE(review): the declaration line and the statements executed in each branch are
// missing from this listing - confirm which assignment function this belongs to.
8878 template<
typename MT >
8880 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8889 const ForwardFunctor fwd;
8891 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8892 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8894 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8896 else if( IsSymmetric_v<MT1> )
// Assignment overload enabled via IsEvaluationRequired_v<MT,MT1,MT2>.
// NOTE(review): the line declaring this function's name is missing from this listing; by
// position it appears to be the SMP subtraction assignment - confirm.
// Visible logic: an empty target or an empty inner dimension share one early-exit check.
8922 template<
typename MT
8925 -> EnableIf_t< IsEvaluationRequired_v<MT,MT1,MT2> >
8932 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8933 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8935 if( (~lhs).rows() == 0UL || (~lhs).
columns() == 0UL || left.columns() == 0UL ) {
// Symmetry-exploiting overload (enabled via CanExploitSymmetry_v<MT,MT1,MT2>); it
// distinguishes the cases "both operands symmetric" and "left operand symmetric".
// NOTE(review): the declaration line and the statements executed in each branch are
// missing from this listing - confirm which assignment function this belongs to.
8967 template<
typename MT >
8969 -> EnableIf_t< CanExploitSymmetry_v<MT,MT1,MT2> >
8978 const ForwardFunctor fwd;
8980 LeftOperand_t<MMM> left ( rhs.matrix_.leftOperand() );
8981 RightOperand_t<MMM> right( rhs.matrix_.rightOperand() );
8983 if( IsSymmetric_v<MT1> && IsSymmetric_v<MT2> )
8985 else if( IsSymmetric_v<MT1> )
9008 template<
typename MT
// Free function template that builds and returns a ReturnType expression object from
// the two operands (~lhs, ~rhs).
// NOTE(review): the line with the function name and parameters, as well as the
// definition of ReturnType, are missing from this listing; presumably this is the
// dense matrix/dense matrix multiplication operator - confirm.
9088 template<
typename MT1
9090 inline decltype(
auto)
9100 return ReturnType( ~lhs, ~rhs );
// Declares the given matrix multiplication expression as symmetric: returns a new
// DMatDMatMultExpr over the same operands with the SF flag set to true and the
// HF, LF and UF flags carried over unchanged.
// NOTE(review): any argument checks between the signature and the alias are not
// visible in this listing.
9136 template<
typename MT1
9142 inline decltype(
auto)
declsym( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9150 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,true,HF,LF,UF>;
9151 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix multiplication expression as Hermitian: returns a new
// DMatDMatMultExpr over the same operands with the HF flag set to true and the
// SF, LF and UF flags carried over unchanged.
// NOTE(review): any argument checks between the signature and the alias are not
// visible in this listing.
9180 template<
typename MT1
9186 inline decltype(
auto)
declherm( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9194 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,true,LF,UF>;
9195 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix multiplication expression as lower: returns a new
// DMatDMatMultExpr over the same operands with the LF flag set to true and the
// SF, HF and UF flags carried over unchanged.
// NOTE(review): any argument checks between the signature and the alias are not
// visible in this listing.
9224 template<
typename MT1
9230 inline decltype(
auto)
decllow( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9238 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,true,UF>;
9239 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix multiplication expression as upper: returns a new
// DMatDMatMultExpr over the same operands with the UF flag set to true and the
// SF, HF and LF flags carried over unchanged.
// NOTE(review): any argument checks between the signature and the alias are not
// visible in this listing.
9268 template<
typename MT1
9274 inline decltype(
auto)
declupp( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9282 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,true>;
9283 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Declares the given matrix multiplication expression as diagonal: returns a new
// DMatDMatMultExpr over the same operands with BOTH the LF and UF flags set to true
// (lower and upper at the same time); SF and HF are carried over unchanged.
// NOTE(review): any argument checks between the signature and the alias are not
// visible in this listing.
9312 template<
typename MT1
9318 inline decltype(
auto)
decldiag( const DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>& dm )
9326 using ReturnType =
const DMatDMatMultExpr<MT1,MT2,SF,HF,true,true>;
9327 return ReturnType( dm.leftOperand(), dm.rightOperand() );
// Size specialization for dimension 0 of a DMatDMatMultExpr: inherits the value of
// Size<MT1,0UL>, i.e. the dimension-0 size of the left-hand side operand type.
9343 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9344 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 0UL >
9345 :
public Size<MT1,0UL>
// Size specialization for dimension 1 of a DMatDMatMultExpr: inherits the value of
// Size<MT2,1UL>, i.e. the dimension-1 size of the right-hand side operand type.
9348 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9349 struct Size< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF>, 1UL >
9350 :
public Size<MT2,1UL>
// IsAligned specialization for DMatDMatMultExpr: the expression is reported as aligned
// only if both operand types MT1 and MT2 are aligned.
9366 template<
typename MT1,
typename MT2,
bool SF,
bool HF,
bool LF,
bool UF >
9367 struct IsAligned< DMatDMatMultExpr<MT1,MT2,SF,HF,LF,UF> >
9368 :
public BoolConstant< IsAligned_v<MT1> && IsAligned_v<MT2> >
DMatDMatMultExpr(const MT1 &lhs, const MT2 &rhs) noexcept
Constructor for the DMatDMatMultExpr class.
Definition: DMatDMatMultExpr.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatScalarMultExpr.h:427
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the decldiag trait.
Constraint on the data type.
Header file for kernel specific block sizes.
decltype(auto) decldiag(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as diagonal.
Definition: DMatDeclDiagExpr.h:975
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatDMatMultExpr.h:421
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDMatMultExpr.h:291
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDMatMultExpr.h:316
Header file for basic type definitions.
Header file for the SparseVector base class.
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
LeftOperand lhs_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:497
Header file for the declherm trait.
static constexpr bool evaluateRight
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:172
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Base template for the DeclUppTrait class.
Definition: DeclUppTrait.h:134
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatDMatMultExpr.h:287
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDMatMultExpr.h:309
size_t columns() const noexcept
Returns the current number of columns of the matrix.
Definition: DMatScalarMultExpr.h:533
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the DeclUpp functor.
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
Header file for the IsColumnMajorMatrix type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DMatScalarMultExpr.h:606
Header file for the dense matrix multiplication kernels.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDMatMultExpr.h:288
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatScalarMultExpr.h:523
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDMatMultExpr.h:465
Header file for the IsIntegral type trait.
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
Header file for the DenseVector base class.
static constexpr bool SYM
Flag for symmetric matrices.
Definition: DMatDMatMultExpr.h:176
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
CompositeType_t< MT2 > CT2
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:162
decltype(auto) declupp(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as upper.
Definition: DMatDeclUppExpr.h:1001
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatScalarMultExpr.h:597
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DMatScalarMultExpr.h:159
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Expression object for dense matrix-dense matrix multiplications. The DMatDMatMultExpr class represents...
Definition: DMatDMatMultExpr.h:151
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatScalarMultExpr.h:432
Header file for the IsBLASCompatible type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:441
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
Header file for the IsComplexDouble type trait.
Constraint on the data type.
static constexpr bool evaluateLeft
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:167
Header file for the generic max algorithm.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatScalarMultExpr.h:565
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsSymmetric type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the DeclLow functor.
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
If_t< evaluateLeft, const RT1, CT1 > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:301
ResultType_t< MT2 > RT2
Result type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:158
Generic wrapper for the decllow() function.
Definition: DeclLow.h:59
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
Header file for the decllow trait.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ElementType_t< RT2 > ET2
Element type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:160
Header file for the HasSIMDAdd type trait.
Header file for the DenseMatrix base class.
static constexpr bool LOW
Flag for lower matrices.
Definition: DMatDMatMultExpr.h:178
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DMatScalarMultExpr.h:165
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDMatMultExpr.h:485
Header file for all SIMD functionality.
If_t< useAssign, const ResultType, const DMatScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DMatScalarMultExpr.h:168
If_t< evaluateRight, const RT2, CT2 > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:304
ElementType_t< RT1 > ET1
Element type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:159
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
decltype(auto) decllow(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as lower.
Definition: DMatDeclLowExpr.h:1001
Header file for the IsLower type trait.
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatDMatMultExpr.h:346
Header file for the IsAligned type trait.
ResultType_t< MT1 > RT1
Result type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:157
Header file for the IsStrictlyTriangular type trait.
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DMatScalarMultExpr.h:553
Generic wrapper for the null function.
Definition: Noop.h:60
Header file for the IsTriangular type trait.
Base template for the DeclSymTrait class.
Definition: DeclSymTrait.h:134
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatScalarMultExpr.h:162
Constraints on the storage order of matrix types.
DenseMatrix< This, SO > BaseType
Base type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:158
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
LeftOperand matrix_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatScalarMultExpr.h:605
MT::Iterator end(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator just past the last element of row/column i.
Definition: Matrix.h:438
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDMatMultExpr.h:290
Header file for the DeclDiag functor.
Constraint on the data type.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/matrix ...
Definition: MatMatMultExpr.h:103
OppositeType_t< ResultType > OppositeType
Result type with opposite storage order for expression template evaluations.
Definition: DMatScalarMultExpr.h:160
typename T::OppositeType OppositeType_t
Alias declaration for nested OppositeType type definitions. The OppositeType_t alias declaration provi...
Definition: Aliases.h:270
Header file for the conjugate shim.
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatScalarMultExpr.h:469
CompositeType_t< MT1 > CT1
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:161
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
Header file for the declupp trait.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDMatMultExpr.h:322
Base class for all matrix/matrix multiplication expression templates. The MatMatMultExpr class serves ...
Definition: MatMatMultExpr.h:67
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type,...
Definition: Symmetric.h:79
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatScalarMultExpr.h:161
Header file for the MatScalarMultExpr base class.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DMatScalarMultExpr.h:174
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
ReturnType at(size_t i, size_t j) const
Checked access to the matrix elements.
Definition: DMatDMatMultExpr.h:395
Header file for run time assertion macros.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDMatMultExpr.h:289
Base template for the DeclHermTrait class.
Definition: DeclHermTrait.h:134
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
Base template for the MultTrait class.
Definition: MultTrait.h:146
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool HERM
Flag for Hermitian matrices.
Definition: DMatDMatMultExpr.h:177
typename If_t< HERM, DeclHermTrait< MultTrait_t< RT1, RT2 > >, If_t< SYM, DeclSymTrait< MultTrait_t< RT1, RT2 > >, If_t< LOW, If_t< UPP, DeclDiagTrait< MultTrait_t< RT1, RT2 > >, DeclLowTrait< MultTrait_t< RT1, RT2 > > >, If_t< UPP, DeclUppTrait< MultTrait_t< RT1, RT2 > >, MultTrait< RT1, RT2 > > > > >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDMatMultExpr.h:285
Header file for the IsContiguous type trait.
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatScalarMultExpr.h:422
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
Header file for the declsym trait.
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
decltype(auto) declsym(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as symmetric.
Definition: DMatDeclSymExpr.h:1002
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Constraint on the data type.
Constraints on the storage order of matrix types.
Generic wrapper for the declherm() function.
Definition: DeclHerm.h:59
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
If_t< IsExpression_v< MT2 >, const MT2, const MT2 & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:298
Header file for the HasMutableDataAccess type trait.
Header file for the Noop functor.
ResultType_t< MT > RT
Result type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDMatMultExpr.h:292
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDMatMultExpr.h:453
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
auto smpSchurAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP Schur product assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:194
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
Header file for BLAS triangular matrix/matrix multiplication functions (trmm)
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatScalarMultExpr.h:577
#define BLAZE_CONSTRAINT_MATRICES_MUST_HAVE_SAME_STORAGE_ORDER(T1, T2)
Constraint on the data type. In case either of the two given data types T1 or T2 is not a matrix type ...
Definition: StorageOrder.h:84
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDMatMultExpr.h:431
Generic wrapper for the declupp() function.
Definition: DeclUpp.h:59
decltype(auto) trans(const DenseMatrix< MT, SO > &dm)
Calculation of the transpose of the given dense matrix.
Definition: DMatTransExpr.h:765
Base template for the DeclLowTrait class.
Definition: DeclLowTrait.h:134
decltype(auto) declherm(const DenseMatrix< MT, SO > &dm)
Declares the given dense matrix expression dm as Hermitian.
Definition: DMatDeclHermExpr.h:1002
Header file for the IsRowMajorMatrix type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDMatMultExpr.h:475
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
If_t< IsExpression_v< MT1 >, const MT1, const MT1 & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDMatMultExpr.h:295
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
ReturnType operator()(size_t i, size_t j) const
2D-access to the matrix elements.
Definition: DMatScalarMultExpr.h:454
DMatScalarMultExpr(const MT &matrix, ST scalar) noexcept
Constructor for the DMatScalarMultExpr class.
Definition: DMatScalarMultExpr.h:441
Header file for BLAS general matrix/matrix multiplication functions (gemm)
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Generic wrapper for the decldiag() function.
Definition: DeclDiag.h:59
Header file for the IsComplex type trait.
Header file for the DeclHerm functor.
Header file for the complex data type.
DMatScalarMultExpr< MT, ST, SO > This
Type of this DMatScalarMultExpr instance.
Definition: DMatScalarMultExpr.h:157
size_t rows() const noexcept
Returns the current number of rows of the matrix.
Definition: DMatDMatMultExpr.h:411
static constexpr bool UPP
Flag for upper matrices.
Definition: DMatDMatMultExpr.h:179
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatScalarMultExpr.h:587
ElementType_t< MT > ET
Element type of the dense matrix expression.
Definition: DMatScalarMultExpr.h:107
Header file for the IsUpper type trait.
typename DisableIf< Condition, T >::Type DisableIf_t
Auxiliary type for the DisableIf class template. The DisableIf_t alias declaration provides a convenie...
Definition: DisableIf.h:138
decltype(auto) conj(const DenseMatrix< MT, SO > &dm)
Returns a matrix containing the complex conjugate of each single element of dm.
Definition: DMatMapExpr.h:1324
System settings for the debugging policy of the Blaze library.
Constraint on the data type.
Generic wrapper for the declsym() function.
Definition: DeclSym.h:59
Base template for the DeclDiagTrait class.
Definition: DeclDiagTrait.h:134
bool isSquare(const Matrix< MT, SO > &matrix) noexcept
Checks if the given matrix is a square matrix.
Definition: Matrix.h:951
Header file for the IsResizable type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatScalarMultExpr.h:543
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatScalarMultExpr.h:171
RightOperand rhs_
Right-hand side dense matrix of the multiplication expression.
Definition: DMatDMatMultExpr.h:498
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression,...
Definition: Assert.h:101
Header file for the DeclSym functor.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.