35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 121 template<
typename VT
123 class TDVecDMatMultExpr
124 :
public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
125 ,
private Computation
139 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
145 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
146 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
155 template<
typename T1 >
165 template<
typename T1,
typename T2,
typename T3 >
166 static constexpr
bool UseBlasKernel_v =
168 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
169 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
170 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
173 IsBLASCompatible_v< ElementType_t<T1> > &&
174 IsBLASCompatible_v< ElementType_t<T2> > &&
175 IsBLASCompatible_v< ElementType_t<T3> > &&
187 template<
typename T1,
typename T2,
typename T3 >
188 static constexpr
bool UseVectorizedDefaultKernel_v =
189 ( useOptimizedKernels &&
191 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
192 IsSIMDCombinable_v< ElementType_t<T1>
227 ( !IsDiagonal_v<MT> &&
228 VT::simdEnabled && MT::simdEnabled &&
229 HasSIMDAdd_v<VET,MET> &&
230 HasSIMDMult_v<VET,MET> );
265 if( IsDiagonal_v<MT> )
267 return vec_[index] *
mat_(index,index);
269 else if( IsLower_v<MT> && ( index > 8UL ) )
271 const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
276 else if( IsUpper_v<MT> && ( index + 8UL <
mat_.rows() ) )
278 const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
297 if( index >=
mat_.columns() ) {
300 return (*
this)[index];
//! Returns the current size/dimension of the resulting transpose vector,
//! which equals the number of columns of the matrix operand.
309 inline size_t size() const noexcept {
310 return mat_.columns();
//! Returns whether the expression can alias with the given address \a alias,
//! i.e. whether either the vector operand or the matrix operand references it.
340 template<
typename T >
341 inline bool canAlias(
const T* alias )
const noexcept {
342 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
//! Returns whether the expression is aliased with the given address \a alias.
//! The implementation is identical to canAlias() above: it checks both the
//! vector and the matrix operand.
352 template<
typename T >
353 inline bool isAliased(
const T* alias )
const noexcept {
354 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
364 return vec_.isAligned() &&
mat_.isAligned();
378 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
402 template<
typename VT1 >
409 if( rhs.mat_.rows() == 0UL ) {
413 else if( rhs.mat_.columns() == 0UL ) {
425 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
//! Kernel dispatch for the assignment y = x * A: diagonal matrices and
//! problems below TDVECDMATMULT_THRESHOLD (measured as rows*columns) go to
//! the small kernel; everything else goes to the BLAS-based kernel.
441 template<
typename VT1
444 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
446 if( ( IsDiagonal_v<MT1> ) ||
448 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449 selectSmallAssignKernel( y, x, A );
451 selectBlasAssignKernel( y, x, A );
//! Default (scalar) kernel for the assignment y = x * A. The upper/lower/
//! strictly-triangular traits of the matrix are used to restrict the column
//! range [jbegin, jend) contributed by each row i, and the inner accumulation
//! loop is unrolled by two (jpos = jbegin + (jnum & size_t(-2))).
//! NOTE(review): several interior lines are elided in this view; the range
//! bounds for the non-triangular branches are not visible here.
470 template<
typename VT1
473 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
475 const size_t M( A.rows() );
476 const size_t N( A.columns() );
478 if( IsStrictlyUpper_v<MT1> ) {
482 if( !IsLower_v<MT1> )
484 const size_t jbegin( IsStrictlyUpper_v<MT1> ? 1UL : 0UL );
485 for(
size_t j=jbegin; j<N; ++j ) {
486 y[j] = x[0UL] * A(0UL,j);
490 for(
size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
492 if( IsDiagonal_v<MT1> )
494 y[i] = x[i] * A(i,i);
498 const size_t jbegin( ( IsUpper_v<MT1> )
499 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
501 const size_t jend( ( IsLower_v<MT1> )
502 ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
506 const size_t jnum( jend - jbegin );
507 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
509 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
510 y[j ] += x[i] * A(i,j );
511 y[j+1UL] += x[i] * A(i,j+1UL);
514 y[jpos] += x[i] * A(i,jpos);
516 if( IsLower_v<MT1> ) {
517 y[jend] = x[i] * A(i,jend);
522 if( IsStrictlyLower_v<MT1> ) {
//! Small-kernel fallback: when the vectorized default kernel is not
//! available (DisableIf on UseVectorizedDefaultKernel_v), the assignment
//! is delegated to the scalar default kernel.
543 template<
typename VT1
546 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
547 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
549 selectDefaultAssignKernel( y, x, A );
568 template<
typename VT1
571 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
572 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
574 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
576 const size_t M( A.rows() );
577 const size_t N( A.columns() );
579 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
586 const size_t ibegin( ( IsLower_v<MT1> )
587 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
589 const size_t iend( ( IsUpper_v<MT1> )
590 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
594 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
596 for(
size_t i=ibegin; i<iend; ++i ) {
598 xmm1 += x1 * A.load(i,j );
600 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
601 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
602 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
603 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
604 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
605 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
620 const size_t ibegin( ( IsLower_v<MT1> )
621 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
623 const size_t iend( ( IsUpper_v<MT1> )
624 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
630 for(
size_t i=ibegin; i<iend; ++i ) {
632 xmm1 += x1 * A.load(i,j );
634 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
635 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
646 const size_t ibegin( ( IsLower_v<MT1> )
647 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
649 const size_t iend( ( IsUpper_v<MT1> )
650 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
656 for(
size_t i=ibegin; i<iend; ++i ) {
658 xmm1 += x1 * A.load(i,j );
660 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
670 const size_t ibegin( ( IsLower_v<MT1> )
671 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
673 const size_t iend( ( IsUpper_v<MT1> )
674 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
680 for(
size_t i=ibegin; i<iend; ++i ) {
682 xmm1 += x1 * A.load(i,j );
692 const size_t ibegin( ( IsLower_v<MT1> )
693 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
695 const size_t iend( ( IsUpper_v<MT1> )
696 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
702 for(
size_t i=ibegin; i<iend; ++i ) {
703 xmm1 +=
set( x[i] ) * A.load(i,j);
709 for( ; remainder && j<N; ++j )
711 const size_t ibegin( ( IsLower_v<MT1> )
712 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
714 const size_t iend( ( IsUpper_v<MT1> )
715 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
721 for(
size_t i=ibegin; i<iend; ++i ) {
722 value += x[i] * A(i,j);
//! Large-kernel fallback: without SIMD support the assignment of large
//! operands is also delegated to the scalar default kernel.
745 template<
typename VT1
748 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
749 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
751 selectDefaultAssignKernel( y, x, A );
770 template<
typename VT1
773 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
774 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
776 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
778 const size_t M( A.rows() );
779 const size_t N( A.columns() );
781 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
782 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
788 for(
size_t jj=0U; jj<N; jj+=jblock ) {
789 for(
size_t ii=0UL; ii<M; ii+=iblock )
791 const size_t iend(
min( ii+iblock, M ) );
792 const size_t jtmp(
min( jj+jblock, N ) );
793 const size_t jend( ( IsLower_v<MT1> )
794 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
797 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
800 size_t j( ( IsUpper_v<MT1> )
801 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
806 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
808 for(
size_t i=ii; i<iend; ++i ) {
810 xmm1 += x1 * A.load(i,j );
812 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
813 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
814 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
815 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
816 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
817 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
820 y.store( j , y.load(j ) + xmm1 );
834 for(
size_t i=ii; i<iend; ++i ) {
836 xmm1 += x1 * A.load(i,j );
838 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
839 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
842 y.store( j , y.load(j ) + xmm1 );
852 for(
size_t i=ii; i<iend; ++i ) {
854 xmm1 += x1 * A.load(i,j );
856 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
859 y.store( j , y.load(j ) + xmm1 );
868 for(
size_t i=ii; i<iend; ++i ) {
870 xmm1 += x1 * A.load(i,j );
874 y.store( j , y.load(j ) + xmm1 );
882 for(
size_t i=ii; i<iend; ++i ) {
883 xmm1 +=
set( x[i] ) * A.load(i,j);
886 y.store( j, y.load(j) + xmm1 );
889 for( ; remainder && j<jend; ++j )
893 for(
size_t i=ii; i<iend; ++i ) {
894 value += x[i] * A(i,j);
//! BLAS-kernel fallback: if the operands are not BLAS compatible
//! (DisableIf on UseBlasKernel_v), the large vectorized kernel is used.
919 template<
typename VT1
922 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
923 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
925 selectLargeAssignKernel( y, x, A );
//! BLAS-based assignment kernel (compiled only when BLAS mode and the
//! matrix/vector multiplication via BLAS are enabled): triangular matrices
//! use trmv(), the general case uses gemv() with alpha=ET(1), beta=ET(0),
//! i.e. a plain (overwriting) assignment.
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 945 template<
typename VT1
948 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
949 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
951 using ET = ElementType_t<VT1>;
953 if( IsTriangular_v<MT1> ) {
955 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
958 gemv( y, x, A, ET(1), ET(0) );
978 template<
typename VT1 >
979 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1008 template<
typename VT1 >
1009 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1015 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1027 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
//! Kernel dispatch for the addition assignment y += x * A; mirrors
//! selectAssignKernel(): diagonal or small problems use the small kernel,
//! otherwise the BLAS-based kernel is chosen.
1043 template<
typename VT1
1046 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1048 if( ( IsDiagonal_v<MT1> ) ||
1050 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051 selectSmallAddAssignKernel( y, x, A );
1053 selectBlasAddAssignKernel( y, x, A );
//! Default (scalar) kernel for the addition assignment y += x * A.
//! For diagonal matrices each element contributes a single product;
//! otherwise the column range per row is restricted via the triangular
//! traits and the inner loop is unrolled by two.
1072 template<
typename VT1
1075 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1077 const size_t M( A.rows() );
1078 const size_t N( A.columns() );
1080 for(
size_t i=0UL; i<M; ++i )
1082 if( IsDiagonal_v<MT1> )
1084 y[i] += x[i] * A(i,i);
1088 const size_t jbegin( ( IsUpper_v<MT1> )
1089 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1091 const size_t jend( ( IsLower_v<MT1> )
1092 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1096 const size_t jnum( jend - jbegin );
1097 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1099 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1100 y[j ] += x[i] * A(i,j );
1101 y[j+1UL] += x[i] * A(i,j+1UL);
1104 y[jpos] += x[i] * A(i,jpos);
//! Small-kernel fallback for the addition assignment: without SIMD
//! support the work is delegated to the scalar default kernel.
1126 template<
typename VT1
1129 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1130 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1132 selectDefaultAddAssignKernel( y, x, A );
1151 template<
typename VT1
1154 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1155 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1157 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1159 const size_t M( A.rows() );
1160 const size_t N( A.columns() );
1162 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
1169 const size_t ibegin( ( IsLower_v<MT1> )
1170 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1172 const size_t iend( ( IsUpper_v<MT1> )
1173 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1186 for(
size_t i=ibegin; i<iend; ++i ) {
1188 xmm1 += x1 * A.load(i,j );
1189 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1190 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1191 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1192 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1193 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1194 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1195 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1198 y.store( j , xmm1 );
1210 const size_t ibegin( ( IsLower_v<MT1> )
1211 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1213 const size_t iend( ( IsUpper_v<MT1> )
1214 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1223 for(
size_t i=ibegin; i<iend; ++i ) {
1225 xmm1 += x1 * A.load(i,j );
1226 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1227 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1228 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1231 y.store( j , xmm1 );
1239 const size_t ibegin( ( IsLower_v<MT1> )
1240 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1242 const size_t iend( ( IsUpper_v<MT1> )
1243 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1251 for(
size_t i=ibegin; i<iend; ++i ) {
1253 xmm1 += x1 * A.load(i,j );
1254 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1255 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1258 y.store( j , xmm1 );
1265 const size_t ibegin( ( IsLower_v<MT1> )
1266 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1268 const size_t iend( ( IsUpper_v<MT1> )
1269 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1276 for(
size_t i=ibegin; i<iend; ++i ) {
1278 xmm1 += x1 * A.load(i,j );
1282 y.store( j , xmm1 );
1288 const size_t ibegin( ( IsLower_v<MT1> )
1289 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1291 const size_t iend( ( IsUpper_v<MT1> )
1292 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1298 for(
size_t i=ibegin; i<iend; ++i ) {
1299 xmm1 +=
set( x[i] ) * A.load(i,j);
1305 for( ; remainder && j<N; ++j )
1307 const size_t ibegin( ( IsLower_v<MT1> )
1308 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1310 const size_t iend( ( IsUpper_v<MT1> )
1311 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1317 for(
size_t i=ibegin; i<iend; ++i ) {
1318 value += x[i] * A(i,j);
//! Large-kernel fallback for the addition assignment: without SIMD
//! support the work is delegated to the scalar default kernel.
1341 template<
typename VT1
1344 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1345 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1347 selectDefaultAddAssignKernel( y, x, A );
1366 template<
typename VT1
1369 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1370 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1372 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1374 const size_t M( A.rows() );
1375 const size_t N( A.columns() );
1377 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1378 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1382 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1383 for(
size_t ii=0UL; ii<M; ii+=iblock )
1385 const size_t iend(
min( ii+iblock, M ) );
1386 const size_t jtmp(
min( jj+jblock, N ) );
1387 const size_t jend( ( IsLower_v<MT1> )
1388 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1391 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1394 size_t j( ( IsUpper_v<MT1> )
1395 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
1400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1402 for(
size_t i=ii; i<iend; ++i ) {
1404 xmm1 += x1 * A.load(i,j );
1405 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1406 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1407 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1408 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1409 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1410 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1411 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1414 y.store( j , y.load(j ) + xmm1 );
1428 for(
size_t i=ii; i<iend; ++i ) {
1430 xmm1 += x1 * A.load(i,j );
1431 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1432 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1433 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1436 y.store( j , y.load(j ) + xmm1 );
1446 for(
size_t i=ii; i<iend; ++i ) {
1448 xmm1 += x1 * A.load(i,j );
1449 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1450 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1453 y.store( j , y.load(j ) + xmm1 );
1462 for(
size_t i=ii; i<iend; ++i ) {
1464 xmm1 += x1 * A.load(i,j );
1468 y.store( j , y.load(j ) + xmm1 );
1476 for(
size_t i=ii; i<iend; ++i ) {
1477 xmm1 +=
set( x[i] ) * A.load(i,j);
1480 y.store( j, y.load(j) + xmm1 );
1483 for( ; remainder && j<jend; ++j )
1487 for(
size_t i=ii; i<iend; ++i ) {
1488 value += x[i] * A(i,j);
//! BLAS-kernel fallback for the addition assignment: non-BLAS-compatible
//! operands are handled by the large vectorized kernel instead.
1513 template<
typename VT1
1516 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1517 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1519 selectLargeAddAssignKernel( y, x, A );
//! BLAS-based addition assignment kernel: for triangular matrices the
//! product is computed into a temporary via trmv() (on a serial() copy of x,
//! since trmv works in place) and then added to y; the general case uses
//! gemv() with alpha=ET(1), beta=ET(1) to accumulate directly into y.
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1539 template<
typename VT1
1542 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1543 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1545 using ET = ElementType_t<VT1>;
1547 if( IsTriangular_v<MT1> ) {
1548 ResultType_t<VT1> tmp(
serial( x ) );
1549 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1550 addAssign( y, tmp );
1553 gemv( y, x, A, ET(1), ET(1) );
1577 template<
typename VT1 >
1578 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1584 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1596 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
//! Kernel dispatch for the subtraction assignment y -= x * A; mirrors
//! the assign/addAssign dispatch: diagonal or small problems use the
//! small kernel, otherwise the BLAS-based kernel.
1612 template<
typename VT1
1615 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1617 if( ( IsDiagonal_v<MT1> ) ||
1619 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620 selectSmallSubAssignKernel( y, x, A );
1622 selectBlasSubAssignKernel( y, x, A );
//! Default (scalar) kernel for the subtraction assignment y -= x * A.
//! Structurally identical to selectDefaultAddAssignKernel(), but subtracts
//! each product; the column range per row is restricted via the triangular
//! traits and the inner loop is unrolled by two.
1641 template<
typename VT1
1644 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1646 const size_t M( A.rows() );
1647 const size_t N( A.columns() );
1649 for(
size_t i=0UL; i<M; ++i )
1651 if( IsDiagonal_v<MT1> )
1653 y[i] -= x[i] * A(i,i);
1657 const size_t jbegin( ( IsUpper_v<MT1> )
1658 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1660 const size_t jend( ( IsLower_v<MT1> )
1661 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1665 const size_t jnum( jend - jbegin );
1666 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1668 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1669 y[j ] -= x[i] * A(i,j );
1670 y[j+1UL] -= x[i] * A(i,j+1UL);
1673 y[jpos] -= x[i] * A(i,jpos);
//! Small-kernel fallback for the subtraction assignment: without SIMD
//! support the work is delegated to the scalar default kernel.
1695 template<
typename VT1
1698 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1699 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1701 selectDefaultSubAssignKernel( y, x, A );
1721 template<
typename VT1
1724 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1725 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1727 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1729 const size_t M( A.rows() );
1730 const size_t N( A.columns() );
1732 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
1739 const size_t ibegin( ( IsLower_v<MT1> )
1740 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1742 const size_t iend( ( IsUpper_v<MT1> )
1743 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1756 for(
size_t i=ibegin; i<iend; ++i ) {
1758 xmm1 -= x1 * A.load(i,j );
1759 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1760 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1761 xmm4 -= x1 * A.load(i,j+
SIMDSIZE*3UL);
1762 xmm5 -= x1 * A.load(i,j+
SIMDSIZE*4UL);
1763 xmm6 -= x1 * A.load(i,j+
SIMDSIZE*5UL);
1764 xmm7 -= x1 * A.load(i,j+
SIMDSIZE*6UL);
1765 xmm8 -= x1 * A.load(i,j+
SIMDSIZE*7UL);
1768 y.store( j , xmm1 );
1780 const size_t ibegin( ( IsLower_v<MT1> )
1781 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1783 const size_t iend( ( IsUpper_v<MT1> )
1784 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1793 for(
size_t i=ibegin; i<iend; ++i ) {
1795 xmm1 -= x1 * A.load(i,j );
1796 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1797 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1798 xmm4 -= x1 * A.load(i,j+
SIMDSIZE*3UL);
1801 y.store( j , xmm1 );
1809 const size_t ibegin( ( IsLower_v<MT1> )
1810 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1812 const size_t iend( ( IsUpper_v<MT1> )
1813 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1821 for(
size_t i=ibegin; i<iend; ++i ) {
1823 xmm1 -= x1 * A.load(i,j );
1824 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1825 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1828 y.store( j , xmm1 );
1835 const size_t ibegin( ( IsLower_v<MT1> )
1836 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1838 const size_t iend( ( IsUpper_v<MT1> )
1839 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1846 for(
size_t i=ibegin; i<iend; ++i ) {
1848 xmm1 -= x1 * A.load(i,j );
1852 y.store( j , xmm1 );
1858 const size_t ibegin( ( IsLower_v<MT1> )
1859 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1861 const size_t iend( ( IsUpper_v<MT1> )
1862 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1868 for(
size_t i=ibegin; i<iend; ++i ) {
1869 xmm1 -=
set( x[i] ) * A.load(i,j);
1875 for( ; remainder && j<N; ++j )
1877 const size_t ibegin( ( IsLower_v<MT1> )
1878 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1880 const size_t iend( ( IsUpper_v<MT1> )
1881 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1887 for(
size_t i=ibegin; i<iend; ++i ) {
1888 value += x[i] * A(i,j);
//! Large-kernel fallback for the subtraction assignment: without SIMD
//! support the work is delegated to the scalar default kernel.
1911 template<
typename VT1
1914 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1915 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1917 selectDefaultSubAssignKernel( y, x, A );
1937 template<
typename VT1
1940 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1941 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1943 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1945 const size_t M( A.rows() );
1946 const size_t N( A.columns() );
1948 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1949 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1953 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1954 for(
size_t ii=0UL; ii<M; ii+=iblock )
1956 const size_t iend(
min( ii+iblock, M ) );
1957 const size_t jtmp(
min( jj+jblock, N ) );
1958 const size_t jend( ( IsLower_v<MT1> )
1959 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1962 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1965 size_t j( ( IsUpper_v<MT1> )
1966 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
1971 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1973 for(
size_t i=ii; i<iend; ++i ) {
1975 xmm1 += x1 * A.load(i,j );
1976 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1977 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1978 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1979 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1980 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1981 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1982 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1985 y.store( j , y.load(j ) - xmm1 );
1999 for(
size_t i=ii; i<iend; ++i ) {
2001 xmm1 += x1 * A.load(i,j );
2002 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2003 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2004 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2007 y.store( j , y.load(j ) - xmm1 );
2017 for(
size_t i=ii; i<iend; ++i ) {
2019 xmm1 += x1 * A.load(i,j );
2020 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2021 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2024 y.store( j , y.load(j ) - xmm1 );
2033 for(
size_t i=ii; i<iend; ++i ) {
2035 xmm1 += x1 * A.load(i,j );
2039 y.store( j , y.load(j ) - xmm1 );
2047 for(
size_t i=ii; i<iend; ++i ) {
2048 xmm1 +=
set( x[i] ) * A.load(i,j);
2051 y.store( j, y.load(j) - xmm1 );
2054 for( ; remainder && j<jend; ++j )
2058 for(
size_t i=ii; i<iend; ++i ) {
2059 value += x[i] * A(i,j);
//! BLAS-kernel fallback for the subtraction assignment: non-BLAS-compatible
//! operands are handled by the large vectorized kernel instead.
2084 template<
typename VT1
2087 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2088 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2090 selectLargeSubAssignKernel( y, x, A );
//! BLAS-based subtraction assignment kernel: for triangular matrices the
//! product is computed into a temporary via trmv() (on a serial() copy of x)
//! and then subtracted from y; the general case uses gemv() with
//! alpha=ET(-1), beta=ET(1), i.e. y = y - x*A in a single call.
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2110 template<
typename VT1
2113 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2114 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2116 using ET = ElementType_t<VT1>;
2118 if( IsTriangular_v<MT1> ) {
2119 ResultType_t<VT1> tmp(
serial( x ) );
2120 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2121 subAssign( y, tmp );
2124 gemv( y, x, A, ET(-1), ET(1) );
2148 template<
typename VT1 >
2149 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2160 multAssign( ~lhs, tmp );
2182 template<
typename VT1 >
2183 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2194 divAssign( ~lhs, tmp );
2218 template<
typename VT1 >
2220 -> EnableIf_t< UseSMPAssign_v<VT1> >
2226 if( rhs.mat_.rows() == 0UL ) {
2230 else if( rhs.mat_.columns() == 0UL ) {
2262 template<
typename VT1 >
2264 -> EnableIf_t< UseSMPAssign_v<VT1> >
2295 template<
typename VT1 >
2297 -> EnableIf_t< UseSMPAssign_v<VT1> >
2303 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2339 template<
typename VT1 >
2341 -> EnableIf_t< UseSMPAssign_v<VT1> >
2347 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2383 template<
typename VT1 >
2385 -> EnableIf_t< UseSMPAssign_v<VT1> >
2420 template<
typename VT1 >
2422 -> EnableIf_t< UseSMPAssign_v<VT1> >
2471 template<
typename VT
2474 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2475 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2476 ,
private Computation
2480 using VMM = TDVecDMatMultExpr<VT,MT>;
2481 using RES = ResultType_t<VMM>;
2482 using VRT = ResultType_t<VT>;
2483 using MRT = ResultType_t<MT>;
2484 using VET = ElementType_t<VRT>;
2485 using MET = ElementType_t<MRT>;
2486 using VCT = CompositeType_t<VT>;
2487 using MCT = CompositeType_t<MT>;
2492 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2497 static constexpr
bool evaluateMatrix =
2498 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2499 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2507 template<
typename T1 >
2508 static constexpr
bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
2515 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2516 static constexpr
bool UseBlasKernel_v =
2518 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2519 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2520 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2521 !IsDiagonal_v<T3> &&
2522 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2523 IsBLASCompatible_v< ElementType_t<T1> > &&
2524 IsBLASCompatible_v< ElementType_t<T2> > &&
2525 IsBLASCompatible_v< ElementType_t<T3> > &&
2526 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2527 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2528 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2536 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2537 static constexpr
bool UseVectorizedDefaultKernel_v =
2538 ( useOptimizedKernels &&
2539 !IsDiagonal_v<T3> &&
2540 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2541 IsSIMDCombinable_v< ElementType_t<T1>
2545 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2546 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2551 using This = DVecScalarMultExpr<VMM,ST,true>;
2552 using BaseType = DenseVector<This,true>;
2556 using SIMDType = SIMDTrait_t<ElementType>;
2561 using LeftOperand =
const TDVecDMatMultExpr<VT,MT>;
2567 using LT = If_t< evaluateVector, const VRT, VCT >;
2570 using RT = If_t< evaluateMatrix, const MRT, MCT >;
2576 ( !IsDiagonal_v<MT> &&
2577 VT::simdEnabled && MT::simdEnabled &&
2578 IsSIMDCombinable_v<VET,MET,ST> &&
2579 HasSIMDAdd_v<VET,MET> &&
2580 HasSIMDMult_v<VET,MET> );
2584 ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
2624 if( index >=
vector_.size() ) {
2627 return (*
this)[index];
2636 inline size_t size()
const {
2667 template<
typename T >
2668 inline bool canAlias(
const T* alias )
const {
2669 return vector_.canAlias( alias );
2679 template<
typename T >
2680 inline bool isAliased(
const T* alias )
const {
2681 return vector_.isAliased( alias );
2701 RightOperand_t<VMM> A(
vector_.rightOperand() );
2705 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2706 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
2729 template<
typename VT1 >
2730 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2736 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
2737 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
2739 if( right.rows() == 0UL ) {
2743 else if( right.columns() == 0UL ) {
2755 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
//! Kernel dispatch for the scaled product assignment y = (x * A) * scalar
//! (DVecScalarMultExpr specialization): diagonal matrices, unevaluated
//! matrix computations, and small problems use the small kernel; everything
//! else uses the BLAS-based kernel.
2770 template<
typename VT1
2774 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2776 if( ( IsDiagonal_v<MT1> ) ||
2777 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2778 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779 selectSmallAssignKernel( y, x, A, scalar );
2781 selectBlasAssignKernel( y, x, A, scalar );
2799 template<
typename VT1
2803 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2805 const size_t M( A.rows() );
2806 const size_t N( A.columns() );
2808 if( IsStrictlyUpper_v<MT1> ) {
2812 if( !IsLower_v<MT1> )
2814 for(
size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<N; ++j ) {
2815 y[j] = x[0UL] * A(0UL,j);
2819 for(
size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
2821 if( IsDiagonal_v<MT1> )
2823 y[i] = x[i] * A(i,i) * scalar;
2827 const size_t jbegin( ( IsUpper_v<MT1> )
2828 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2830 const size_t jend( ( IsLower_v<MT1> )
2831 ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
2835 const size_t jnum( jend - jbegin );
2836 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2838 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2839 y[j ] += x[i] * A(i,j );
2840 y[j+1UL] += x[i] * A(i,j+1UL);
2843 y[jpos] += x[i] * A(i,jpos);
2845 if( IsLower_v<MT1> ) {
2846 y[jend] = x[i] * A(i,jend);
2851 if( IsStrictlyLower_v<MT1> ) {
2855 if( !IsDiagonal_v<MT1> )
2857 const size_t iend( IsStrictlyLower_v<MT1> ? N-1UL : N );
2858 for(
size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<iend; ++j ) {
2879 template<
typename VT1
2883 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2884 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2886 selectDefaultAssignKernel( y, x, A, scalar );
2904 template<
typename VT1
2908 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2909 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2911 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
2913 const size_t M( A.rows() );
2914 const size_t N( A.columns() );
2916 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
2919 const SIMDType factor(
set( scalar ) );
2925 const size_t ibegin( ( IsLower_v<MT1> )
2926 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2928 const size_t iend( ( IsUpper_v<MT1> )
2929 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2933 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2935 for(
size_t i=ibegin; i<iend; ++i ) {
2936 const SIMDType x1(
set( x[i] ) );
2937 xmm1 += x1 * A.load(i,j );
2938 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2939 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2940 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2941 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
2942 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
2943 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
2944 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
2947 y.store( j , xmm1*factor );
2948 y.store( j+
SIMDSIZE , xmm2*factor );
2949 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
2950 y.store( j+
SIMDSIZE*3UL, xmm4*factor );
2951 y.store( j+
SIMDSIZE*4UL, xmm5*factor );
2952 y.store( j+
SIMDSIZE*5UL, xmm6*factor );
2953 y.store( j+
SIMDSIZE*6UL, xmm7*factor );
2954 y.store( j+
SIMDSIZE*7UL, xmm8*factor );
2959 const size_t ibegin( ( IsLower_v<MT1> )
2960 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2962 const size_t iend( ( IsUpper_v<MT1> )
2963 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2967 SIMDType xmm1, xmm2, xmm3, xmm4;
2969 for(
size_t i=ibegin; i<iend; ++i ) {
2970 const SIMDType x1(
set( x[i] ) );
2971 xmm1 += x1 * A.load(i,j );
2972 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2973 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2974 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2977 y.store( j , xmm1*factor );
2978 y.store( j+
SIMDSIZE , xmm2*factor );
2979 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
2980 y.store( j+
SIMDSIZE*3UL, xmm4*factor );
2985 const size_t ibegin( ( IsLower_v<MT1> )
2986 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2988 const size_t iend( ( IsUpper_v<MT1> )
2989 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2993 SIMDType xmm1, xmm2, xmm3;
2995 for(
size_t i=ibegin; i<iend; ++i ) {
2996 const SIMDType x1(
set( x[i] ) );
2997 xmm1 += x1 * A.load(i,j );
2998 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2999 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3002 y.store( j , xmm1*factor );
3003 y.store( j+
SIMDSIZE , xmm2*factor );
3004 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
3009 const size_t ibegin( ( IsLower_v<MT1> )
3010 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3012 const size_t iend( ( IsUpper_v<MT1> )
3013 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3017 SIMDType xmm1, xmm2;
3019 for(
size_t i=ibegin; i<iend; ++i ) {
3020 const SIMDType x1(
set( x[i] ) );
3021 xmm1 += x1 * A.load(i,j );
3025 y.store( j , xmm1*factor );
3026 y.store( j+
SIMDSIZE, xmm2*factor );
3031 const size_t ibegin( ( IsLower_v<MT1> )
3032 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3034 const size_t iend( ( IsUpper_v<MT1> )
3035 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3041 for(
size_t i=ibegin; i<iend; ++i ) {
3042 xmm1 +=
set( x[i] ) * A.load(i,j);
3045 y.store( j, xmm1*factor );
3048 for( ; remainder && j<N; ++j )
3050 const size_t ibegin( ( IsLower_v<MT1> )
3051 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3053 const size_t iend( ( IsUpper_v<MT1> )
3054 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3060 for(
size_t i=ibegin; i<iend; ++i ) {
3061 value += x[i] * A(i,j);
3064 y[j] = value * scalar;
3083 template<
typename VT1
3087 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3088 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3090 selectDefaultAssignKernel( y, x, A, scalar );
3108 template<
typename VT1
3112 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3113 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3115 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3117 const size_t M( A.rows() );
3118 const size_t N( A.columns() );
3120 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3121 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3123 const SIMDType factor(
set( scalar ) );
3129 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3130 for(
size_t ii=0UL; ii<M; ii+=iblock )
3132 const size_t iend(
min( ii+iblock, M ) );
3133 const size_t jtmp(
min( jj+jblock, N ) );
3134 const size_t jend( ( IsLower_v<MT1> )
3135 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3138 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3141 size_t j( ( IsUpper_v<MT1> )
3142 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t i=ii; i<iend; ++i ) {
3150 const SIMDType x1(
set( x[i] ) );
3151 xmm1 += x1 * A.load(i,j );
3152 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3153 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3154 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3155 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3156 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3157 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3158 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3161 y.store( j , y.load(j ) + xmm1*factor );
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t i=ii; i<iend; ++i ) {
3176 const SIMDType x1(
set( x[i] ) );
3177 xmm1 += x1 * A.load(i,j );
3178 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3179 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3180 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3183 y.store( j , y.load(j ) + xmm1*factor );
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t i=ii; i<iend; ++i ) {
3194 const SIMDType x1(
set( x[i] ) );
3195 xmm1 += x1 * A.load(i,j );
3196 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3197 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3200 y.store( j , y.load(j ) + xmm1*factor );
3207 SIMDType xmm1, xmm2;
3209 for(
size_t i=ii; i<iend; ++i ) {
3210 const SIMDType x1(
set( x[i] ) );
3211 xmm1 += x1 * A.load(i,j );
3215 y.store( j , y.load(j ) + xmm1*factor );
3223 for(
size_t i=ii; i<iend; ++i ) {
3224 xmm1 +=
set( x[i] ) * A.load(i,j);
3227 y.store( j, y.load(j) + xmm1*factor );
3230 for( ; remainder && j<jend; ++j )
3234 for(
size_t i=ii; i<iend; ++i ) {
3235 value += x[i] * A(i,j);
3238 y[j] += value * scalar;
3258 template<
typename VT1
3262 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3263 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3265 selectLargeAssignKernel( y, x, A, scalar );
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3284 template<
typename VT1
3288 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3289 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3291 using ET = ElementType_t<VT1>;
3293 if( IsTriangular_v<MT1> ) {
3294 assign( y, scalar * x );
3295 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3298 gemv( y, x, A,
ET(scalar),
ET(0) );
3316 template<
typename VT1 >
3317 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3328 assign( ~lhs, tmp );
3344 template<
typename VT1 >
3345 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3351 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3352 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3354 if( right.rows() == 0UL || right.columns() == 0UL ) {
3366 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3381 template<
typename VT1
3385 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3387 if( ( IsDiagonal_v<MT1> ) ||
3388 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3389 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390 selectSmallAddAssignKernel( y, x, A, scalar );
3392 selectBlasAddAssignKernel( y, x, A, scalar );
3410 template<
typename VT1
3414 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3416 y.addAssign( x * A * scalar );
3434 template<
typename VT1
3438 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3439 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3441 selectDefaultAddAssignKernel( y, x, A, scalar );
3460 template<
typename VT1
3464 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3465 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3467 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3469 const size_t M( A.rows() );
3470 const size_t N( A.columns() );
3472 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
3475 const SIMDType factor(
set( scalar ) );
3481 const size_t ibegin( ( IsLower_v<MT1> )
3482 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3484 const size_t iend( ( IsUpper_v<MT1> )
3485 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3489 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3491 for(
size_t i=ibegin; i<iend; ++i ) {
3492 const SIMDType x1(
set( x[i] ) );
3493 xmm1 += x1 * A.load(i,j );
3494 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3495 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3496 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3497 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3498 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3499 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3500 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3503 y.store( j , y.load(j ) + xmm1*factor );
3515 const size_t ibegin( ( IsLower_v<MT1> )
3516 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3518 const size_t iend( ( IsUpper_v<MT1> )
3519 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3523 SIMDType xmm1, xmm2, xmm3, xmm4;
3525 for(
size_t i=ibegin; i<iend; ++i ) {
3526 const SIMDType x1(
set( x[i] ) );
3527 xmm1 += x1 * A.load(i,j );
3528 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3529 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3530 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3533 y.store( j , y.load(j ) + xmm1*factor );
3541 const size_t ibegin( ( IsLower_v<MT1> )
3542 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3544 const size_t iend( ( IsUpper_v<MT1> )
3545 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3549 SIMDType xmm1, xmm2, xmm3;
3551 for(
size_t i=ibegin; i<iend; ++i ) {
3552 const SIMDType x1(
set( x[i] ) );
3553 xmm1 += x1 * A.load(i,j );
3554 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3555 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3558 y.store( j , y.load(j ) + xmm1*factor );
3565 const size_t ibegin( ( IsLower_v<MT1> )
3566 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3568 const size_t iend( ( IsUpper_v<MT1> )
3569 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3573 SIMDType xmm1, xmm2;
3575 for(
size_t i=ibegin; i<iend; ++i ) {
3576 const SIMDType x1(
set( x[i] ) );
3577 xmm1 += x1 * A.load(i,j );
3581 y.store( j , y.load(j ) + xmm1*factor );
3587 const size_t ibegin( ( IsLower_v<MT1> )
3588 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3590 const size_t iend( ( IsUpper_v<MT1> )
3591 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3597 for(
size_t i=ibegin; i<iend; ++i ) {
3598 xmm1 +=
set( x[i] ) * A.load(i,j);
3601 y.store( j, y.load(j) + xmm1*factor );
3604 for( ; remainder && j<N; ++j )
3606 const size_t ibegin( ( IsLower_v<MT1> )
3607 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3609 const size_t iend( ( IsUpper_v<MT1> )
3610 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3616 for(
size_t i=ibegin; i<iend; ++i ) {
3617 value += x[i] * A(i,j);
3620 y[j] += value * scalar;
3639 template<
typename VT1
3643 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3644 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3646 selectDefaultAddAssignKernel( y, x, A, scalar );
3665 template<
typename VT1
3669 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3670 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3672 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3674 const size_t M( A.rows() );
3675 const size_t N( A.columns() );
3677 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3678 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3680 const SIMDType factor(
set( scalar ) );
3684 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3685 for(
size_t ii=0UL; ii<M; ii+=iblock )
3687 const size_t iend(
min( ii+iblock, M ) );
3688 const size_t jtmp(
min( jj+jblock, N ) );
3689 const size_t jend( ( IsLower_v<MT1> )
3690 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3693 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3696 size_t j( ( IsUpper_v<MT1> )
3697 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
3702 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3704 for(
size_t i=ii; i<iend; ++i ) {
3705 const SIMDType x1(
set( x[i] ) );
3706 xmm1 += x1 * A.load(i,j );
3707 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3708 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3709 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3710 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3711 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3712 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3713 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3716 y.store( j , y.load(j ) + xmm1*factor );
3728 SIMDType xmm1, xmm2, xmm3, xmm4;
3730 for(
size_t i=ii; i<iend; ++i ) {
3731 const SIMDType x1(
set( x[i] ) );
3732 xmm1 += x1 * A.load(i,j );
3733 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3734 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3735 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3738 y.store( j , y.load(j ) + xmm1*factor );
3746 SIMDType xmm1, xmm2, xmm3;
3748 for(
size_t i=ii; i<iend; ++i ) {
3749 const SIMDType x1(
set( x[i] ) );
3750 xmm1 += x1 * A.load(i,j );
3751 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3752 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3755 y.store( j , y.load(j ) + xmm1*factor );
3762 SIMDType xmm1, xmm2;
3764 for(
size_t i=ii; i<iend; ++i ) {
3765 const SIMDType x1(
set( x[i] ) );
3766 xmm1 += x1 * A.load(i,j );
3770 y.store( j , y.load(j ) + xmm1*factor );
3778 for(
size_t i=ii; i<iend; ++i ) {
3779 xmm1 +=
set( x[i] ) * A.load(i,j);
3782 y.store( j, y.load(j) + xmm1*factor );
3785 for( ; remainder && j<jend; ++j )
3789 for(
size_t i=ii; i<iend; ++i ) {
3790 value += x[i] * A(i,j);
3793 y[j] += value * scalar;
3814 template<
typename VT1
3818 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3819 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3821 selectLargeAddAssignKernel( y, x, A, scalar );
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3840 template<
typename VT1
3844 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3845 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3847 using ET = ElementType_t<VT1>;
3849 if( IsTriangular_v<MT1> ) {
3850 ResultType_t<VT1> tmp(
serial( scalar * x ) );
3851 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3852 addAssign( y, tmp );
3855 gemv( y, x, A,
ET(scalar),
ET(1) );
3877 template<
typename VT1 >
3878 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3884 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3885 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3887 if( right.rows() == 0UL || right.columns() == 0UL ) {
3899 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3914 template<
typename VT1
3918 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3920 if( ( IsDiagonal_v<MT1> ) ||
3921 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3922 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923 selectSmallSubAssignKernel( y, x, A, scalar );
3925 selectBlasSubAssignKernel( y, x, A, scalar );
3943 template<
typename VT1
3947 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3949 y.subAssign( x * A * scalar );
3967 template<
typename VT1
3971 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3972 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3974 selectDefaultSubAssignKernel( y, x, A, scalar );
3993 template<
typename VT1
3997 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3998 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4000 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4002 const size_t M( A.rows() );
4003 const size_t N( A.columns() );
4005 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
4008 const SIMDType factor(
set( scalar ) );
4014 const size_t ibegin( ( IsLower_v<MT1> )
4015 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4017 const size_t iend( ( IsUpper_v<MT1> )
4018 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4022 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4024 for(
size_t i=ibegin; i<iend; ++i ) {
4025 const SIMDType x1(
set( x[i] ) );
4026 xmm1 += x1 * A.load(i,j );
4027 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4028 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4029 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4030 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
4031 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
4032 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
4033 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
4036 y.store( j , y.load(j ) - xmm1*factor );
4048 const size_t ibegin( ( IsLower_v<MT1> )
4049 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4051 const size_t iend( ( IsUpper_v<MT1> )
4052 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4056 SIMDType xmm1, xmm2, xmm3, xmm4;
4058 for(
size_t i=ibegin; i<iend; ++i ) {
4059 const SIMDType x1(
set( x[i] ) );
4060 xmm1 += x1 * A.load(i,j );
4061 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4062 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4063 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4066 y.store( j , y.load(j ) - xmm1*factor );
4074 const size_t ibegin( ( IsLower_v<MT1> )
4075 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4077 const size_t iend( ( IsUpper_v<MT1> )
4078 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4082 SIMDType xmm1, xmm2, xmm3;
4084 for(
size_t i=ibegin; i<iend; ++i ) {
4085 const SIMDType x1(
set( x[i] ) );
4086 xmm1 += x1 * A.load(i,j );
4087 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4088 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4091 y.store( j , y.load(j ) - xmm1*factor );
4098 const size_t ibegin( ( IsLower_v<MT1> )
4099 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4101 const size_t iend( ( IsUpper_v<MT1> )
4102 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4106 SIMDType xmm1, xmm2;
4108 for(
size_t i=ibegin; i<iend; ++i ) {
4109 const SIMDType x1(
set( x[i] ) );
4110 xmm1 += x1 * A.load(i,j );
4114 y.store( j , y.load(j ) - xmm1*factor );
4120 const size_t ibegin( ( IsLower_v<MT1> )
4121 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4123 const size_t iend( ( IsUpper_v<MT1> )
4124 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4130 for(
size_t i=ibegin; i<iend; ++i ) {
4131 xmm1 +=
set( x[i] ) * A.load(i,j);
4134 y.store( j, y.load(j) - xmm1*factor );
4137 for( ; remainder && j<N; ++j )
4139 const size_t ibegin( ( IsLower_v<MT1> )
4140 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4142 const size_t iend( ( IsUpper_v<MT1> )
4143 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4149 for(
size_t i=ibegin; i<iend; ++i ) {
4150 value += x[i] * A(i,j);
4153 y[j] -= value * scalar;
4172 template<
typename VT1
4176 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4177 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4179 selectDefaultSubAssignKernel( y, x, A, scalar );
4198 template<
typename VT1
4202 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4203 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4205 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4207 const size_t M( A.rows() );
4208 const size_t N( A.columns() );
4210 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
4211 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4213 const SIMDType factor(
set( scalar ) );
4217 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4218 for(
size_t ii=0UL; ii<M; ii+=iblock )
4220 const size_t iend(
min( ii+iblock, M ) );
4221 const size_t jtmp(
min( jj+jblock, N ) );
4222 const size_t jend( ( IsLower_v<MT1> )
4223 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
4226 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4229 size_t j( ( IsUpper_v<MT1> )
4230 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
4235 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4237 for(
size_t i=ii; i<iend; ++i ) {
4238 const SIMDType x1(
set( x[i] ) );
4239 xmm1 += x1 * A.load(i,j );
4240 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4241 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4242 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4243 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
4244 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
4245 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
4246 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
4249 y.store( j , y.load(j ) - xmm1*factor );
4261 SIMDType xmm1, xmm2, xmm3, xmm4;
4263 for(
size_t i=ii; i<iend; ++i ) {
4264 const SIMDType x1(
set( x[i] ) );
4265 xmm1 += x1 * A.load(i,j );
4266 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4267 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4268 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4271 y.store( j , y.load(j ) - xmm1*factor );
4279 SIMDType xmm1, xmm2, xmm3;
4281 for(
size_t i=ii; i<iend; ++i ) {
4282 const SIMDType x1(
set( x[i] ) );
4283 xmm1 += x1 * A.load(i,j );
4284 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4285 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4288 y.store( j , y.load(j ) - xmm1*factor );
4295 SIMDType xmm1, xmm2;
4297 for(
size_t i=ii; i<iend; ++i ) {
4298 const SIMDType x1(
set( x[i] ) );
4299 xmm1 += x1 * A.load(i,j );
4303 y.store( j , y.load(j ) - xmm1*factor );
4311 for(
size_t i=ii; i<iend; ++i ) {
4312 xmm1 +=
set( x[i] ) * A.load(i,j);
4315 y.store( j, y.load(j) - xmm1*factor );
4318 for( ; remainder && j<jend; ++j )
4322 for(
size_t i=ii; i<iend; ++i ) {
4323 value += x[i] * A(i,j);
4326 y[j] -= value * scalar;
4347 template<
typename VT1
4351 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4352 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4354 selectLargeSubAssignKernel( y, x, A, scalar );
4359 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4373 template<
typename VT1
4377 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4378 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4380 using ET = ElementType_t<VT1>;
4382 if( IsTriangular_v<MT1> ) {
4383 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4384 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4385 subAssign( y, tmp );
4388 gemv( y, x, A,
ET(-scalar),
ET(1) );
4410 template<
typename VT1 >
4411 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4422 multAssign( ~lhs, tmp );
4442 template<
typename VT1 >
4443 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4454 divAssign( ~lhs, tmp );
4476 template<
typename VT1 >
4478 -> EnableIf_t< UseSMPAssign_v<VT1> >
4484 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4485 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4487 if( right.rows() == 0UL ) {
4491 else if( right.columns() == 0UL ) {
4521 template<
typename VT1 >
4523 -> EnableIf_t< UseSMPAssign_v<VT1> >
4552 template<
typename VT1 >
4554 -> EnableIf_t< UseSMPAssign_v<VT1> >
4560 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4561 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4563 if( right.rows() == 0UL || right.columns() == 0UL ) {
4597 template<
typename VT1 >
4599 -> EnableIf_t< UseSMPAssign_v<VT1> >
4605 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4606 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4608 if( right.rows() == 0UL || right.columns() == 0UL ) {
4643 template<
typename VT1 >
4645 -> EnableIf_t< UseSMPAssign_v<VT1> >
4678 template<
typename VT1 >
4680 -> EnableIf_t< UseSMPAssign_v<VT1> >
4753 template<
typename VT
4755 inline decltype(
auto)
4762 if( (~vec).
size() != (~mat).
rows() ) {
4767 return ReturnType( ~vec, ~mat );
4794 template<
typename VT
4796 inline decltype(
auto)
4797 operator*( const DenseVector<VT,true>& vec, const MatMatMultExpr<MT>& mat )
4801 return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4817 template<
typename VT,
typename MT >
4818 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4819 :
public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
If_t< IsExpression_v< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:215
MultTrait_t< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:204
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:568
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:524
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:164
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:163
Header file for basic type definitions.
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias template for the If class template. The If_t alias template provides a convenient shor...
Definition: If.h:109
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:534
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:167
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type,...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:595
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:429
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:296
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:514
Header file for the DenseVector base class.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:363
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:170
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:154
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:385
Header file for the Computation base class.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDVecDMatMultExpr.h:233
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:329
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
ResultType_t< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:129
Header file for the IsFloat type trait.
ElementType_t< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:132
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:353
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes....
Definition: DenseMatrix.h:81
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:220
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:434
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDVecDMatMultExpr.h:226
Header file for the IsComplexDouble type trait.
Constraint on the data type.
DenseVector< This, TF > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:161
Header file for the generic max algorithm.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:468
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
CompositeType_t< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:133
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:160
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:262
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1162
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
If_t< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:218
Header file for all SIMD functionality.
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:108
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:83
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1198
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:173
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:386
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:585
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:558
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:248
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.The MultTrait_t alias declaration provid...
Definition: MultTrait.h:240
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:586
System settings for the BLAS mode.
ElementType_t< VRT > VET
Element type of the left-hand side dense vector epxression.
Definition: TDVecDMatMultExpr.h:131
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:455
Header file for the IsSIMDCombinable type trait.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:206
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:578
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:373
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:208
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDVecDMatMultExpr.h:239
Header file for the IsContiguous type trait.
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:144
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
Header file for all forward declarations for expression class templates.
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Header file for the TVecMatMultExpr base class.
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:162
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:175
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
static constexpr bool evaluateVector
Compilation switch for the composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:139
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant alias template represents ...
Definition: IntegralConstant.h:110
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:207
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:309
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:341
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type,...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:104
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:546
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode....
Definition: BLAS.h:64
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:212
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:443
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:205
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:319
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the complex data type.
Header file for the IsUpper type trait.
If_t< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:221
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression,...
Definition: Assert.h:101
CompositeType_t< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:134
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:191
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
ResultType_t< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:130
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:176
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:424