// NOTE(review): This SOURCE is a lossy extraction of a Blaze header
// (TDVecDMatMultExpr.h). Original file line numbers are fused into the text and
// many lines are elided; comments below describe only what is visible here.
//
// Expression class for transpose-dense-vector / dense-matrix multiplications
// (x^T * A). Publicly a TVecMatMultExpr over a row-major DenseVector result,
// privately tagged as a Computation.
35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_ 121 template<
typename VT
123 class TDVecDMatMultExpr
124 :
public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
125 ,
private Computation
// Compile-time flag: the vector operand needs pre-evaluation when it is itself
// a computation or explicitly requires evaluation.
139 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
// (fragment) Matrix-evaluation flag: evaluate MT when it is a computation with
// matching, BLAS-compatible element types, or when it requires evaluation.
145 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
146 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
155 template<
typename T1 >
// UseBlasKernel_v<T1,T2,T3>: all three operands must be contiguous, expose the
// required data access, be SIMD-enabled, and carry BLAS-compatible element
// types for the BLAS kernel path to be selectable. (Leading/trailing
// conditions of the conjunction are elided in this extraction.)
165 template<
typename T1,
typename T2,
typename T3 >
166 static constexpr
bool UseBlasKernel_v =
168 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
169 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
170 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
173 IsBLASCompatible_v< ElementType_t<T1> > &&
174 IsBLASCompatible_v< ElementType_t<T2> > &&
175 IsBLASCompatible_v< ElementType_t<T3> > &&
// UseVectorizedDefaultKernel_v<T1,T2,T3>: gate for the hand-vectorized
// (SIMD) default kernels — requires optimized kernels to be enabled, SIMD
// support on all operands, and combinable element types (tail elided).
187 template<
typename T1,
typename T2,
typename T3 >
188 static constexpr
bool UseVectorizedDefaultKernel_v =
189 ( useOptimizedKernels &&
191 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
192 IsSIMDCombinable_v< ElementType_t<T1>
// (fragment) simdEnabled condition of the expression itself: vectorization of
// the expression requires a non-diagonal matrix, SIMD-enabled operands, and
// SIMD add/mult support for the element-type combination.
227 ( !IsDiagonal_v<MT> &&
228 VT::simdEnabled && MT::simdEnabled &&
229 HasSIMDAdd_v<VET,MET> &&
230 HasSIMDMult_v<VET,MET> );
// Subscript access (fragment): computes element [index] of x^T * A on demand.
// Diagonal matrices reduce to a single product; lower/upper structure narrows
// the reduction range (the 8UL guards presumably avoid the restricted loop for
// tiny ranges — elided code would confirm).
265 if( IsDiagonal_v<MT> )
267 return vec_[index] *
mat_(index,index);
269 else if( IsLower_v<MT> && ( index > 8UL ) )
271 const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
276 else if( IsUpper_v<MT> && ( index + 8UL <
mat_.rows() ) )
278 const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
// at() (fragment): bounds-checked access — rejects indices >= columns, then
// forwards to operator[].
297 if( index >=
mat_.columns() ) {
300 return (*
this)[index];
// Size of the result vector equals the number of matrix columns.
309 inline size_t size() const noexcept {
310 return mat_.columns();
// canAlias: conservative aliasing query against both operands.
340 template<
typename T >
341 inline bool canAlias(
const T* alias )
const noexcept {
342 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// isAliased: same check as canAlias for this expression type.
352 template<
typename T >
353 inline bool isAliased(
const T* alias )
const noexcept {
354 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// isAligned (fragment): aligned iff both operands are aligned.
364 return vec_.isAligned() &&
mat_.isAligned();
// canSMPAssign (fragment): SMP assignment pays off only above the
// SMP_TDVECDMATMULT_THRESHOLD size (leading condition elided).
378 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
// assign( DenseVector ) (fragment): early-outs for empty matrix dimensions,
// then evaluates operands (elided) and dispatches to the kernel selector.
402 template<
typename VT1 >
409 if( rhs.mat_.rows() == 0UL ) {
413 else if( rhs.mat_.columns() == 0UL ) {
425 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel dispatch: small/diagonal problems (below TDVECDMATMULT_THRESHOLD)
// take the small kernel; otherwise the BLAS path is attempted.
441 template<
typename VT1
444 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
446 if( ( IsDiagonal_v<MT1> ) ||
448 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449 selectSmallAssignKernel( y, x, A );
451 selectBlasAssignKernel( y, x, A );
// Default (scalar) assign kernel: row-wise accumulation of x[i]*A(i,j) into y,
// with j-ranges restricted by upper/lower/strict triangular structure and the
// inner loop unrolled by two (jpos = jbegin + (jnum & ~1)).
470 template<
typename VT1
473 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
475 const size_t M( A.rows() );
476 const size_t N( A.columns() );
478 if( IsStrictlyUpper_v<MT1> ) {
482 if( !IsLower_v<MT1> )
// First row initializes y (skipping y[0] for strictly upper matrices).
484 const size_t jbegin( IsStrictlyUpper_v<MT1> ? 1UL : 0UL );
485 for(
size_t j=jbegin; j<N; ++j ) {
486 y[j] = x[0UL] * A(0UL,j);
490 for(
size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
492 if( IsDiagonal_v<MT1> )
494 y[i] = x[i] * A(i,i);
498 const size_t jbegin( ( IsUpper_v<MT1> )
499 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
501 const size_t jend( ( IsLower_v<MT1> )
502 ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
506 const size_t jnum( jend - jbegin );
507 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
509 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
510 y[j ] += x[i] * A(i,j );
511 y[j+1UL] += x[i] * A(i,j+1UL);
514 y[jpos] += x[i] * A(i,jpos);
// Lower matrices: the trailing element is a first write (assignment, not +=).
516 if( IsLower_v<MT1> ) {
517 y[jend] = x[i] * A(i,jend);
522 if( IsStrictlyLower_v<MT1> ) {
// Small-kernel fallback: when the vectorized default kernel is not usable,
// forward to the scalar default kernel (SFINAE via DisableIf_t).
543 template<
typename VT1
546 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
547 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
549 selectDefaultAssignKernel( y, x, A );
// Vectorized small assign kernel (fragment): column-blocked SIMD reduction.
// Columns are processed in tiles of 8/4/3/2/1 SIMD widths plus a scalar
// remainder; rows are restricted to the structurally non-zero band of A.
568 template<
typename VT1
571 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
572 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
// The scalar remainder loop is only needed when either operand is unpadded.
574 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
576 const size_t M( A.rows() );
577 const size_t N( A.columns() );
// jpos: end of the fully SIMD-covered column range (N rounded down to a
// multiple of SIMDSIZE when a remainder must be honored).
579 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// --- 8-wide SIMD column tile ---
586 const size_t ibegin( ( IsLower_v<MT1> )
587 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
589 const size_t iend( ( IsUpper_v<MT1> )
590 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
594 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
596 for(
size_t i=ibegin; i<iend; ++i ) {
598 xmm1 += x1 * A.load(i,j );
600 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
601 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
602 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
603 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
604 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
605 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
// --- 4-wide SIMD column tile ---
620 const size_t ibegin( ( IsLower_v<MT1> )
621 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
623 const size_t iend( ( IsUpper_v<MT1> )
624 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
630 for(
size_t i=ibegin; i<iend; ++i ) {
632 xmm1 += x1 * A.load(i,j );
634 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
635 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
// --- 3-wide SIMD column tile ---
646 const size_t ibegin( ( IsLower_v<MT1> )
647 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
649 const size_t iend( ( IsUpper_v<MT1> )
650 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
656 for(
size_t i=ibegin; i<iend; ++i ) {
658 xmm1 += x1 * A.load(i,j );
660 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
// --- 2-wide SIMD column tile ---
670 const size_t ibegin( ( IsLower_v<MT1> )
671 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
673 const size_t iend( ( IsUpper_v<MT1> )
674 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
680 for(
size_t i=ibegin; i<iend; ++i ) {
682 xmm1 += x1 * A.load(i,j );
// --- single SIMD column tile ---
692 const size_t ibegin( ( IsLower_v<MT1> )
693 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
695 const size_t iend( ( IsUpper_v<MT1> )
696 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
702 for(
size_t i=ibegin; i<iend; ++i ) {
703 xmm1 +=
set( x[i] ) * A.load(i,j);
// --- scalar remainder columns (only when operands are unpadded) ---
709 for( ; remainder && j<N; ++j )
711 const size_t ibegin( ( IsLower_v<MT1> )
712 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
714 const size_t iend( ( IsUpper_v<MT1> )
715 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
721 for(
size_t i=ibegin; i<iend; ++i ) {
722 value += x[i] * A(i,j);
// Large-kernel fallback: no vectorization available -> scalar default kernel.
745 template<
typename VT1
748 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
749 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
751 selectDefaultAssignKernel( y, x, A );
// Vectorized large assign kernel (fragment): cache-blocked variant. Columns
// are tiled in jblock-sized panels (32768 bytes worth of elements) and rows in
// iblock chunks; partial results are accumulated into y via load/store.
770 template<
typename VT1
773 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
774 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
776 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
778 const size_t M( A.rows() );
779 const size_t N( A.columns() );
// Block sizes: column panel sized to ~32KB of elements (cache-friendly),
// row chunk of 8 or 4 depending on panel count.
781 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
782 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
788 for(
size_t jj=0U; jj<N; jj+=jblock ) {
789 for(
size_t ii=0UL; ii<M; ii+=iblock )
791 const size_t iend(
min( ii+iblock, M ) );
792 const size_t jtmp(
min( jj+jblock, N ) );
// Triangular structure clips the per-block column range.
793 const size_t jend( ( IsLower_v<MT1> )
794 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
797 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
// Starting column aligned down to a SIMDSIZE boundary for upper matrices.
800 size_t j( ( IsUpper_v<MT1> )
801 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
// --- 8-wide tile: accumulate, then add into the existing y values ---
806 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
808 for(
size_t i=ii; i<iend; ++i ) {
810 xmm1 += x1 * A.load(i,j );
812 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
813 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
814 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
815 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
816 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
817 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
820 y.store( j , y.load(j ) + xmm1 );
// --- 4-wide tile ---
834 for(
size_t i=ii; i<iend; ++i ) {
836 xmm1 += x1 * A.load(i,j );
838 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
839 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
842 y.store( j , y.load(j ) + xmm1 );
// --- 3-wide tile ---
852 for(
size_t i=ii; i<iend; ++i ) {
854 xmm1 += x1 * A.load(i,j );
856 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
859 y.store( j , y.load(j ) + xmm1 );
// --- 2-wide tile ---
868 for(
size_t i=ii; i<iend; ++i ) {
870 xmm1 += x1 * A.load(i,j );
874 y.store( j , y.load(j ) + xmm1 );
// --- single-SIMD tile ---
882 for(
size_t i=ii; i<iend; ++i ) {
883 xmm1 +=
set( x[i] ) * A.load(i,j);
886 y.store( j, y.load(j) + xmm1 );
// --- scalar remainder columns within this block ---
889 for( ; remainder && j<jend; ++j )
893 for(
size_t i=ii; i<iend; ++i ) {
894 value += x[i] * A(i,j);
// BLAS fallback: when the BLAS kernel is not applicable, use the blocked
// vectorized/default large kernel instead.
919 template<
typename VT1
922 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
923 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
925 selectLargeAssignKernel( y, x, A );
// BLAS kernel (compiled only with BLAS support): triangular matrices use
// trmv, general matrices use gemv with alpha=1, beta=0 (plain assignment).
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 945 template<
typename VT1
948 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
949 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
951 using ET = ElementType_t<VT1>;
953 if( IsTriangular_v<MT1> ) {
955 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
958 gemv( y, x, A, ET(1), ET(0) );
// Sparse-vector assignment overload (fragment; body elided in extraction).
978 template<
typename VT1 >
979 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
// addAssign( DenseVector ) (fragment): no-op on empty operands, then
// evaluates operands (elided) and dispatches to the add-assign selector.
1008 template<
typename VT1 >
1009 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1015 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1027 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Same small/BLAS threshold dispatch as the plain-assign path.
1043 template<
typename VT1
1046 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1048 if( ( IsDiagonal_v<MT1> ) ||
1050 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051 selectSmallAddAssignKernel( y, x, A );
1053 selectBlasAddAssignKernel( y, x, A );
// Default (scalar) add-assign kernel: like the assign kernel, but every
// element is accumulated with += (no first-write initialization needed).
1072 template<
typename VT1
1075 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1077 const size_t M( A.rows() );
1078 const size_t N( A.columns() );
1080 for(
size_t i=0UL; i<M; ++i )
1082 if( IsDiagonal_v<MT1> )
1084 y[i] += x[i] * A(i,i);
1088 const size_t jbegin( ( IsUpper_v<MT1> )
1089 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1091 const size_t jend( ( IsLower_v<MT1> )
1092 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1096 const size_t jnum( jend - jbegin );
1097 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
// Two-way unrolled accumulation plus a possible odd trailing element.
1099 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1100 y[j ] += x[i] * A(i,j );
1101 y[j+1UL] += x[i] * A(i,j+1UL);
1104 y[jpos] += x[i] * A(i,jpos);
// Small add-assign fallback: scalar default kernel when SIMD is unusable.
1126 template<
typename VT1
1129 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1130 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1132 selectDefaultAddAssignKernel( y, x, A );
// Vectorized small add-assign kernel (fragment): same 8/4/3/2/1-SIMD column
// tiling as the assign kernel. NOTE(review): the visible stores write
// `y.store( j, xmm1 )` without adding y.load(j) — the load half of the
// read-modify-write is presumably on elided lines; cannot confirm from here.
1151 template<
typename VT1
1154 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1155 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1157 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1159 const size_t M( A.rows() );
1160 const size_t N( A.columns() );
1162 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// --- 8-wide SIMD column tile ---
1169 const size_t ibegin( ( IsLower_v<MT1> )
1170 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1172 const size_t iend( ( IsUpper_v<MT1> )
1173 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1186 for(
size_t i=ibegin; i<iend; ++i ) {
1188 xmm1 += x1 * A.load(i,j );
1189 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1190 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1191 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1192 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1193 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1194 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1195 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1198 y.store( j , xmm1 );
// --- 4-wide SIMD column tile ---
1210 const size_t ibegin( ( IsLower_v<MT1> )
1211 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1213 const size_t iend( ( IsUpper_v<MT1> )
1214 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1223 for(
size_t i=ibegin; i<iend; ++i ) {
1225 xmm1 += x1 * A.load(i,j );
1226 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1227 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1228 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1231 y.store( j , xmm1 );
// --- 3-wide SIMD column tile ---
1239 const size_t ibegin( ( IsLower_v<MT1> )
1240 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1242 const size_t iend( ( IsUpper_v<MT1> )
1243 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1251 for(
size_t i=ibegin; i<iend; ++i ) {
1253 xmm1 += x1 * A.load(i,j );
1254 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1255 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1258 y.store( j , xmm1 );
// --- 2-wide SIMD column tile ---
1265 const size_t ibegin( ( IsLower_v<MT1> )
1266 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1268 const size_t iend( ( IsUpper_v<MT1> )
1269 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1276 for(
size_t i=ibegin; i<iend; ++i ) {
1278 xmm1 += x1 * A.load(i,j );
1282 y.store( j , xmm1 );
// --- single SIMD column tile ---
1288 const size_t ibegin( ( IsLower_v<MT1> )
1289 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1291 const size_t iend( ( IsUpper_v<MT1> )
1292 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1298 for(
size_t i=ibegin; i<iend; ++i ) {
1299 xmm1 +=
set( x[i] ) * A.load(i,j);
// --- scalar remainder columns ---
1305 for( ; remainder && j<N; ++j )
1307 const size_t ibegin( ( IsLower_v<MT1> )
1308 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1310 const size_t iend( ( IsUpper_v<MT1> )
1311 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1317 for(
size_t i=ibegin; i<iend; ++i ) {
1318 value += x[i] * A(i,j);
// Large add-assign fallback: scalar default kernel when SIMD is unusable.
1341 template<
typename VT1
1344 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1345 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1347 selectDefaultAddAssignKernel( y, x, A );
// Vectorized large add-assign kernel (fragment): cache-blocked (jblock/iblock)
// SIMD accumulation; stores visibly read-modify-write y (y.load + xmm).
1366 template<
typename VT1
1369 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1370 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1372 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1374 const size_t M( A.rows() );
1375 const size_t N( A.columns() );
// ~32KB column panels; 8- or 4-row chunks.
1377 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1378 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1382 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1383 for(
size_t ii=0UL; ii<M; ii+=iblock )
1385 const size_t iend(
min( ii+iblock, M ) );
1386 const size_t jtmp(
min( jj+jblock, N ) );
1387 const size_t jend( ( IsLower_v<MT1> )
1388 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1391 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1394 size_t j( ( IsUpper_v<MT1> )
1395 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
// --- 8-wide tile ---
1400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1402 for(
size_t i=ii; i<iend; ++i ) {
1404 xmm1 += x1 * A.load(i,j );
1405 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1406 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1407 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1408 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1409 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1410 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1411 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1414 y.store( j , y.load(j ) + xmm1 );
// --- 4-wide tile ---
1428 for(
size_t i=ii; i<iend; ++i ) {
1430 xmm1 += x1 * A.load(i,j );
1431 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1432 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1433 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1436 y.store( j , y.load(j ) + xmm1 );
// --- 3-wide tile ---
1446 for(
size_t i=ii; i<iend; ++i ) {
1448 xmm1 += x1 * A.load(i,j );
1449 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1450 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1453 y.store( j , y.load(j ) + xmm1 );
// --- 2-wide tile ---
1462 for(
size_t i=ii; i<iend; ++i ) {
1464 xmm1 += x1 * A.load(i,j );
1468 y.store( j , y.load(j ) + xmm1 );
// --- single-SIMD tile ---
1476 for(
size_t i=ii; i<iend; ++i ) {
1477 xmm1 +=
set( x[i] ) * A.load(i,j);
1480 y.store( j, y.load(j) + xmm1 );
// --- scalar remainder columns within this block ---
1483 for( ; remainder && j<jend; ++j )
1487 for(
size_t i=ii; i<iend; ++i ) {
1488 value += x[i] * A(i,j);
// BLAS add-assign fallback: blocked large kernel when BLAS is not applicable.
1513 template<
typename VT1
1516 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1517 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1519 selectLargeAddAssignKernel( y, x, A );
// BLAS add-assign kernel: triangular matrices run trmv on a temporary copy of
// x (trmv is in-place) and add the result; general matrices use gemv with
// alpha=1, beta=1 so the product accumulates into y.
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1539 template<
typename VT1
1542 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1543 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1545 using ET = ElementType_t<VT1>;
1547 if( IsTriangular_v<MT1> ) {
1548 ResultType_t<VT1> tmp(
serial( x ) );
1549 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1550 addAssign( y, tmp );
1553 gemv( y, x, A, ET(1), ET(1) );
// subAssign( DenseVector ) (fragment): no-op on empty operands, then
// dispatches to the sub-assign kernel selector.
1577 template<
typename VT1 >
1578 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1584 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1596 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Same small/BLAS threshold dispatch as assign/addAssign.
1612 template<
typename VT1
1615 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1617 if( ( IsDiagonal_v<MT1> ) ||
1619 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620 selectSmallSubAssignKernel( y, x, A );
1622 selectBlasSubAssignKernel( y, x, A );
// Default (scalar) sub-assign kernel: mirror of the add-assign kernel with
// -= accumulation, two-way unrolled.
1641 template<
typename VT1
1644 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1646 const size_t M( A.rows() );
1647 const size_t N( A.columns() );
1649 for(
size_t i=0UL; i<M; ++i )
1651 if( IsDiagonal_v<MT1> )
1653 y[i] -= x[i] * A(i,i);
1657 const size_t jbegin( ( IsUpper_v<MT1> )
1658 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1660 const size_t jend( ( IsLower_v<MT1> )
1661 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1665 const size_t jnum( jend - jbegin );
1666 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1668 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1669 y[j ] -= x[i] * A(i,j );
1670 y[j+1UL] -= x[i] * A(i,j+1UL);
1673 y[jpos] -= x[i] * A(i,jpos);
// Small sub-assign fallback: scalar default kernel when SIMD is unusable.
1695 template<
typename VT1
1698 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1699 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1701 selectDefaultSubAssignKernel( y, x, A );
// Vectorized small sub-assign kernel (fragment): same column tiling as the
// add variant but accumulates with -= into the SIMD registers. As with the
// add kernel, the visible stores lack the y.load() half of the update —
// presumed to be on elided lines; cannot confirm from this extraction.
1721 template<
typename VT1
1724 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1725 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1727 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1729 const size_t M( A.rows() );
1730 const size_t N( A.columns() );
1732 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// --- 8-wide SIMD column tile ---
1739 const size_t ibegin( ( IsLower_v<MT1> )
1740 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1742 const size_t iend( ( IsUpper_v<MT1> )
1743 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1756 for(
size_t i=ibegin; i<iend; ++i ) {
1758 xmm1 -= x1 * A.load(i,j );
1759 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1760 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1761 xmm4 -= x1 * A.load(i,j+
SIMDSIZE*3UL);
1762 xmm5 -= x1 * A.load(i,j+
SIMDSIZE*4UL);
1763 xmm6 -= x1 * A.load(i,j+
SIMDSIZE*5UL);
1764 xmm7 -= x1 * A.load(i,j+
SIMDSIZE*6UL);
1765 xmm8 -= x1 * A.load(i,j+
SIMDSIZE*7UL);
1768 y.store( j , xmm1 );
// --- 4-wide SIMD column tile ---
1780 const size_t ibegin( ( IsLower_v<MT1> )
1781 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1783 const size_t iend( ( IsUpper_v<MT1> )
1784 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1793 for(
size_t i=ibegin; i<iend; ++i ) {
1795 xmm1 -= x1 * A.load(i,j );
1796 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1797 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1798 xmm4 -= x1 * A.load(i,j+
SIMDSIZE*3UL);
1801 y.store( j , xmm1 );
// --- 3-wide SIMD column tile ---
1809 const size_t ibegin( ( IsLower_v<MT1> )
1810 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1812 const size_t iend( ( IsUpper_v<MT1> )
1813 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1821 for(
size_t i=ibegin; i<iend; ++i ) {
1823 xmm1 -= x1 * A.load(i,j );
1824 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1825 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1828 y.store( j , xmm1 );
// --- 2-wide SIMD column tile ---
1835 const size_t ibegin( ( IsLower_v<MT1> )
1836 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1838 const size_t iend( ( IsUpper_v<MT1> )
1839 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1846 for(
size_t i=ibegin; i<iend; ++i ) {
1848 xmm1 -= x1 * A.load(i,j );
1852 y.store( j , xmm1 );
// --- single SIMD column tile ---
1858 const size_t ibegin( ( IsLower_v<MT1> )
1859 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1861 const size_t iend( ( IsUpper_v<MT1> )
1862 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1868 for(
size_t i=ibegin; i<iend; ++i ) {
1869 xmm1 -=
set( x[i] ) * A.load(i,j);
// --- scalar remainder columns ---
1875 for( ; remainder && j<N; ++j )
1877 const size_t ibegin( ( IsLower_v<MT1> )
1878 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1880 const size_t iend( ( IsUpper_v<MT1> )
1881 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1887 for(
size_t i=ibegin; i<iend; ++i ) {
1888 value += x[i] * A(i,j);
// Large sub-assign fallback: scalar default kernel when SIMD is unusable.
1911 template<
typename VT1
1914 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1915 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1917 selectDefaultSubAssignKernel( y, x, A );
// Vectorized large sub-assign kernel (fragment): cache-blocked like the add
// variant; partial products accumulate with += into registers and the block
// result is subtracted from y at store time (y.load(j) - xmm).
1937 template<
typename VT1
1940 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1941 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1943 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1945 const size_t M( A.rows() );
1946 const size_t N( A.columns() );
1948 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1949 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1953 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1954 for(
size_t ii=0UL; ii<M; ii+=iblock )
1956 const size_t iend(
min( ii+iblock, M ) );
1957 const size_t jtmp(
min( jj+jblock, N ) );
1958 const size_t jend( ( IsLower_v<MT1> )
1959 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1962 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1965 size_t j( ( IsUpper_v<MT1> )
1966 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
// --- 8-wide tile ---
1971 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1973 for(
size_t i=ii; i<iend; ++i ) {
1975 xmm1 += x1 * A.load(i,j );
1976 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1977 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1978 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1979 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1980 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1981 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1982 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1985 y.store( j , y.load(j ) - xmm1 );
// --- 4-wide tile ---
1999 for(
size_t i=ii; i<iend; ++i ) {
2001 xmm1 += x1 * A.load(i,j );
2002 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2003 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2004 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2007 y.store( j , y.load(j ) - xmm1 );
// --- 3-wide tile ---
2017 for(
size_t i=ii; i<iend; ++i ) {
2019 xmm1 += x1 * A.load(i,j );
2020 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2021 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2024 y.store( j , y.load(j ) - xmm1 );
// --- 2-wide tile ---
2033 for(
size_t i=ii; i<iend; ++i ) {
2035 xmm1 += x1 * A.load(i,j );
2039 y.store( j , y.load(j ) - xmm1 );
// --- single-SIMD tile ---
2047 for(
size_t i=ii; i<iend; ++i ) {
2048 xmm1 +=
set( x[i] ) * A.load(i,j);
2051 y.store( j, y.load(j) - xmm1 );
// --- scalar remainder columns within this block ---
2054 for( ; remainder && j<jend; ++j )
2058 for(
size_t i=ii; i<iend; ++i ) {
2059 value += x[i] * A(i,j);
// BLAS sub-assign fallback: blocked large kernel when BLAS is not applicable.
2084 template<
typename VT1
2087 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2088 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2090 selectLargeSubAssignKernel( y, x, A );
// BLAS sub-assign kernel: triangular -> trmv on a copy of x then subAssign;
// general -> gemv with alpha=-1, beta=1 (y := y - x^T*A).
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2110 template<
typename VT1
2113 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2114 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2116 using ET = ElementType_t<VT1>;
2118 if( IsTriangular_v<MT1> ) {
2119 ResultType_t<VT1> tmp(
serial( x ) );
2120 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2121 subAssign( y, tmp );
2124 gemv( y, x, A, ET(-1), ET(1) );
// multAssign: evaluates the product into a temporary (elided), then performs
// the element-wise multiplication assignment.
2148 template<
typename VT1 >
2149 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2160 multAssign( ~lhs, tmp );
// divAssign: same temporary-based scheme for element-wise division.
2182 template<
typename VT1 >
2183 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2194 divAssign( ~lhs, tmp );
// SMP (parallel) assignment variants (fragments): each is SFINAE-gated on
// UseSMPAssign_v and mirrors the serial counterpart; bodies largely elided.
// smpAssign — with the usual empty-operand early-outs.
2218 template<
typename VT1 >
2220 -> EnableIf_t< UseSMPAssign_v<VT1> >
2226 if( rhs.mat_.rows() == 0UL ) {
2230 else if( rhs.mat_.columns() == 0UL ) {
// smpAssign (sparse target, presumably — body elided).
2262 template<
typename VT1 >
2264 -> EnableIf_t< UseSMPAssign_v<VT1> >
// smpAddAssign.
2295 template<
typename VT1 >
2297 -> EnableIf_t< UseSMPAssign_v<VT1> >
2303 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// smpSubAssign.
2339 template<
typename VT1 >
2341 -> EnableIf_t< UseSMPAssign_v<VT1> >
2347 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// smpMultAssign.
2383 template<
typename VT1 >
2385 -> EnableIf_t< UseSMPAssign_v<VT1> >
// smpDivAssign.
2420 template<
typename VT1 >
2422 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Specialization of DVecScalarMultExpr for scaled transpose-vector/matrix
// products: s * (x^T * A). Restructures the expression so the scalar can be
// folded into the multiplication kernels.
2471 template<
typename VT
2474 class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2475 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2476 ,
private Computation
// Shorthand aliases for the wrapped product expression and its operand types.
2480 using VMM = TDVecDMatMultExpr<VT,MT>;
2481 using RES = ResultType_t<VMM>;
2482 using VRT = ResultType_t<VT>;
2483 using MRT = ResultType_t<MT>;
2484 using VET = ElementType_t<VRT>;
2485 using MET = ElementType_t<MRT>;
2486 using VCT = CompositeType_t<VT>;
2487 using MCT = CompositeType_t<MT>;
// Evaluation flags: identical policy to the unscaled expression.
2492 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2497 static constexpr
bool evaluateMatrix =
2498 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2499 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// SMP assignment is worthwhile whenever an operand needs evaluation.
2507 template<
typename T1 >
2508 static constexpr
bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
// BLAS gate: as for the base expression, plus T4 (the scalar type) must not
// be complex when the element type is builtin.
2515 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2516 static constexpr
bool UseBlasKernel_v =
2518 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2519 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2520 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2521 !IsDiagonal_v<T3> &&
2522 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2523 IsBLASCompatible_v< ElementType_t<T1> > &&
2524 IsBLASCompatible_v< ElementType_t<T2> > &&
2525 IsBLASCompatible_v< ElementType_t<T3> > &&
2526 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2527 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2528 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Vectorized-default gate with the scalar type (T4) in the combinable check.
2536 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2537 static constexpr
bool UseVectorizedDefaultKernel_v =
2538 ( useOptimizedKernels &&
2539 !IsDiagonal_v<T3> &&
2540 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2541 IsSIMDCombinable_v< ElementType_t<T1>
2545 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2546 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Public type aliases of the expression.
2551 using This = DVecScalarMultExpr<VMM,ST,true>;
2552 using BaseType = DenseVector<This,true>;
2556 using SIMDType = SIMDTrait_t<ElementType>;
2561 using LeftOperand =
const TDVecDMatMultExpr<VT,MT>;
// LT/RT: pre-evaluated result types or lightweight composites, per the flags.
2567 using LT = If_t< evaluateVector, const VRT, VCT >;
2570 using RT = If_t< evaluateMatrix, const MRT, MCT >;
// simdEnabled condition (fragment): also requires the scalar ST to be SIMD-
// combinable with the operand element types.
2576 ( !IsDiagonal_v<MT> &&
2577 VT::simdEnabled && MT::simdEnabled &&
2578 IsSIMDCombinable_v<VET,MET,ST> &&
2579 HasSIMDAdd_v<VET,MET> &&
2580 HasSIMDMult_v<VET,MET> );
// at() (fragment): bounds check against the wrapped expression's size,
// then forward to operator[].
2624 if( index >=
vector_.size() ) {
2627 return (*
this)[index];
2636 inline size_t size()
const {
// Aliasing queries delegate to the wrapped product expression.
2667 template<
typename T >
2668 inline bool canAlias(
const T* alias )
const {
2669 return vector_.canAlias( alias );
2679 template<
typename T >
2680 inline bool isAliased(
const T* alias )
const {
2681 return vector_.isAliased( alias );
// canSMPAssign (fragment): consult the underlying matrix operand's size
// against the SMP threshold.
2701 RightOperand_t<VMM> A(
vector_.rightOperand() );
2705 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2706 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
// Scaled assign (fragment): unwraps the inner product's operands, early-outs
// on empty dimensions, and dispatches with the scalar factor threaded through.
2729 template<
typename VT1 >
2730 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2736 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
2737 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
2739 if( right.rows() == 0UL ) {
2743 else if( right.columns() == 0UL ) {
2755 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatch with scalar: same small/BLAS threshold policy as the
// unscaled expression.
2770 template<
typename VT1
2774 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2776 if( ( IsDiagonal_v<MT1> ) ||
2777 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2778 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779 selectSmallAssignKernel( y, x, A, scalar );
2781 selectBlasAssignKernel( y, x, A, scalar );
// Default (scalar-code) scaled assign kernel: accumulates the unscaled
// product row-by-row; the visible diagonal branch applies `scalar` directly,
// the final scaling pass for the general case appears partly elided.
2799 template<
typename VT1
2803 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2805 const size_t M( A.rows() );
2806 const size_t N( A.columns() );
2808 if( IsStrictlyUpper_v<MT1> ) {
2812 if( !IsLower_v<MT1> )
2814 for(
size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<N; ++j ) {
2815 y[j] = x[0UL] * A(0UL,j);
2819 for(
size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
2821 if( IsDiagonal_v<MT1> )
2823 y[i] = x[i] * A(i,i) * scalar;
2827 const size_t jbegin( ( IsUpper_v<MT1> )
2828 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2830 const size_t jend( ( IsLower_v<MT1> )
2831 ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
2835 const size_t jnum( jend - jbegin );
2836 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
2838 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2839 y[j ] += x[i] * A(i,j );
2840 y[j+1UL] += x[i] * A(i,j+1UL);
2843 y[jpos] += x[i] * A(i,jpos);
2845 if( IsLower_v<MT1> ) {
2846 y[jend] = x[i] * A(i,jend);
2851 if( IsStrictlyLower_v<MT1> ) {
// Trailing scaling loop over the accumulated elements (non-diagonal case).
2855 if( !IsDiagonal_v<MT1> )
2857 const size_t iend( IsStrictlyLower_v<MT1> ? N-1UL : N );
2858 for(
size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<iend; ++j ) {
// Small-kernel fallback with scalar parameter.
2879 template<
typename VT1
2883 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2884 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2886 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized scaled small assign kernel (fragment; TRUNCATED — the definition
// runs past the end of this extraction). The scalar is broadcast once into a
// SIMD `factor` and multiplied into each accumulated tile at store time.
2904 template<
typename VT1
2908 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2909 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2911 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
2913 const size_t M( A.rows() );
2914 const size_t N( A.columns() );
2916 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
// Broadcast the scalar once for all SIMD stores.
2919 const SIMDType factor(
set( scalar ) );
// --- 8-wide SIMD column tile ---
2925 const size_t ibegin( ( IsLower_v<MT1> )
2926 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2928 const size_t iend( ( IsUpper_v<MT1> )
2929 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2933 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2935 for(
size_t i=ibegin; i<iend; ++i ) {
2936 const SIMDType x1(
set( x[i] ) );
2937 xmm1 += x1 * A.load(i,j );
2938 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2939 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2940 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2941 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
2942 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
2943 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
2944 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
// Scale each accumulator by the broadcast factor on store.
2947 y.store( j , xmm1*factor );
2948 y.store( j+
SIMDSIZE , xmm2*factor );
2949 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
2950 y.store( j+
SIMDSIZE*3UL, xmm4*factor );
2951 y.store( j+
SIMDSIZE*4UL, xmm5*factor );
2952 y.store( j+
SIMDSIZE*5UL, xmm6*factor );
2953 y.store( j+
SIMDSIZE*6UL, xmm7*factor );
2954 y.store( j+
SIMDSIZE*7UL, xmm8*factor );
// --- 4-wide SIMD column tile ---
2959 const size_t ibegin( ( IsLower_v<MT1> )
2960 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2962 const size_t iend( ( IsUpper_v<MT1> )
2963 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2967 SIMDType xmm1, xmm2, xmm3, xmm4;
2969 for(
size_t i=ibegin; i<iend; ++i ) {
2970 const SIMDType x1(
set( x[i] ) );
2971 xmm1 += x1 * A.load(i,j );
2972 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2973 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2974 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2977 y.store( j , xmm1*factor );
2978 y.store( j+
SIMDSIZE , xmm2*factor );
2979 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
2980 y.store( j+
SIMDSIZE*3UL, xmm4*factor );
// --- 3-wide SIMD column tile (continues past end of extraction) ---
2985 const size_t ibegin( ( IsLower_v<MT1> )
2986 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2988 const size_t iend( ( IsUpper_v<MT1> )
2989 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
2993 SIMDType xmm1, xmm2, xmm3;
2995 for(
size_t i=ibegin; i<iend; ++i ) {
2996 const SIMDType x1(
set( x[i] ) );
2997 xmm1 += x1 * A.load(i,j );
2998 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2999 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3002 y.store( j , xmm1*factor );
3003 y.store( j+
SIMDSIZE , xmm2*factor );
3004 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
3009 const size_t ibegin( ( IsLower_v<MT1> )
3010 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3012 const size_t iend( ( IsUpper_v<MT1> )
3013 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3017 SIMDType xmm1, xmm2;
3019 for(
size_t i=ibegin; i<iend; ++i ) {
3020 const SIMDType x1(
set( x[i] ) );
3021 xmm1 += x1 * A.load(i,j );
3025 y.store( j , xmm1*factor );
3026 y.store( j+
SIMDSIZE, xmm2*factor );
3031 const size_t ibegin( ( IsLower_v<MT1> )
3032 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3034 const size_t iend( ( IsUpper_v<MT1> )
3035 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3041 for(
size_t i=ibegin; i<iend; ++i ) {
3042 xmm1 +=
set( x[i] ) * A.load(i,j);
3045 y.store( j, xmm1*factor );
3048 for( ; remainder && j<N; ++j )
3050 const size_t ibegin( ( IsLower_v<MT1> )
3051 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3053 const size_t iend( ( IsUpper_v<MT1> )
3054 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3060 for(
size_t i=ibegin; i<iend; ++i ) {
3061 value += x[i] * A(i,j);
3064 y[j] = value * scalar;
3083 template<
typename VT1
3087 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3088 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3090 selectDefaultAssignKernel( y, x, A, scalar );
3108 template<
typename VT1
3112 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3113 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3115 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3117 const size_t M( A.rows() );
3118 const size_t N( A.columns() );
3120 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3121 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3123 const SIMDType factor(
set( scalar ) );
3129 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3130 for(
size_t ii=0UL; ii<M; ii+=iblock )
3132 const size_t iend(
min( ii+iblock, M ) );
3133 const size_t jtmp(
min( jj+jblock, N ) );
3134 const size_t jend( ( IsLower_v<MT1> )
3135 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3138 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3141 size_t j( ( IsUpper_v<MT1> )
3142 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t i=ii; i<iend; ++i ) {
3150 const SIMDType x1(
set( x[i] ) );
3151 xmm1 += x1 * A.load(i,j );
3152 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3153 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3154 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3155 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3156 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3157 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3158 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3161 y.store( j , y.load(j ) + xmm1*factor );
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t i=ii; i<iend; ++i ) {
3176 const SIMDType x1(
set( x[i] ) );
3177 xmm1 += x1 * A.load(i,j );
3178 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3179 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3180 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3183 y.store( j , y.load(j ) + xmm1*factor );
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t i=ii; i<iend; ++i ) {
3194 const SIMDType x1(
set( x[i] ) );
3195 xmm1 += x1 * A.load(i,j );
3196 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3197 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3200 y.store( j , y.load(j ) + xmm1*factor );
3207 SIMDType xmm1, xmm2;
3209 for(
size_t i=ii; i<iend; ++i ) {
3210 const SIMDType x1(
set( x[i] ) );
3211 xmm1 += x1 * A.load(i,j );
3215 y.store( j , y.load(j ) + xmm1*factor );
3223 for(
size_t i=ii; i<iend; ++i ) {
3224 xmm1 +=
set( x[i] ) * A.load(i,j);
3227 y.store( j, y.load(j) + xmm1*factor );
3230 for( ; remainder && j<jend; ++j )
3234 for(
size_t i=ii; i<iend; ++i ) {
3235 value += x[i] * A(i,j);
3238 y[j] += value * scalar;
3258 template<
typename VT1
3262 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3263 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3265 selectLargeAssignKernel( y, x, A, scalar );
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3284 template<
typename VT1
3288 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3289 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3291 using ET = ElementType_t<VT1>;
3293 if( IsTriangular_v<MT1> ) {
3294 assign( y, scalar * x );
3295 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3298 gemv( y, x, A,
ET(scalar),
ET(0) );
3316 template<
typename VT1 >
3317 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3328 assign( ~lhs, tmp );
3344 template<
typename VT1 >
3345 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3351 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3352 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3354 if( right.rows() == 0UL || right.columns() == 0UL ) {
3366 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
3381 template<
typename VT1
3385 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3387 if( ( IsDiagonal_v<MT1> ) ||
3388 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3389 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390 selectSmallAddAssignKernel( y, x, A, scalar );
3392 selectBlasAddAssignKernel( y, x, A, scalar );
3410 template<
typename VT1
3414 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3416 y.addAssign( x * A * scalar );
3434 template<
typename VT1
3438 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3439 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3441 selectDefaultAddAssignKernel( y, x, A, scalar );
3460 template<
typename VT1
3464 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3465 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3467 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3469 const size_t M( A.rows() );
3470 const size_t N( A.columns() );
3472 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
3475 const SIMDType factor(
set( scalar ) );
3481 const size_t ibegin( ( IsLower_v<MT1> )
3482 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3484 const size_t iend( ( IsUpper_v<MT1> )
3485 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3489 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3491 for(
size_t i=ibegin; i<iend; ++i ) {
3492 const SIMDType x1(
set( x[i] ) );
3493 xmm1 += x1 * A.load(i,j );
3494 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3495 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3496 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3497 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3498 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3499 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3500 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3503 y.store( j , y.load(j ) + xmm1*factor );
3515 const size_t ibegin( ( IsLower_v<MT1> )
3516 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3518 const size_t iend( ( IsUpper_v<MT1> )
3519 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3523 SIMDType xmm1, xmm2, xmm3, xmm4;
3525 for(
size_t i=ibegin; i<iend; ++i ) {
3526 const SIMDType x1(
set( x[i] ) );
3527 xmm1 += x1 * A.load(i,j );
3528 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3529 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3530 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3533 y.store( j , y.load(j ) + xmm1*factor );
3541 const size_t ibegin( ( IsLower_v<MT1> )
3542 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3544 const size_t iend( ( IsUpper_v<MT1> )
3545 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3549 SIMDType xmm1, xmm2, xmm3;
3551 for(
size_t i=ibegin; i<iend; ++i ) {
3552 const SIMDType x1(
set( x[i] ) );
3553 xmm1 += x1 * A.load(i,j );
3554 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3555 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3558 y.store( j , y.load(j ) + xmm1*factor );
3565 const size_t ibegin( ( IsLower_v<MT1> )
3566 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3568 const size_t iend( ( IsUpper_v<MT1> )
3569 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3573 SIMDType xmm1, xmm2;
3575 for(
size_t i=ibegin; i<iend; ++i ) {
3576 const SIMDType x1(
set( x[i] ) );
3577 xmm1 += x1 * A.load(i,j );
3581 y.store( j , y.load(j ) + xmm1*factor );
3587 const size_t ibegin( ( IsLower_v<MT1> )
3588 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3590 const size_t iend( ( IsUpper_v<MT1> )
3591 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3597 for(
size_t i=ibegin; i<iend; ++i ) {
3598 xmm1 +=
set( x[i] ) * A.load(i,j);
3601 y.store( j, y.load(j) + xmm1*factor );
3604 for( ; remainder && j<N; ++j )
3606 const size_t ibegin( ( IsLower_v<MT1> )
3607 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3609 const size_t iend( ( IsUpper_v<MT1> )
3610 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3616 for(
size_t i=ibegin; i<iend; ++i ) {
3617 value += x[i] * A(i,j);
3620 y[j] += value * scalar;
3639 template<
typename VT1
3643 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3644 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3646 selectDefaultAddAssignKernel( y, x, A, scalar );
3665 template<
typename VT1
3669 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3670 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3672 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3674 const size_t M( A.rows() );
3675 const size_t N( A.columns() );
3677 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3678 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3680 const SIMDType factor(
set( scalar ) );
3684 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3685 for(
size_t ii=0UL; ii<M; ii+=iblock )
3687 const size_t iend(
min( ii+iblock, M ) );
3688 const size_t jtmp(
min( jj+jblock, N ) );
3689 const size_t jend( ( IsLower_v<MT1> )
3690 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3693 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3696 size_t j( ( IsUpper_v<MT1> )
3697 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
3702 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3704 for(
size_t i=ii; i<iend; ++i ) {
3705 const SIMDType x1(
set( x[i] ) );
3706 xmm1 += x1 * A.load(i,j );
3707 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3708 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3709 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3710 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3711 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3712 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3713 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3716 y.store( j , y.load(j ) + xmm1*factor );
3728 SIMDType xmm1, xmm2, xmm3, xmm4;
3730 for(
size_t i=ii; i<iend; ++i ) {
3731 const SIMDType x1(
set( x[i] ) );
3732 xmm1 += x1 * A.load(i,j );
3733 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3734 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3735 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3738 y.store( j , y.load(j ) + xmm1*factor );
3746 SIMDType xmm1, xmm2, xmm3;
3748 for(
size_t i=ii; i<iend; ++i ) {
3749 const SIMDType x1(
set( x[i] ) );
3750 xmm1 += x1 * A.load(i,j );
3751 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3752 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3755 y.store( j , y.load(j ) + xmm1*factor );
3762 SIMDType xmm1, xmm2;
3764 for(
size_t i=ii; i<iend; ++i ) {
3765 const SIMDType x1(
set( x[i] ) );
3766 xmm1 += x1 * A.load(i,j );
3770 y.store( j , y.load(j ) + xmm1*factor );
3778 for(
size_t i=ii; i<iend; ++i ) {
3779 xmm1 +=
set( x[i] ) * A.load(i,j);
3782 y.store( j, y.load(j) + xmm1*factor );
3785 for( ; remainder && j<jend; ++j )
3789 for(
size_t i=ii; i<iend; ++i ) {
3790 value += x[i] * A(i,j);
3793 y[j] += value * scalar;
3814 template<
typename VT1
3818 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3819 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3821 selectLargeAddAssignKernel( y, x, A, scalar );
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3840 template<
typename VT1
3844 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3845 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3847 using ET = ElementType_t<VT1>;
3849 if( IsTriangular_v<MT1> ) {
3850 ResultType_t<VT1> tmp(
serial( scalar * x ) );
3851 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3852 addAssign( y, tmp );
3855 gemv( y, x, A,
ET(scalar),
ET(1) );
3877 template<
typename VT1 >
3878 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3884 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3885 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3887 if( right.rows() == 0UL || right.columns() == 0UL ) {
3899 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
3914 template<
typename VT1
3918 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3920 if( ( IsDiagonal_v<MT1> ) ||
3921 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3922 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923 selectSmallSubAssignKernel( y, x, A, scalar );
3925 selectBlasSubAssignKernel( y, x, A, scalar );
3943 template<
typename VT1
3947 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3949 y.subAssign( x * A * scalar );
3967 template<
typename VT1
3971 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3972 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3974 selectDefaultSubAssignKernel( y, x, A, scalar );
3993 template<
typename VT1
3997 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3998 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4000 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4002 const size_t M( A.rows() );
4003 const size_t N( A.columns() );
4005 const size_t jpos( remainder ? ( N &
size_t(-
SIMDSIZE) ) : N );
4008 const SIMDType factor(
set( scalar ) );
4014 const size_t ibegin( ( IsLower_v<MT1> )
4015 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4017 const size_t iend( ( IsUpper_v<MT1> )
4018 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4022 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4024 for(
size_t i=ibegin; i<iend; ++i ) {
4025 const SIMDType x1(
set( x[i] ) );
4026 xmm1 += x1 * A.load(i,j );
4027 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4028 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4029 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4030 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
4031 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
4032 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
4033 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
4036 y.store( j , y.load(j ) - xmm1*factor );
4048 const size_t ibegin( ( IsLower_v<MT1> )
4049 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4051 const size_t iend( ( IsUpper_v<MT1> )
4052 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4056 SIMDType xmm1, xmm2, xmm3, xmm4;
4058 for(
size_t i=ibegin; i<iend; ++i ) {
4059 const SIMDType x1(
set( x[i] ) );
4060 xmm1 += x1 * A.load(i,j );
4061 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4062 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4063 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4066 y.store( j , y.load(j ) - xmm1*factor );
4074 const size_t ibegin( ( IsLower_v<MT1> )
4075 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4077 const size_t iend( ( IsUpper_v<MT1> )
4078 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4082 SIMDType xmm1, xmm2, xmm3;
4084 for(
size_t i=ibegin; i<iend; ++i ) {
4085 const SIMDType x1(
set( x[i] ) );
4086 xmm1 += x1 * A.load(i,j );
4087 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4088 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4091 y.store( j , y.load(j ) - xmm1*factor );
4098 const size_t ibegin( ( IsLower_v<MT1> )
4099 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4101 const size_t iend( ( IsUpper_v<MT1> )
4102 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4106 SIMDType xmm1, xmm2;
4108 for(
size_t i=ibegin; i<iend; ++i ) {
4109 const SIMDType x1(
set( x[i] ) );
4110 xmm1 += x1 * A.load(i,j );
4114 y.store( j , y.load(j ) - xmm1*factor );
4120 const size_t ibegin( ( IsLower_v<MT1> )
4121 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4123 const size_t iend( ( IsUpper_v<MT1> )
4124 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4130 for(
size_t i=ibegin; i<iend; ++i ) {
4131 xmm1 +=
set( x[i] ) * A.load(i,j);
4134 y.store( j, y.load(j) - xmm1*factor );
4137 for( ; remainder && j<N; ++j )
4139 const size_t ibegin( ( IsLower_v<MT1> )
4140 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4142 const size_t iend( ( IsUpper_v<MT1> )
4143 ?(
min( j+1UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4149 for(
size_t i=ibegin; i<iend; ++i ) {
4150 value += x[i] * A(i,j);
4153 y[j] -= value * scalar;
4172 template<
typename VT1
4176 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4177 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4179 selectDefaultSubAssignKernel( y, x, A, scalar );
4198 template<
typename VT1
4202 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4203 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4205 constexpr
bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4207 const size_t M( A.rows() );
4208 const size_t N( A.columns() );
4210 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
4211 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4213 const SIMDType factor(
set( scalar ) );
4217 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4218 for(
size_t ii=0UL; ii<M; ii+=iblock )
4220 const size_t iend(
min( ii+iblock, M ) );
4221 const size_t jtmp(
min( jj+jblock, N ) );
4222 const size_t jend( ( IsLower_v<MT1> )
4223 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
4226 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4229 size_t j( ( IsUpper_v<MT1> )
4230 ?(
max( jj, ( IsStrictlyUpper_v<MT1> ? ii+1UL : ii ) &
size_t(-
SIMDSIZE) ) )
4235 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4237 for(
size_t i=ii; i<iend; ++i ) {
4238 const SIMDType x1(
set( x[i] ) );
4239 xmm1 += x1 * A.load(i,j );
4240 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4241 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4242 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4243 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
4244 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
4245 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
4246 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
4249 y.store( j , y.load(j ) - xmm1*factor );
4261 SIMDType xmm1, xmm2, xmm3, xmm4;
4263 for(
size_t i=ii; i<iend; ++i ) {
4264 const SIMDType x1(
set( x[i] ) );
4265 xmm1 += x1 * A.load(i,j );
4266 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4267 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4268 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4271 y.store( j , y.load(j ) - xmm1*factor );
4279 SIMDType xmm1, xmm2, xmm3;
4281 for(
size_t i=ii; i<iend; ++i ) {
4282 const SIMDType x1(
set( x[i] ) );
4283 xmm1 += x1 * A.load(i,j );
4284 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4285 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4288 y.store( j , y.load(j ) - xmm1*factor );
4295 SIMDType xmm1, xmm2;
4297 for(
size_t i=ii; i<iend; ++i ) {
4298 const SIMDType x1(
set( x[i] ) );
4299 xmm1 += x1 * A.load(i,j );
4303 y.store( j , y.load(j ) - xmm1*factor );
4311 for(
size_t i=ii; i<iend; ++i ) {
4312 xmm1 +=
set( x[i] ) * A.load(i,j);
4315 y.store( j, y.load(j) - xmm1*factor );
4318 for( ; remainder && j<jend; ++j )
4322 for(
size_t i=ii; i<iend; ++i ) {
4323 value += x[i] * A(i,j);
4326 y[j] -= value * scalar;
4347 template<
typename VT1
4351 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4352 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4354 selectLargeSubAssignKernel( y, x, A, scalar );
4359 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4373 template<
typename VT1
4377 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4378 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4380 using ET = ElementType_t<VT1>;
4382 if( IsTriangular_v<MT1> ) {
4383 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4384 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4385 subAssign( y, tmp );
4388 gemv( y, x, A,
ET(-scalar),
ET(1) );
4410 template<
typename VT1 >
4411 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4422 multAssign( ~lhs, tmp );
4442 template<
typename VT1 >
4443 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4454 divAssign( ~lhs, tmp );
4476 template<
typename VT1 >
4478 -> EnableIf_t< UseSMPAssign_v<VT1> >
4484 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4485 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4487 if( right.rows() == 0UL ) {
4491 else if( right.columns() == 0UL ) {
4521 template<
typename VT1 >
4523 -> EnableIf_t< UseSMPAssign_v<VT1> >
4552 template<
typename VT1 >
4554 -> EnableIf_t< UseSMPAssign_v<VT1> >
4560 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4561 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4563 if( right.rows() == 0UL || right.columns() == 0UL ) {
4597 template<
typename VT1 >
4599 -> EnableIf_t< UseSMPAssign_v<VT1> >
4605 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4606 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4608 if( right.rows() == 0UL || right.columns() == 0UL ) {
4643 template<
typename VT1 >
4645 -> EnableIf_t< UseSMPAssign_v<VT1> >
4678 template<
typename VT1 >
4680 -> EnableIf_t< UseSMPAssign_v<VT1> >
4753 template<
typename VT
4755 inline decltype(
auto)
4762 if( (~vec).
size() != (~mat).
rows() ) {
4794 template<
typename VT
4796 inline decltype(
auto)
4797 operator*( const DenseVector<VT,true>& vec, const MatMatMultExpr<MT>& mat )
4801 return ( vec * (~mat).leftOperand() ) * (~mat).rightOperand();
4817 template<
typename VT,
typename MT >
4818 struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4819 :
public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
If_t< IsExpression_v< MT >, const MT, const MT &> RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:215
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
MultTrait_t< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:204
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:567
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:523
Header file for auxiliary alias declarations.
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:133
Headerfile for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:162
Header file for basic type definitions.
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template.The If_t alias declaration provides a convenien...
Definition: If.h:109
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:533
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:166
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
Header file for the serial shim.
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:428
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:296
constexpr Unchecked unchecked
Global Unchecked instance.The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:513
Header file for the DenseVector base class.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:363
If_t< useAssign, const ResultType, const DVecScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:169
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:385
Header file for the Computation base class.
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDVecDMatMultExpr.h:233
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:329
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
ResultType_t< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:129
Header file for the IsFloat type trait.
ElementType_t< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:132
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:353
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.The ElementType_t alias declaration provide...
Definition: Aliases.h:170
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:220
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:433
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDVecDMatMultExpr.h:226
Header file for the IsComplexDouble type trait.
Constraint on the data type.
DenseVector< This, TF > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:160
Headerfile for the generic max algorithm.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:467
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
CompositeType_t< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:133
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:159
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:262
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
If_t< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:218
Header file for all SIMD functionality.
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:107
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:83
Header file for the IsTriangular type trait.
Constraint on the data type.
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
If_t< IsExpression_v< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:172
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:386
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:584
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:557
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:248
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.The MultTrait_t alias declaration provid...
Definition: MultTrait.h:240
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:585
System settings for the BLAS mode.
ElementType_t< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:131
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:454
Header file for the IsSIMDCombinable type trait.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:206
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:577
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:373
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:208
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDVecDMatMultExpr.h:239
Header file for the IsContiguous type trait.
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:144
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Constraint on the data type.
SIMD characteristics of data types.The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
Header file for the TVecMatMultExpr base class.
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:161
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:165
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
static constexpr bool evaluateVector
Compilation switch for the composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:139
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:207
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:109
constexpr size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:498
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:309
Header file for BLAS general matrix/vector multiplication functions (gemv)
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:341
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:104
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:545
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
If_t< IsExpression_v< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:212
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:442
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:205
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:319
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the complex data type.
Header file for the IsUpper type trait.
If_t< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:221
Constraint on the data type.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
CompositeType_t< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:134
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:191
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
ResultType_t< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:130
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:175
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:423