35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_ 120 template<
typename MT
122 class TDMatDVecMultExpr
123 :
public MatVecMultExpr< DenseVector< TDMatDVecMultExpr<MT,VT>, false > >
124 ,
private Computation
139 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
140 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time flag: true when the vector operand type VT is a computation
// expression or requires evaluation (the two traits combined below).
145 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
154 template<
typename T1 >
164 template<
typename T1,
typename T2,
typename T3 >
165 static constexpr
bool UseBlasKernel_v =
167 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
168 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
169 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
171 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
172 IsBLASCompatible_v< ElementType_t<T1> > &&
173 IsBLASCompatible_v< ElementType_t<T2> > &&
174 IsBLASCompatible_v< ElementType_t<T3> > &&
186 template<
typename T1,
typename T2,
typename T3 >
187 static constexpr
bool UseVectorizedDefaultKernel_v =
188 ( useOptimizedKernels &&
190 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
191 IsSIMDCombinable_v< ElementType_t<T1>
226 ( !IsDiagonal_v<MT> &&
227 MT::simdEnabled && VT::simdEnabled &&
228 HasSIMDAdd_v<MET,VET> &&
229 HasSIMDMult_v<MET,VET> );
264 if( IsDiagonal_v<MT> )
266 return mat_(index,index) *
vec_[index];
268 else if( IsLower_v<MT> && ( index + 8UL <
mat_.rows() ) )
270 const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
274 else if( IsUpper_v<MT> && ( index > 8UL ) )
276 const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
277 const size_t n (
mat_.columns() -
begin );
296 if( index >=
mat_.rows() ) {
299 return (*
this)[index];
308 inline size_t size() const noexcept {
// Alias query: the visible return statement yields true when either operand
// (mat_ or vec_) reports isAliased() for the given address.
// NOTE(review): canAlias() forwards to the operands' isAliased(), not to their
// canAlias() - confirm this is intended.
339 template<
typename T >
340 inline bool canAlias(
const T* alias )
const noexcept {
341 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Alias query: true when the matrix operand or the vector operand reports
// being aliased with the given address.
351 template<
typename T >
352 inline bool isAliased(
const T* alias )
const noexcept {
353 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
363 return mat_.isAligned() &&
vec_.isAligned();
377 (
mat_.rows() *
mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
378 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
401 template<
typename VT1 >
408 if( rhs.mat_.rows() == 0UL ) {
411 else if( rhs.mat_.columns() == 0UL ) {
424 TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatch for the assignment kernels.
// The visible conditions route diagonal matrices and products whose element
// count (rows * columns) is below TDMATDVECMULT_THRESHOLD to the small kernel;
// the other visible call goes to the BLAS kernel selection.
// NOTE(review): the embedded numbering skips a line between the two visible
// conditions, so additional conditions may exist that are not shown here.
440 template<
typename VT1
443 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
445 if( ( IsDiagonal_v<MT1> ) ||
447 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
448 selectSmallAssignKernel( y, A, x );
450 selectBlasAssignKernel( y, A, x );
// Default (scalar, element-wise) assignment kernel.
// Visible structure: column 0 initializes y, then the remaining columns j are
// accumulated into y[i] as A(i,j) * x[j], with the row range narrowed for
// triangular matrix types.
469 template<
typename VT1
472 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M = number of rows of A, N = number of columns of A.
474 const size_t M( A.rows() );
475 const size_t N( A.columns() );
// Special case for strictly lower matrices; the action taken is on a line not
// shown in this excerpt.
477 if( IsStrictlyLower_v<MT1> ) {
// Unless A is upper, column 0 initializes y by plain assignment. Row 0 is
// skipped for strictly lower matrices.
481 if( !IsUpper_v<MT1> )
483 for(
size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<M; ++i ) {
484 y[i] = A(i,0UL) * x[0UL];
// Remaining columns. The loop starts at column 0 only for upper matrices that
// are not strictly upper; in every other case it starts at column 1.
488 for(
size_t j=( IsUpper_v<MT1> && !IsStrictlyUpper_v<MT1> ? 0UL : 1UL ); j<N; ++j )
// Diagonal matrices contribute only the diagonal element of column j.
490 if( IsDiagonal_v<MT1> )
492 y[j] = A(j,j) * x[j];
// Row range for column j: lower matrices start at row j (j+1 if strictly
// lower); upper matrices end at row j (j-1 if strictly upper). The
// alternatives for non-triangular matrices are on lines not shown here.
496 const size_t ibegin( ( IsLower_v<MT1> )
497 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
499 const size_t iend( ( IsUpper_v<MT1> )
500 ?( IsStrictlyUpper_v<MT1> ? j-1UL : j )
// inum & size_t(-2) clears the lowest bit, so ipos bounds the part of the
// range that can be processed two rows at a time.
504 const size_t inum( iend - ibegin );
505 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
507 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
508 y[i ] += A(i ,j) * x[j];
509 y[i+1UL] += A(i+1UL,j) * x[j];
// Leftover row at ipos (the guarding condition is not visible in this excerpt).
512 y[ipos] += A(ipos,j) * x[j];
// For upper matrices row iend is assigned (not accumulated) in column j.
514 if( IsUpper_v<MT1> ) {
515 y[iend] = A(iend,j) * x[j];
// Special case for strictly upper matrices; the action taken is not shown.
520 if( IsStrictlyUpper_v<MT1> ) {
// Small-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2>; it simply forwards to the default kernel.
541 template<
typename VT1
544 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
545 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
547 selectDefaultAssignKernel( y, A, x );
// Vectorized small assignment kernel, enabled when UseVectorizedDefaultKernel_v
// holds for <VT1,MT1,VT2>.
// Visible structure: rows are processed in groups of 8, 4, 3, 2 and 1 SIMD
// vectors, followed by a scalar remainder loop. The loops over the row index i
// and the stores into y are on lines not shown in this excerpt.
// NOTE(review): x1 is defined on a line that is not visible; presumably it is
// the broadcast of x[j] (compare the single-vector step, which uses set( x[j] )).
566 template<
typename VT1
569 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
570 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder pass is needed unless both MT1 and VT1 are padded.
572 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
574 const size_t M( A.rows() );
575 const size_t N( A.columns() );
// With remainder handling, ipos is M masked with size_t(-SIMDSIZE); otherwise
// it is M itself.
577 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// --- Group of 8 SIMD vectors of rows starting at i ---
// Column range: upper matrices start at column i (i+1 if strictly upper);
// lower matrices end at min( i+SIMDSIZE*8, N ), minus one if strictly lower.
// The non-triangular alternatives are not visible here.
584 const size_t jbegin( ( IsUpper_v<MT1> )
585 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
587 const size_t jend( ( IsLower_v<MT1> )
588 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
// Eight accumulators, one per SIMD vector of rows.
592 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
594 for(
size_t j=jbegin; j<jend; ++j ) {
596 xmm1 += A.load(i ,j) * x1;
598 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
599 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
600 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
601 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
602 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
603 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
// --- Group of 4 SIMD vectors (column end uses SIMDSIZE*4) ---
618 const size_t jbegin( ( IsUpper_v<MT1> )
619 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
621 const size_t jend( ( IsLower_v<MT1> )
622 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
628 for(
size_t j=jbegin; j<jend; ++j ) {
630 xmm1 += A.load(i ,j) * x1;
632 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
633 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
// --- Group of 3 SIMD vectors (column end uses SIMDSIZE*3) ---
644 const size_t jbegin( ( IsUpper_v<MT1> )
645 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
647 const size_t jend( ( IsLower_v<MT1> )
648 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
654 for(
size_t j=jbegin; j<jend; ++j ) {
656 xmm1 += A.load(i ,j) * x1;
658 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
// --- Group of 2 SIMD vectors (column end uses SIMDSIZE*2) ---
668 const size_t jbegin( ( IsUpper_v<MT1> )
669 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
671 const size_t jend( ( IsLower_v<MT1> )
672 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
678 for(
size_t j=jbegin; j<jend; ++j ) {
680 xmm1 += A.load(i ,j) * x1;
// --- Single SIMD vector; x[j] is broadcast inline via set() ---
690 const size_t jbegin( ( IsUpper_v<MT1> )
691 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
693 const size_t jend( ( IsLower_v<MT1> )
694 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
700 for(
size_t j=jbegin; j<jend; ++j ) {
701 xmm1 += A.load(i,j) *
set( x[j] );
// --- Scalar remainder: one row at a time, only when 'remainder' is true ---
707 for( ; remainder && i<M; ++i )
709 const size_t jbegin( ( IsUpper_v<MT1> )
710 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
712 const size_t jend( ( IsLower_v<MT1> )
713 ?(
min( i+1UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
// Accumulates the dot product of row i with x; how 'value' is written to y is
// on a line not shown here.
719 for(
size_t j=jbegin; j<jend; ++j ) {
720 value += A(i,j) * x[j];
// Large-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2>; it forwards to the default kernel.
743 template<
typename VT1
746 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
747 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
749 selectDefaultAssignKernel( y, A, x );
// Vectorized large assignment kernel, enabled when UseVectorizedDefaultKernel_v
// holds for <VT1,MT1,VT2>.
// Visible structure: the matrix is walked in blocks of iblock rows by jblock
// columns; within a block, rows are processed in groups of 8, 4, 3, 2 and 1
// SIMD vectors and each group's sum is added onto the current content of y.
// The loops over i, most stores and the definition of x1 are on lines not
// shown in this excerpt.
768 template<
typename VT1
771 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
772 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder pass is needed unless both MT1 and VT1 are padded.
774 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
776 const size_t M( A.rows() );
777 const size_t N( A.columns() );
// Rows per block: 32768 divided by the element size.
// NOTE(review): presumably sized for a 32 KiB cache - confirm.
779 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
// Columns per block: 8 when N is smaller than iblock, otherwise 4.
780 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// Blocked traversal: ii steps over row blocks, jj over column blocks.
786 for(
size_t ii=0U; ii<M; ii+=iblock ) {
787 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block ends, clamped to the matrix dimensions.
789 const size_t jend(
min( jj+jblock, N ) );
790 const size_t itmp(
min( ii+iblock, M ) );
// For upper matrices the row end is additionally limited by the column block
// end (jend, or jend-1 if strictly upper).
791 const size_t iend( ( IsUpper_v<MT1> )
792 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
// With remainder handling, ipos is iend masked with size_t(-SIMDSIZE).
795 const size_t ipos( remainder ? ( iend &
size_t(-
SIMDSIZE) ) : iend );
// For lower matrices the first row is the larger of ii and the column block
// start (jj, or jj+1 if strictly lower) masked with size_t(-SIMDSIZE).
798 size_t i( ( IsLower_v<MT1> )
799 ?(
max( ii, ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ) &
size_t(-
SIMDSIZE) ) )
// --- Group of 8 SIMD vectors ---
804 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
806 for(
size_t j=jj; j<jend; ++j ) {
808 xmm1 += A.load(i ,j) * x1;
810 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
811 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
812 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
813 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
814 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
815 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
// The block's partial sum is added onto what y already holds.
818 y.store( i , y.load(i ) + xmm1 );
// --- Group of 4 SIMD vectors ---
832 for(
size_t j=jj; j<jend; ++j ) {
834 xmm1 += A.load(i ,j) * x1;
836 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
837 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
840 y.store( i , y.load(i ) + xmm1 );
// --- Group of 3 SIMD vectors ---
850 for(
size_t j=jj; j<jend; ++j ) {
852 xmm1 += A.load(i ,j) * x1;
854 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
857 y.store( i , y.load(i ) + xmm1 );
// --- Group of 2 SIMD vectors ---
866 for(
size_t j=jj; j<jend; ++j ) {
868 xmm1 += A.load(i ,j) * x1;
872 y.store( i , y.load(i ) + xmm1 );
// --- Single SIMD vector; x[j] is broadcast inline via set() ---
880 for(
size_t j=jj; j<jend; ++j ) {
881 xmm1 += A.load(i,j) *
set( x[j] );
884 y.store( i, y.load(i) + xmm1 );
// --- Scalar remainder: one row at a time, only when 'remainder' is true ---
887 for( ; remainder && i<iend; ++i )
891 for(
size_t j=jj; j<jend; ++j ) {
892 value += A(i,j) * x[j];
// BLAS-kernel overload that is enabled when UseBlasKernel_v is false for
// <VT1,MT1,VT2>; it forwards to the large kernel.
917 template<
typename VT1
920 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
921 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
923 selectLargeAssignKernel( y, A, x );
// BLAS-based assignment kernel, compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set and enabled when
// UseBlasKernel_v holds for <VT1,MT1,VT2>.
929 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 943 template<
typename VT1
946 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
947 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// Element type of the target vector, used for the scalar arguments of gemv.
949 using ET = ElementType_t<VT1>;
// Triangular matrices use trmv with the matching triangle flag. trmv receives
// only y and A here.
// NOTE(review): any preparation of y from x happens on a line not shown.
951 if( IsTriangular_v<MT1> ) {
953 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// Other matrices use gemv with the scalar arguments ET(1) and ET(0)
// (presumably alpha and beta - confirm against the gemv wrapper).
956 gemv( y, A, x, ET(1), ET(0) );
976 template<
typename VT1 >
977 friend inline void assign( SparseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
// Addition assignment of the product expression to a dense vector.
1006 template<
typename VT1 >
1007 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
// Empty-operand check: either dimension of the matrix operand is zero (the
// action taken is on a line not shown in this excerpt).
1013 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Hands the target (~lhs) and the operands A and x to the kernel dispatch;
// the definitions of A and x are not visible here.
1025 TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatch for the addition assignment kernels.
// The visible conditions route diagonal matrices and products whose element
// count (rows * columns) is below TDMATDVECMULT_THRESHOLD to the small kernel;
// the other visible call goes to the BLAS kernel selection.
// NOTE(review): the embedded numbering skips a line between the two visible
// conditions, so additional conditions may exist that are not shown here.
1041 template<
typename VT1
1044 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1046 if( ( IsDiagonal_v<MT1> ) ||
1048 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1049 selectSmallAddAssignKernel( y, A, x );
1051 selectBlasAddAssignKernel( y, A, x );
// Default (scalar, element-wise) addition assignment kernel: every column j
// adds A(i,j) * x[j] onto y[i] over the row range that applies to the matrix
// type.
1070 template<
typename VT1
1073 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M = number of rows of A, N = number of columns of A.
1075 const size_t M( A.rows() );
1076 const size_t N( A.columns() );
1078 for(
size_t j=0UL; j<N; ++j )
// Diagonal matrices contribute only the diagonal element of column j.
1080 if( IsDiagonal_v<MT1> )
1082 y[j] += A(j,j) * x[j];
// Row range for column j: lower matrices start at row j (j+1 if strictly
// lower); upper matrices end before row j+1 (before row j if strictly upper).
// The non-triangular alternatives are on lines not shown here.
1086 const size_t ibegin( ( IsLower_v<MT1> )
1087 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1089 const size_t iend( ( IsUpper_v<MT1> )
1090 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so ipos bounds the part of the
// range that can be processed two rows at a time.
1094 const size_t inum( iend - ibegin );
1095 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
1097 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1098 y[i ] += A(i ,j) * x[j];
1099 y[i+1UL] += A(i+1UL,j) * x[j];
// Leftover row at ipos (the guarding condition is not visible in this excerpt).
1102 y[ipos] += A(ipos,j) * x[j];
// Small-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2>; it forwards to the default addition kernel.
1124 template<
typename VT1
1127 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1128 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1130 selectDefaultAddAssignKernel( y, A, x );
// Vectorized small addition assignment kernel, enabled when
// UseVectorizedDefaultKernel_v holds for <VT1,MT1,VT2>.
// Visible structure: rows are processed in groups of 8, 4, 3, 2 and 1 SIMD
// vectors, followed by a scalar remainder loop.
// NOTE(review): the accumulators are stored straight into y without re-adding
// y, so presumably they are initialized from y on lines not shown here -
// confirm. The loops over i and the definition of x1 are also not visible.
1149 template<
typename VT1
1152 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1153 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder pass is needed unless both MT1 and VT1 are padded.
1155 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1157 const size_t M( A.rows() );
1158 const size_t N( A.columns() );
// With remainder handling, ipos is M masked with size_t(-SIMDSIZE); otherwise
// it is M itself.
1160 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// --- Group of 8 SIMD vectors of rows starting at i ---
// Column range: upper matrices start at column i (i+1 if strictly upper);
// lower matrices end at min( i+SIMDSIZE*8, N ), minus one if strictly lower.
1167 const size_t jbegin( ( IsUpper_v<MT1> )
1168 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1170 const size_t jend( ( IsLower_v<MT1> )
1171 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1184 for(
size_t j=jbegin; j<jend; ++j ) {
1186 xmm1 += A.load(i ,j) * x1;
1187 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1188 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1189 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1190 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
1191 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
1192 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
1193 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
1196 y.store( i , xmm1 );
// --- Group of 4 SIMD vectors (column end uses SIMDSIZE*4) ---
1208 const size_t jbegin( ( IsUpper_v<MT1> )
1209 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1211 const size_t jend( ( IsLower_v<MT1> )
1212 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1221 for(
size_t j=jbegin; j<jend; ++j ) {
1223 xmm1 += A.load(i ,j) * x1;
1224 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1225 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1226 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1229 y.store( i , xmm1 );
// --- Group of 3 SIMD vectors (column end uses SIMDSIZE*3) ---
1237 const size_t jbegin( ( IsUpper_v<MT1> )
1238 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1240 const size_t jend( ( IsLower_v<MT1> )
1241 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1249 for(
size_t j=jbegin; j<jend; ++j ) {
1251 xmm1 += A.load(i ,j) * x1;
1252 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1253 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1256 y.store( i , xmm1 );
// --- Group of 2 SIMD vectors (column end uses SIMDSIZE*2) ---
1263 const size_t jbegin( ( IsUpper_v<MT1> )
1264 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1266 const size_t jend( ( IsLower_v<MT1> )
1267 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1274 for(
size_t j=jbegin; j<jend; ++j ) {
1276 xmm1 += A.load(i ,j) * x1;
1280 y.store( i , xmm1 );
// --- Single SIMD vector; x[j] is broadcast inline via set() ---
1286 const size_t jbegin( ( IsUpper_v<MT1> )
1287 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1289 const size_t jend( ( IsLower_v<MT1> )
1290 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1296 for(
size_t j=jbegin; j<jend; ++j ) {
1297 xmm1 += A.load(i,j) *
set( x[j] );
// --- Scalar remainder: one row at a time, only when 'remainder' is true ---
1303 for( ; remainder && i<M; ++i )
1305 const size_t jbegin( ( IsUpper_v<MT1> )
1306 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1308 const size_t jend( ( IsLower_v<MT1> )
1309 ?(
min( i+1UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
// Accumulates the dot product of row i with x; how 'value' is applied to y is
// on a line not shown here.
1315 for(
size_t j=jbegin; j<jend; ++j ) {
1316 value += A(i,j) * x[j];
// Large-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2>; it forwards to the default addition kernel.
1339 template<
typename VT1
1342 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1343 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1345 selectDefaultAddAssignKernel( y, A, x );
// Vectorized large addition assignment kernel, enabled when
// UseVectorizedDefaultKernel_v holds for <VT1,MT1,VT2>.
// Visible structure: the matrix is walked in blocks of iblock rows by jblock
// columns; within a block, rows are processed in groups of 8, 4, 3, 2 and 1
// SIMD vectors and each group's sum is added onto the current content of y.
// The loops over i, most stores and the definition of x1 are on lines not
// shown in this excerpt.
1364 template<
typename VT1
1367 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1368 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder pass is needed unless both MT1 and VT1 are padded.
1370 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1372 const size_t M( A.rows() );
1373 const size_t N( A.columns() );
// Rows per block: 32768 divided by the element size.
// NOTE(review): presumably sized for a 32 KiB cache - confirm.
1375 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
// Columns per block: 8 when N is smaller than iblock, otherwise 4.
1376 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// Blocked traversal: ii steps over row blocks, jj over column blocks.
1380 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1381 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block ends, clamped to the matrix dimensions.
1383 const size_t jend(
min( jj+jblock, N ) );
1384 const size_t itmp(
min( ii+iblock, M ) );
// For upper matrices the row end is additionally limited by the column block
// end (jend, or jend-1 if strictly upper).
1385 const size_t iend( ( IsUpper_v<MT1> )
1386 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
// With remainder handling, ipos is iend masked with size_t(-SIMDSIZE).
1389 const size_t ipos( remainder ? ( iend &
size_t(-
SIMDSIZE) ) : iend );
// For lower matrices the first row is the larger of ii and the column block
// start (jj, or jj+1 if strictly lower) masked with size_t(-SIMDSIZE).
1392 size_t i( ( IsLower_v<MT1> )
1393 ?(
max( ii, ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ) &
size_t(-
SIMDSIZE) ) )
// --- Group of 8 SIMD vectors ---
1398 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1400 for(
size_t j=jj; j<jend; ++j ) {
1402 xmm1 += A.load(i ,j) * x1;
1403 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1404 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1405 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1406 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
1407 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
1408 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
1409 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
// The block's partial sum is added onto what y already holds.
1412 y.store( i , y.load(i ) + xmm1 );
// --- Group of 4 SIMD vectors ---
1426 for(
size_t j=jj; j<jend; ++j ) {
1428 xmm1 += A.load(i ,j) * x1;
1429 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1430 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1431 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1434 y.store( i , y.load(i ) + xmm1 );
// --- Group of 3 SIMD vectors ---
1444 for(
size_t j=jj; j<jend; ++j ) {
1446 xmm1 += A.load(i ,j) * x1;
1447 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1448 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1451 y.store( i , y.load(i ) + xmm1 );
// --- Group of 2 SIMD vectors ---
1460 for(
size_t j=jj; j<jend; ++j ) {
1462 xmm1 += A.load(i ,j) * x1;
1466 y.store( i , y.load(i ) + xmm1 );
// --- Single SIMD vector; x[j] is broadcast inline via set() ---
1474 for(
size_t j=jj; j<jend; ++j ) {
1475 xmm1 += A.load(i,j) *
set( x[j] );
1478 y.store( i, y.load(i) + xmm1 );
// --- Scalar remainder: one row at a time, only when 'remainder' is true ---
1481 for( ; remainder && i<iend; ++i )
1485 for(
size_t j=jj; j<jend; ++j ) {
1486 value += A(i,j) * x[j];
// BLAS-kernel overload that is enabled when UseBlasKernel_v is false for
// <VT1,MT1,VT2>; it forwards to the large addition kernel.
1511 template<
typename VT1
1514 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1515 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1517 selectLargeAddAssignKernel( y, A, x );
// BLAS-based addition assignment kernel, compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set and
// enabled when UseBlasKernel_v holds for <VT1,MT1,VT2>.
1523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1537 template<
typename VT1
1540 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1541 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// Element type of the target vector, used for the scalar arguments of gemv.
1543 using ET = ElementType_t<VT1>;
// Triangular matrices: copy x into a temporary, apply trmv to the temporary
// with the matching triangle flag, then add the temporary onto y.
1545 if( IsTriangular_v<MT1> ) {
1546 ResultType_t<VT1> tmp(
serial( x ) );
1547 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1548 addAssign( y, tmp );
// Other matrices use gemv with the scalar arguments ET(1) and ET(1)
// (presumably alpha and beta - confirm against the gemv wrapper).
1551 gemv( y, A, x, ET(1), ET(1) );
// Subtraction assignment of the product expression to a dense vector.
1575 template<
typename VT1 >
1576 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
// Empty-operand check: either dimension of the matrix operand is zero (the
// action taken is on a line not shown in this excerpt).
1582 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Hands the target (~lhs) and the operands A and x to the kernel dispatch;
// the definitions of A and x are not visible here.
1594 TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatch for the subtraction assignment kernels.
// The visible conditions route diagonal matrices and products whose element
// count (rows * columns) is below TDMATDVECMULT_THRESHOLD to the small kernel;
// the other visible call goes to the BLAS kernel selection.
// NOTE(review): the embedded numbering skips a line between the two visible
// conditions, so additional conditions may exist that are not shown here.
1610 template<
typename VT1
1613 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1615 if( ( IsDiagonal_v<MT1> ) ||
1617 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1618 selectSmallSubAssignKernel( y, A, x );
1620 selectBlasSubAssignKernel( y, A, x );
// Default (scalar, element-wise) subtraction assignment kernel: every column j
// subtracts A(i,j) * x[j] from y[i] over the row range that applies to the
// matrix type.
1639 template<
typename VT1
1642 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// M = number of rows of A, N = number of columns of A.
1644 const size_t M( A.rows() );
1645 const size_t N( A.columns() );
1647 for(
size_t j=0UL; j<N; ++j )
// Diagonal matrices contribute only the diagonal element of column j.
1649 if( IsDiagonal_v<MT1> )
1651 y[j] -= A(j,j) * x[j];
// Row range for column j: lower matrices start at row j (j+1 if strictly
// lower); upper matrices end before row j+1 (before row j if strictly upper).
// The non-triangular alternatives are on lines not shown here.
1655 const size_t ibegin( ( IsLower_v<MT1> )
1656 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1658 const size_t iend( ( IsUpper_v<MT1> )
1659 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
// inum & size_t(-2) clears the lowest bit, so ipos bounds the part of the
// range that can be processed two rows at a time.
1663 const size_t inum( iend - ibegin );
1664 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
1666 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1667 y[i ] -= A(i ,j) * x[j];
1668 y[i+1UL] -= A(i+1UL,j) * x[j];
// Leftover row at ipos (the guarding condition is not visible in this excerpt).
1671 y[ipos] -= A(ipos,j) * x[j];
// Small-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2>; it forwards to the default subtraction kernel.
1693 template<
typename VT1
1696 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1697 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1699 selectDefaultSubAssignKernel( y, A, x );
// Vectorized small subtraction assignment kernel, enabled when
// UseVectorizedDefaultKernel_v holds for <VT1,MT1,VT2>.
// Visible structure: rows are processed in groups of 8, 4, 3, 2 and 1 SIMD
// vectors, with the products subtracted from the accumulators, followed by a
// scalar remainder loop.
// NOTE(review): the accumulators are stored straight into y, so presumably
// they are initialized from y on lines not shown here - confirm. The loops
// over i and the definition of x1 are also not visible.
1719 template<
typename VT1
1722 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1723 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder pass is needed unless both MT1 and VT1 are padded.
1725 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1727 const size_t M( A.rows() );
1728 const size_t N( A.columns() );
// With remainder handling, ipos is M masked with size_t(-SIMDSIZE); otherwise
// it is M itself.
1730 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// --- Group of 8 SIMD vectors of rows starting at i ---
// Column range: upper matrices start at column i (i+1 if strictly upper);
// lower matrices end at min( i+SIMDSIZE*8, N ), minus one if strictly lower.
1737 const size_t jbegin( ( IsUpper_v<MT1> )
1738 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1740 const size_t jend( ( IsLower_v<MT1> )
1741 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1754 for(
size_t j=jbegin; j<jend; ++j ) {
1756 xmm1 -= A.load(i ,j) * x1;
1757 xmm2 -= A.load(i+
SIMDSIZE ,j) * x1;
1758 xmm3 -= A.load(i+
SIMDSIZE*2UL,j) * x1;
1759 xmm4 -= A.load(i+
SIMDSIZE*3UL,j) * x1;
1760 xmm5 -= A.load(i+
SIMDSIZE*4UL,j) * x1;
1761 xmm6 -= A.load(i+
SIMDSIZE*5UL,j) * x1;
1762 xmm7 -= A.load(i+
SIMDSIZE*6UL,j) * x1;
1763 xmm8 -= A.load(i+
SIMDSIZE*7UL,j) * x1;
1766 y.store( i , xmm1 );
// --- Group of 4 SIMD vectors (column end uses SIMDSIZE*4) ---
1778 const size_t jbegin( ( IsUpper_v<MT1> )
1779 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1781 const size_t jend( ( IsLower_v<MT1> )
1782 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1791 for(
size_t j=jbegin; j<jend; ++j ) {
1793 xmm1 -= A.load(i ,j) * x1;
1794 xmm2 -= A.load(i+
SIMDSIZE ,j) * x1;
1795 xmm3 -= A.load(i+
SIMDSIZE*2UL,j) * x1;
1796 xmm4 -= A.load(i+
SIMDSIZE*3UL,j) * x1;
1799 y.store( i , xmm1 );
// --- Group of 3 SIMD vectors (column end uses SIMDSIZE*3) ---
1807 const size_t jbegin( ( IsUpper_v<MT1> )
1808 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1810 const size_t jend( ( IsLower_v<MT1> )
1811 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1819 for(
size_t j=jbegin; j<jend; ++j ) {
1821 xmm1 -= A.load(i ,j) * x1;
1822 xmm2 -= A.load(i+
SIMDSIZE ,j) * x1;
1823 xmm3 -= A.load(i+
SIMDSIZE*2UL,j) * x1;
1826 y.store( i , xmm1 );
// --- Group of 2 SIMD vectors (column end uses SIMDSIZE*2) ---
1833 const size_t jbegin( ( IsUpper_v<MT1> )
1834 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1836 const size_t jend( ( IsLower_v<MT1> )
1837 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1844 for(
size_t j=jbegin; j<jend; ++j ) {
1846 xmm1 -= A.load(i ,j) * x1;
1850 y.store( i , xmm1 );
// --- Single SIMD vector; x[j] is broadcast inline via set() ---
1856 const size_t jbegin( ( IsUpper_v<MT1> )
1857 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1859 const size_t jend( ( IsLower_v<MT1> )
1860 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1866 for(
size_t j=jbegin; j<jend; ++j ) {
1867 xmm1 -= A.load(i,j) *
set( x[j] );
// --- Scalar remainder: one row at a time, only when 'remainder' is true ---
1873 for( ; remainder && i<M; ++i )
1875 const size_t jbegin( ( IsUpper_v<MT1> )
1876 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1878 const size_t jend( ( IsLower_v<MT1> )
1879 ?(
min( i+1UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
// The dot product of row i with x is summed with += into 'value'; how it is
// applied to y (presumably subtracted) is on a line not shown here.
1885 for(
size_t j=jbegin; j<jend; ++j ) {
1886 value += A(i,j) * x[j];
// Large-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2>; it forwards to the default subtraction kernel.
1909 template<
typename VT1
1912 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1913 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1915 selectDefaultSubAssignKernel( y, A, x );
// Vectorized large subtraction assignment kernel, enabled when
// UseVectorizedDefaultKernel_v holds for <VT1,MT1,VT2>.
// Visible structure: the matrix is walked in blocks of iblock rows by jblock
// columns; within a block, rows are processed in groups of 8, 4, 3, 2 and 1
// SIMD vectors. The products are summed with += and the sum is subtracted
// from the current content of y at the store.
// The loops over i, most stores and the definition of x1 are on lines not
// shown in this excerpt.
1935 template<
typename VT1
1938 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1939 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder pass is needed unless both MT1 and VT1 are padded.
1941 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1943 const size_t M( A.rows() );
1944 const size_t N( A.columns() );
// Rows per block: 32768 divided by the element size.
// NOTE(review): presumably sized for a 32 KiB cache - confirm.
1946 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
// Columns per block: 8 when N is smaller than iblock, otherwise 4.
1947 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// Blocked traversal: ii steps over row blocks, jj over column blocks.
1951 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1952 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block ends, clamped to the matrix dimensions.
1954 const size_t jend(
min( jj+jblock, N ) );
1955 const size_t itmp(
min( ii+iblock, M ) );
// For upper matrices the row end is additionally limited by the column block
// end (jend, or jend-1 if strictly upper).
1956 const size_t iend( ( IsUpper_v<MT1> )
1957 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
// With remainder handling, ipos is iend masked with size_t(-SIMDSIZE).
1960 const size_t ipos( remainder ? ( iend &
size_t(-
SIMDSIZE) ) : iend );
// For lower matrices the first row is the larger of ii and the column block
// start (jj, or jj+1 if strictly lower) masked with size_t(-SIMDSIZE).
1963 size_t i( ( IsLower_v<MT1> )
1964 ?(
max( ii, ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ) &
size_t(-
SIMDSIZE) ) )
// --- Group of 8 SIMD vectors ---
1969 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1971 for(
size_t j=jj; j<jend; ++j ) {
1973 xmm1 += A.load(i ,j) * x1;
1974 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1975 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1976 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1977 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
1978 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
1979 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
1980 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
// The block's partial sum is subtracted from what y already holds.
1983 y.store( i , y.load(i ) - xmm1 );
// --- Group of 4 SIMD vectors ---
1997 for(
size_t j=jj; j<jend; ++j ) {
1999 xmm1 += A.load(i ,j) * x1;
2000 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2001 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2002 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
2005 y.store( i , y.load(i ) - xmm1 );
// --- Group of 3 SIMD vectors ---
2015 for(
size_t j=jj; j<jend; ++j ) {
2017 xmm1 += A.load(i ,j) * x1;
2018 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2019 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2022 y.store( i , y.load(i ) - xmm1 );
// --- Group of 2 SIMD vectors ---
2031 for(
size_t j=jj; j<jend; ++j ) {
2033 xmm1 += A.load(i ,j) * x1;
2037 y.store( i , y.load(i ) - xmm1 );
// --- Single SIMD vector; x[j] is broadcast inline via set() ---
2045 for(
size_t j=jj; j<jend; ++j ) {
2046 xmm1 += A.load(i,j) *
set( x[j] );
2049 y.store( i, y.load(i) - xmm1 );
// --- Scalar remainder: one row at a time, only when 'remainder' is true ---
2052 for( ; remainder && i<iend; ++i )
2056 for(
size_t j=jj; j<jend; ++j ) {
2057 value += A(i,j) * x[j];
// BLAS-kernel overload that is enabled when UseBlasKernel_v is false for
// <VT1,MT1,VT2>; it forwards to the large subtraction kernel.
2082 template<
typename VT1
2085 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2086 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2088 selectLargeSubAssignKernel( y, A, x );
// BLAS-based subtraction assignment kernel, compiled only when both
// BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set and
// enabled when UseBlasKernel_v holds for <VT1,MT1,VT2>.
2094 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2108 template<
typename VT1
2111 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2112 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// Element type of the target vector, used for the scalar arguments of gemv.
2114 using ET = ElementType_t<VT1>;
// Triangular matrices: copy x into a temporary, apply trmv to the temporary
// with the matching triangle flag, then subtract the temporary from y.
2116 if( IsTriangular_v<MT1> ) {
2117 ResultType_t<VT1> tmp(
serial( x ) );
2118 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2119 subAssign( y, tmp );
// Other matrices use gemv with the scalar arguments ET(-1) and ET(1)
// (presumably alpha and beta - confirm against the gemv wrapper).
2122 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication assignment of the product expression to a dense vector.
// The visible call applies multAssign to the target (~lhs) with an object
// named tmp; the construction of tmp is on lines not shown in this excerpt.
2146 template<
typename VT1 >
2147 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
2158 multAssign( ~lhs, tmp );
// Division assignment of the product expression to a dense vector.
// The visible call applies divAssign to the target (~lhs) with an object
// named tmp; the construction of tmp is on lines not shown in this excerpt.
2180 template<
typename VT1 >
2181 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
2192 divAssign( ~lhs, tmp );
2216 template<
typename VT1 >
2218 -> EnableIf_t< UseSMPAssign_v<VT1> >
2224 if( rhs.mat_.rows() == 0UL ) {
2227 else if( rhs.mat_.columns() == 0UL ) {
2260 template<
typename VT1 >
2262 -> EnableIf_t< UseSMPAssign_v<VT1> >
2293 template<
typename VT1 >
2295 -> EnableIf_t< UseSMPAssign_v<VT1> >
2301 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2337 template<
typename VT1 >
2339 -> EnableIf_t< UseSMPAssign_v<VT1> >
2345 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2381 template<
typename VT1 >
2383 -> EnableIf_t< UseSMPAssign_v<VT1> >
2418 template<
typename VT1 >
2420 -> EnableIf_t< UseSMPAssign_v<VT1> >
2470 template<
typename MT
2473 class DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >
2474 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false > >
2475 ,
private Computation
// Shorthand aliases for the scaled matrix/vector product specialization.
// MVM: the wrapped matrix/vector multiplication expression type.
2479 using MVM = TDMatDVecMultExpr<MT,VT>;
// RES: result type of the multiplication expression.
2480 using RES = ResultType_t<MVM>;
// MRT / VRT: result types of the matrix and vector operand types.
2481 using MRT = ResultType_t<MT>;
2482 using VRT = ResultType_t<VT>;
// MET / VET: element types of the matrix and vector result types.
2483 using MET = ElementType_t<MRT>;
2484 using VET = ElementType_t<VRT>;
// MCT / VCT: composite types of the matrix and vector operand types.
2485 using MCT = CompositeType_t<MT>;
2486 using VCT = CompositeType_t<VT>;
// Compile-time flag: true when the matrix operand type requires evaluation, or
// when it is a computation whose element type equals the vector element type
// and is BLAS compatible.
2491 static constexpr
bool evaluateMatrix =
2492 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2493 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time flag: true when the vector operand type VT is a computation
// expression or requires evaluation.
2498 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
// Compile-time flag: true when either operand has to be evaluated
// (evaluateMatrix or evaluateVector). The template parameter T1 does not
// appear in the visible expression.
2506 template<
typename T1 >
2507 static constexpr
bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
// Compile-time predicate for selecting the BLAS kernel, given the target
// vector type T1, matrix type T2, vector type T3 and scalar type T4.
// Visible requirements: all three operand types are contiguous with data
// access (mutable for T1, const for T2 and T3), T2 is not diagonal, all three
// are SIMD enabled, all element types are BLAS compatible and identical, and
// the combination "builtin element type with complex T4" is excluded.
// NOTE(review): the first line of the initializer is not shown in this
// excerpt (the visible parentheses are unbalanced), so further conditions may
// apply.
2514 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2515 static constexpr
bool UseBlasKernel_v =
2517 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2518 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2519 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2520 !IsDiagonal_v<T2> &&
2521 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2522 IsBLASCompatible_v< ElementType_t<T1> > &&
2523 IsBLASCompatible_v< ElementType_t<T2> > &&
2524 IsBLASCompatible_v< ElementType_t<T3> > &&
2525 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2526 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2527 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time predicate for selecting the vectorized default kernels, given
// the target vector type T1, matrix type T2, vector type T3 and scalar type T4.
// Visible requirements: useOptimizedKernels is set, T2 is not diagonal, all
// three operand types are SIMD enabled, and SIMD addition and multiplication
// exist for the element types of T2 and T3.
// NOTE(review): the IsSIMDCombinable_v condition is cut off in this excerpt;
// its remaining arguments are on lines not shown.
2535 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2536 static constexpr
bool UseVectorizedDefaultKernel_v =
2537 ( useOptimizedKernels &&
2538 !IsDiagonal_v<T2> &&
2539 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2540 IsSIMDCombinable_v< ElementType_t<T1>
2544 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2545 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// This: the type of this specialization; BaseType: its DenseVector base.
2550 using This = DVecScalarMultExpr<MVM,ST,false>;
2551 using BaseType = DenseVector<This,false>;
// SIMD type that corresponds to ElementType (ElementType itself is declared on
// a line not shown in this excerpt).
2555 using SIMDType = SIMDTrait_t<ElementType>;
// Left operand of the scaling expression: the matrix/vector product.
2560 using LeftOperand =
const TDMatDVecMultExpr<MT,VT>;
// Types used for the matrix (LT) and vector (RT) operands inside the kernels,
// chosen via If_t from the evaluate flags: the const result type or the
// composite type (presumably the result type when the flag is true - confirm
// against If_t).
2566 using LT = If_t< evaluateMatrix, const MRT, MCT >;
2569 using RT = If_t< evaluateVector, const VRT, VCT >;
2575 ( !IsDiagonal_v<MT> &&
2576 MT::simdEnabled && VT::simdEnabled &&
2577 IsSIMDCombinable_v<MET,VET,ST> &&
2578 HasSIMDAdd_v<MET,VET> &&
2579 HasSIMDMult_v<MET,VET> );
2623 if( index >=
vector_.size() ) {
2626 return (*
this)[index];
2635 inline size_t size()
const {
// Alias query: forwards to canAlias() of the wrapped expression vector_.
2666 template<
typename T >
2667 inline bool canAlias(
const T* alias )
const {
2668 return vector_.canAlias( alias );
// Alias query: forwards to isAliased() of the wrapped expression vector_.
2678 template<
typename T >
2679 inline bool isAliased(
const T* alias )
const {
2680 return vector_.isAliased( alias );
2700 LeftOperand_t<MVM> A(
vector_.leftOperand() );
2704 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2705 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2706 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
// Assignment of the scaled matrix/vector product to a dense vector.
2728 template<
typename VT1 >
2729 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the wrapped multiplication expression.
2735 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
2736 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Empty-operand checks on the matrix operand (zero rows, then zero columns);
// the actions taken are on lines not shown in this excerpt.
2738 if( left.rows() == 0UL ) {
2741 else if( left.columns() == 0UL ) {
// Hands the target (~lhs), the operands A and x and the scalar to the kernel
// dispatch; the definitions of A and x are not visible here.
2754 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatch for the scaled assignment kernels.
// The small kernel is chosen when the matrix is diagonal, when MT is a
// computation that is not evaluated, or when the element count
// (rows * columns) is below TDMATDVECMULT_THRESHOLD; the other visible call
// goes to the BLAS kernel selection.
2769 template<
typename VT1
2773 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2775 if( ( IsDiagonal_v<MT1> ) ||
2776 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2777 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2778 selectSmallAssignKernel( y, A, x, scalar );
2780 selectBlasAssignKernel( y, A, x, scalar );
// Default (scalar, element-wise) assignment kernel of the scaled product.
// Visible structure: column 0 initializes y, the remaining columns accumulate
// A(i,j) * x[j] into y[i] without the scalar, and a final loop over the rows
// runs for non-diagonal matrices. Diagonal matrices apply the scalar directly
// in the per-column statement.
2798 template<
typename VT1
2802 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// M = number of rows of A, N = number of columns of A.
2804 const size_t M( A.rows() );
2805 const size_t N( A.columns() );
// Special case for strictly lower matrices; the action taken is on a line not
// shown in this excerpt.
2807 if( IsStrictlyLower_v<MT1> ) {
// Unless A is upper, column 0 initializes y by plain assignment. Row 0 is
// skipped for strictly lower matrices.
2811 if( !IsUpper_v<MT1> )
2813 for(
size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<M; ++i ) {
2814 y[i] = A(i,0UL) * x[0UL];
// Remaining columns. The loop starts at column 0 only for upper matrices that
// are not strictly upper; in every other case it starts at column 1.
2818 for(
size_t j=( IsUpper_v<MT1> && !IsStrictlyUpper_v<MT1> ? 0UL : 1UL ); j<N; ++j )
// Diagonal matrices: the diagonal product is scaled right away.
2820 if( IsDiagonal_v<MT1> )
2822 y[j] = A(j,j) * x[j] * scalar;
// Row range for column j: lower matrices start at row j (j+1 if strictly
// lower); upper matrices end at row j (j-1 if strictly upper). The
// alternatives for non-triangular matrices are on lines not shown here.
2826 const size_t ibegin( ( IsLower_v<MT1> )
2827 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2829 const size_t iend( ( IsUpper_v<MT1> )
2830 ?( IsStrictlyUpper_v<MT1> ? j-1UL : j )
// inum & size_t(-2) clears the lowest bit, so ipos bounds the part of the
// range that can be processed two rows at a time.
2834 const size_t inum( iend - ibegin );
2835 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// Two rows per iteration.
2837 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2838 y[i ] += A(i ,j) * x[j];
2839 y[i+1UL] += A(i+1UL,j) * x[j];
// Leftover row at ipos (the guarding condition is not visible in this excerpt).
2842 y[ipos] += A(ipos,j) * x[j];
// For upper matrices row iend is assigned (not accumulated) in column j.
2844 if( IsUpper_v<MT1> ) {
2845 y[iend] = A(iend,j) * x[j];
// Special case for strictly upper matrices; the action taken is not shown.
2850 if( IsStrictlyUpper_v<MT1> ) {
// Final pass over the rows for non-diagonal matrices, skipping row 0 for
// strictly lower and the last row for strictly upper matrices.
// NOTE(review): the loop body is not visible; presumably it scales y[i] by
// 'scalar', since the accumulation above does not apply it - confirm.
2854 if( !IsDiagonal_v<MT1> )
2856 const size_t iend( IsStrictlyUpper_v<MT1> ? M-1UL : M );
2857 for(
size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<iend; ++i ) {
// Small-kernel overload that is enabled when UseVectorizedDefaultKernel_v is
// false for <VT1,MT1,VT2,ST2>; it forwards to the default kernel, passing the
// scalar along.
2878 template<
typename VT1
2882 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2883 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
2885 selectDefaultAssignKernel( y, A, x, scalar );
// Small-operand assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. For groups of rows starting at
// i it accumulates A.load(i+k*SIMDSIZE,j) * x[j] over the columns j in [jbegin,jend)
// and overwrites y with the accumulated values times `scalar`.
// NOTE(review): the loop headers over `i`, the declarations of `i`, of `xmm1` in the
// single-accumulator section and of `value`, and the alternatives of the jbegin/jend
// conditionals are not present in this listing.
2903 template<
typename VT1
2907 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2908 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar tail loop is needed whenever the matrix or the target vector is not padded.
2910 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
2912 const size_t M( A.rows() );
2913 const size_t N( A.columns() );
// End of the SIMD-processed row range: M with its low bits masked off when a remainder
// loop exists (a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two), else M.
2915 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// The scalar broadcast into a SIMD value, applied once per store.
2918 const SIMDType factor(
set( scalar ) );
// Section with eight accumulators (rows i .. i+SIMDSIZE*8).
2924 const size_t jbegin( ( IsUpper_v<MT1> )
2925 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2927 const size_t jend( ( IsLower_v<MT1> )
2928 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
2932 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 for(
size_t j=jbegin; j<jend; ++j ) {
2935 const SIMDType x1(
set( x[j] ) );
2936 xmm1 += A.load(i ,j) * x1;
2937 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2938 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2939 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
2940 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
2941 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
2942 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
2943 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
2946 y.store( i , xmm1*factor );
2947 y.store( i+
SIMDSIZE , xmm2*factor );
2948 y.store( i+
SIMDSIZE*2UL, xmm3*factor );
2949 y.store( i+
SIMDSIZE*3UL, xmm4*factor );
2950 y.store( i+
SIMDSIZE*4UL, xmm5*factor );
2951 y.store( i+
SIMDSIZE*5UL, xmm6*factor );
2952 y.store( i+
SIMDSIZE*6UL, xmm7*factor );
2953 y.store( i+
SIMDSIZE*7UL, xmm8*factor );
// Section with four accumulators.
2958 const size_t jbegin( ( IsUpper_v<MT1> )
2959 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2961 const size_t jend( ( IsLower_v<MT1> )
2962 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
2966 SIMDType xmm1, xmm2, xmm3, xmm4;
2968 for(
size_t j=jbegin; j<jend; ++j ) {
2969 const SIMDType x1(
set( x[j] ) );
2970 xmm1 += A.load(i ,j) * x1;
2971 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2972 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2973 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
2976 y.store( i , xmm1*factor );
2977 y.store( i+
SIMDSIZE , xmm2*factor );
2978 y.store( i+
SIMDSIZE*2UL, xmm3*factor );
2979 y.store( i+
SIMDSIZE*3UL, xmm4*factor );
// Section with three accumulators.
2984 const size_t jbegin( ( IsUpper_v<MT1> )
2985 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2987 const size_t jend( ( IsLower_v<MT1> )
2988 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
2992 SIMDType xmm1, xmm2, xmm3;
2994 for(
size_t j=jbegin; j<jend; ++j ) {
2995 const SIMDType x1(
set( x[j] ) );
2996 xmm1 += A.load(i ,j) * x1;
2997 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2998 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3001 y.store( i , xmm1*factor );
3002 y.store( i+
SIMDSIZE , xmm2*factor );
3003 y.store( i+
SIMDSIZE*2UL, xmm3*factor );
// Section with two accumulators (the xmm2 accumulation line is not in this listing).
3008 const size_t jbegin( ( IsUpper_v<MT1> )
3009 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3011 const size_t jend( ( IsLower_v<MT1> )
3012 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3016 SIMDType xmm1, xmm2;
3018 for(
size_t j=jbegin; j<jend; ++j ) {
3019 const SIMDType x1(
set( x[j] ) );
3020 xmm1 += A.load(i ,j) * x1;
3024 y.store( i , xmm1*factor );
3025 y.store( i+
SIMDSIZE, xmm2*factor );
// Section with a single accumulator.
3030 const size_t jbegin( ( IsUpper_v<MT1> )
3031 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3033 const size_t jend( ( IsLower_v<MT1> )
3034 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3040 for(
size_t j=jbegin; j<jend; ++j ) {
3041 const SIMDType x1(
set( x[j] ) );
3042 xmm1 += A.load(i,j) * x1;
3045 y.store( i, xmm1*factor );
// Scalar tail: remaining rows up to M are computed element by element.
3048 for( ; remainder && i<M; ++i )
3050 const size_t jbegin( ( IsUpper_v<MT1> )
3051 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3053 const size_t jend( ( IsLower_v<MT1> )
3054 ?(
min( i+1UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3060 for(
size_t j=jbegin; j<jend; ++j ) {
3061 value += A(i,j) * x[j];
3064 y[i] = value * scalar;
// Large-operand assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false: forwards to the default
// assignment kernel.
3083 template<
typename VT1
3087 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3088 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3090 selectDefaultAssignKernel( y, A, x, scalar );
// Large-operand assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. The operands are traversed in
// blocks of iblock rows by jblock columns; each block's contribution, multiplied by
// `scalar`, is added onto the current content of y.
// NOTE(review): because every visible store accumulates (y.load(i) + ...), y must hold
// its initial value before the block loops; that statement, the loop headers over `i`,
// the declarations of `xmm1` (single-accumulator section) and `value`, and the
// alternatives of the iend / i conditionals are not present in this listing - confirm.
3108 template<
typename VT1
3112 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3113 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar tail loop is needed whenever the matrix or the target vector is not padded.
3115 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3117 const size_t M( A.rows() );
3118 const size_t N( A.columns() );
// Row block size: the number of elements that fit into 32768 bytes.
// Column block size: 8 when N is smaller than iblock, otherwise 4.
3120 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3121 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// The scalar broadcast into a SIMD value, applied once per store.
3125 const SIMDType factor(
set( scalar ) );
// Blocked traversal: ii walks row blocks, jj walks column blocks.
3129 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3130 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block limits, clipped to the matrix dimensions and (for upper matrices) to the
// rows that can be reached from the current column block.
3132 const size_t jend(
min( jj+jblock, N ) );
3133 const size_t itmp(
min( ii+iblock, M ) );
3134 const size_t iend( ( IsUpper_v<MT1> )
3135 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
// End of the SIMD-processed rows of this block (low bits of iend masked off when a
// remainder loop exists).
3138 const size_t ipos( remainder ? ( iend &
size_t(-
SIMDSIZE) ) : iend );
// First row of the block; for lower matrices derived from jj with its low bits masked.
3141 size_t i( ( IsLower_v<MT1> )
3142 ?(
max( ii, ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ) &
size_t(-
SIMDSIZE) ) )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t j=jj; j<jend; ++j ) {
3150 const SIMDType x1(
set( x[j] ) );
3151 xmm1 += A.load(i ,j) * x1;
3152 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3153 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3154 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3155 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3156 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3157 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3158 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
// Only the first of the stores of this section is present in this listing.
3161 y.store( i , y.load(i ) + xmm1*factor );
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t j=jj; j<jend; ++j ) {
3176 const SIMDType x1(
set( x[j] ) );
3177 xmm1 += A.load(i ,j) * x1;
3178 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3179 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3180 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3183 y.store( i , y.load(i ) + xmm1*factor );
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t j=jj; j<jend; ++j ) {
3194 const SIMDType x1(
set( x[j] ) );
3195 xmm1 += A.load(i ,j) * x1;
3196 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3197 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3200 y.store( i , y.load(i ) + xmm1*factor );
3207 SIMDType xmm1, xmm2;
3209 for(
size_t j=jj; j<jend; ++j ) {
3210 const SIMDType x1(
set( x[j] ) );
3211 xmm1 += A.load(i ,j) * x1;
3215 y.store( i , y.load(i ) + xmm1*factor );
3223 for(
size_t j=jj; j<jend; ++j ) {
3224 xmm1 += A.load(i,j) *
set( x[j] );
3227 y.store( i, y.load(i) + xmm1*factor );
// Scalar tail: remaining rows of the block are accumulated element by element.
3230 for( ; remainder && i<iend; ++i )
3234 for(
size_t j=jj; j<jend; ++j ) {
3235 value += A(i,j) * x[j];
3238 y[i] += value * scalar;
// BLAS assignment kernel, overload enabled when UseBlasKernel_v<VT1,MT1,VT2,ST2> is
// false: falls back to the large-operand assignment kernel.
3259 template<
typename VT1
3263 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3264 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3266 selectLargeAssignKernel( y, A, x, scalar );
// BLAS assignment kernel, overload enabled when UseBlasKernel_v<VT1,MT1,VT2,ST2> is
// true; only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
3271 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3285 template<
typename VT1
3289 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3290 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3292 using ET = ElementType_t<VT1>;
// Triangular matrices: y is first set to scalar * x and then passed to trmv together
// with A, with the lower/upper flag chosen from the matrix type.
3294 if( IsTriangular_v<MT1> ) {
3295 assign( y, scalar * x );
3296 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// Otherwise (the `else` line is not present in this listing): gemv is called with the
// scalar and a zero as trailing factors, both converted to the element type of y.
3299 gemv( y, A, x,
ET(scalar),
ET(0) );
// Assignment of the scaled product expression to a sparse vector target.
// The visible statement assigns an intermediate object `tmp` to the target;
// NOTE(review): the construction of `tmp` is not present in this listing.
3317 template<
typename VT1 >
3318 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3329 assign( ~lhs, tmp );
// Addition assignment of the scaled product expression to a dense vector target.
// NOTE(review): the early-return body and the definitions of `A` and `x` are not
// present in this listing.
3345 template<
typename VT1 >
3346 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the matrix and vector operands of the wrapped multiplication expression.
3352 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3353 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns.
3355 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel dispatcher together with the scalar of the expression.
3367 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for the addition assignment. The small kernel is used when the
// matrix type is diagonal, when MT is a computation that is not evaluated beforehand,
// or when rows*columns is below TDMATDVECMULT_THRESHOLD; the BLAS kernel selector is
// used in the remaining case (the `else` line is not present in this listing).
3382 template<
typename VT1
3386 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3388 if( ( IsDiagonal_v<MT1> ) ||
3389 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3390 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3391 selectSmallAddAssignKernel( y, A, x, scalar );
3393 selectBlasAddAssignKernel( y, A, x, scalar );
// Default kernel for the addition assignment: passes the expression A * x * scalar
// to the addAssign member of the target vector.
3411 template<
typename VT1
3415 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3417 y.addAssign( A * x * scalar );
// Small-operand addition assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false: forwards to the default
// addition assignment kernel.
3435 template<
typename VT1
3439 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3440 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3442 selectDefaultAddAssignKernel( y, A, x, scalar );
// Small-operand addition assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. For groups of rows starting at
// i it accumulates A.load(i+k*SIMDSIZE,j) * x[j] over the columns j in [jbegin,jend)
// and adds the accumulated values times `scalar` onto the current content of y.
// NOTE(review): the loop headers over `i`, the declarations of `i`, of `xmm1` in the
// single-accumulator section and of `value`, the alternatives of the jbegin/jend
// conditionals, and all but the first store of each section are not present in this
// listing.
3461 template<
typename VT1
3465 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3466 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar tail loop is needed whenever the matrix or the target vector is not padded.
3468 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3470 const size_t M( A.rows() );
3471 const size_t N( A.columns() );
// End of the SIMD-processed row range: M with its low bits masked off when a remainder
// loop exists (a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two), else M.
3473 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// The scalar broadcast into a SIMD value, applied once per store.
3476 const SIMDType factor(
set( scalar ) );
// Section with eight accumulators.
3482 const size_t jbegin( ( IsUpper_v<MT1> )
3483 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3485 const size_t jend( ( IsLower_v<MT1> )
3486 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3490 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3492 for(
size_t j=jbegin; j<jend; ++j ) {
3493 const SIMDType x1(
set( x[j] ) );
3494 xmm1 += A.load(i ,j) * x1;
3495 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3496 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3497 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3498 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3499 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3500 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3501 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
3504 y.store( i , y.load(i ) + xmm1*factor );
// Section with four accumulators.
3516 const size_t jbegin( ( IsUpper_v<MT1> )
3517 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3519 const size_t jend( ( IsLower_v<MT1> )
3520 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3524 SIMDType xmm1, xmm2, xmm3, xmm4;
3526 for(
size_t j=jbegin; j<jend; ++j ) {
3527 const SIMDType x1(
set( x[j] ) );
3528 xmm1 += A.load(i ,j) * x1;
3529 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3530 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3531 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3534 y.store( i , y.load(i ) + xmm1*factor );
// Section with three accumulators.
3542 const size_t jbegin( ( IsUpper_v<MT1> )
3543 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3545 const size_t jend( ( IsLower_v<MT1> )
3546 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3550 SIMDType xmm1, xmm2, xmm3;
3552 for(
size_t j=jbegin; j<jend; ++j ) {
3553 const SIMDType x1(
set( x[j] ) );
3554 xmm1 += A.load(i ,j) * x1;
3555 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3556 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3559 y.store( i , y.load(i ) + xmm1*factor );
// Section with two accumulators (the xmm2 accumulation line is not in this listing).
3566 const size_t jbegin( ( IsUpper_v<MT1> )
3567 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3569 const size_t jend( ( IsLower_v<MT1> )
3570 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3574 SIMDType xmm1, xmm2;
3576 for(
size_t j=jbegin; j<jend; ++j ) {
3577 const SIMDType x1(
set( x[j] ) );
3578 xmm1 += A.load(i ,j) * x1;
3582 y.store( i , y.load(i ) + xmm1*factor );
// Section with a single accumulator.
3588 const size_t jbegin( ( IsUpper_v<MT1> )
3589 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3591 const size_t jend( ( IsLower_v<MT1> )
3592 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3598 for(
size_t j=jbegin; j<jend; ++j ) {
3599 xmm1 += A.load(i,j) *
set( x[j] );
3602 y.store( i, y.load(i) + xmm1*factor );
// Scalar tail: remaining rows up to M are accumulated element by element.
3605 for( ; remainder && i<M; ++i )
3607 const size_t jbegin( ( IsUpper_v<MT1> )
3608 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3610 const size_t jend( ( IsLower_v<MT1> )
3611 ?(
min( i+1UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3617 for(
size_t j=jbegin; j<jend; ++j ) {
3618 value += A(i,j) * x[j];
3621 y[i] += value * scalar;
// Large-operand addition assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false: forwards to the default
// addition assignment kernel.
3640 template<
typename VT1
3644 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3645 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3647 selectDefaultAddAssignKernel( y, A, x, scalar );
// Large-operand addition assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. The operands are traversed in
// blocks of iblock rows by jblock columns; each block's contribution, multiplied by
// `scalar`, is added onto the current content of y.
// NOTE(review): the loop headers over `i`, the declarations of `xmm1` (single-
// accumulator section) and `value`, the alternatives of the iend / i conditionals and
// all but the first store of each section are not present in this listing.
3666 template<
typename VT1
3670 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3671 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar tail loop is needed whenever the matrix or the target vector is not padded.
3673 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3675 const size_t M( A.rows() );
3676 const size_t N( A.columns() );
// Row block size: the number of elements that fit into 32768 bytes.
// Column block size: 8 when N is smaller than iblock, otherwise 4.
3678 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3679 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// The scalar broadcast into a SIMD value, applied once per store.
3683 const SIMDType factor(
set( scalar ) );
// Blocked traversal: ii walks row blocks, jj walks column blocks.
3685 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3686 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block limits, clipped to the matrix dimensions and (for upper matrices) to the
// rows that can be reached from the current column block.
3688 const size_t jend(
min( jj+jblock, N ) );
3689 const size_t itmp(
min( ii+iblock, M ) );
3690 const size_t iend( ( IsUpper_v<MT1> )
3691 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
// End of the SIMD-processed rows of this block (low bits of iend masked off when a
// remainder loop exists).
3694 const size_t ipos( remainder ? ( iend &
size_t(-
SIMDSIZE) ) : iend );
// First row of the block; for lower matrices derived from jj with its low bits masked.
3697 size_t i( ( IsLower_v<MT1> )
3698 ?(
max( ii, ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ) &
size_t(-
SIMDSIZE) ) )
3703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3705 for(
size_t j=jj; j<jend; ++j ) {
3706 const SIMDType x1(
set( x[j] ) );
3707 xmm1 += A.load(i ,j) * x1;
3708 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3709 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3710 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3711 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3712 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3713 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3714 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
3717 y.store( i , y.load(i ) + xmm1*factor );
3729 SIMDType xmm1, xmm2, xmm3, xmm4;
3731 for(
size_t j=jj; j<jend; ++j ) {
3732 const SIMDType x1(
set( x[j] ) );
3733 xmm1 += A.load(i ,j) * x1;
3734 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3735 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3736 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3739 y.store( i , y.load(i ) + xmm1*factor );
3747 SIMDType xmm1, xmm2, xmm3;
3749 for(
size_t j=jj; j<jend; ++j ) {
3750 const SIMDType x1(
set( x[j] ) );
3751 xmm1 += A.load(i ,j) * x1;
3752 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3753 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3756 y.store( i , y.load(i ) + xmm1*factor );
3763 SIMDType xmm1, xmm2;
3765 for(
size_t j=jj; j<jend; ++j ) {
3766 const SIMDType x1(
set( x[j] ) );
3767 xmm1 += A.load(i ,j) * x1;
3771 y.store( i , y.load(i ) + xmm1*factor );
3779 for(
size_t j=jj; j<jend; ++j ) {
3780 xmm1 += A.load(i,j) *
set( x[j] );
3783 y.store( i, y.load(i) + xmm1*factor );
// Scalar tail: remaining rows of the block are accumulated element by element.
3786 for( ; remainder && i<iend; ++i )
3790 for(
size_t j=jj; j<jend; ++j ) {
3791 value += A(i,j) * x[j];
3794 y[i] += value * scalar;
// BLAS addition assignment kernel, overload enabled when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is false: falls back to the large-operand addition
// assignment kernel.
3815 template<
typename VT1
3819 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3820 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3822 selectLargeAddAssignKernel( y, A, x, scalar );
// BLAS addition assignment kernel, overload enabled when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is true; only compiled when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
3827 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3841 template<
typename VT1
3845 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3846 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3848 using ET = ElementType_t<VT1>;
// Triangular matrices: a temporary holding scalar * x (evaluated via serial()) is
// passed to trmv together with A, and the result is then added onto y.
3850 if( IsTriangular_v<MT1> ) {
3851 ResultType_t<VT1> tmp(
serial( scalar * x ) );
3852 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3853 addAssign( y, tmp );
// Otherwise (the `else` line is not present in this listing): gemv is called with the
// scalar and a one as trailing factors, both converted to the element type of y.
3856 gemv( y, A, x,
ET(scalar),
ET(1) );
// Subtraction assignment of the scaled product expression to a dense vector target.
// NOTE(review): the early-return body and the definitions of `A` and `x` are not
// present in this listing.
3878 template<
typename VT1 >
3879 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the matrix and vector operands of the wrapped multiplication expression.
3885 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3886 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns.
3888 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Hand over to the kernel dispatcher together with the scalar of the expression.
3900 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for the subtraction assignment. The small kernel is used when the
// matrix type is diagonal, when MT is a computation that is not evaluated beforehand,
// or when rows*columns is below TDMATDVECMULT_THRESHOLD; the BLAS kernel selector is
// used in the remaining case (the `else` line is not present in this listing).
3915 template<
typename VT1
3919 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3921 if( ( IsDiagonal_v<MT1> ) ||
3922 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3923 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3924 selectSmallSubAssignKernel( y, A, x, scalar );
3926 selectBlasSubAssignKernel( y, A, x, scalar );
// Default kernel for the subtraction assignment: passes the expression A * x * scalar
// to the subAssign member of the target vector.
3944 template<
typename VT1
3948 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3950 y.subAssign( A * x * scalar );
// Small-operand subtraction assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false: forwards to the default
// subtraction assignment kernel.
3968 template<
typename VT1
3972 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3973 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3975 selectDefaultSubAssignKernel( y, A, x, scalar );
// Small-operand subtraction assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. For groups of rows starting at
// i it accumulates A.load(i+k*SIMDSIZE,j) * x[j] over the columns j in [jbegin,jend)
// and subtracts the accumulated values times `scalar` from the current content of y.
// NOTE(review): the loop headers over `i`, the declarations of `i`, of `xmm1` in the
// single-accumulator section and of `value`, the alternatives of the jbegin/jend
// conditionals, and all but the first store of each section are not present in this
// listing.
3994 template<
typename VT1
3998 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3999 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar tail loop is needed whenever the matrix or the target vector is not padded.
4001 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
4003 const size_t M( A.rows() );
4004 const size_t N( A.columns() );
// End of the SIMD-processed row range: M with its low bits masked off when a remainder
// loop exists (a multiple of SIMDSIZE, assuming SIMDSIZE is a power of two), else M.
4006 const size_t ipos( remainder ? ( M &
size_t(-
SIMDSIZE) ) : M );
// The scalar broadcast into a SIMD value, applied once per store.
4009 const SIMDType factor(
set( scalar ) );
// Section with eight accumulators.
4015 const size_t jbegin( ( IsUpper_v<MT1> )
4016 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4018 const size_t jend( ( IsLower_v<MT1> )
4019 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4023 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4025 for(
size_t j=jbegin; j<jend; ++j ) {
4026 const SIMDType x1(
set( x[j] ) );
4027 xmm1 += A.load(i ,j) * x1;
4028 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4029 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4030 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4031 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
4032 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
4033 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
4034 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
4037 y.store( i , y.load(i ) - xmm1*factor );
// Section with four accumulators.
4049 const size_t jbegin( ( IsUpper_v<MT1> )
4050 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4052 const size_t jend( ( IsLower_v<MT1> )
4053 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4057 SIMDType xmm1, xmm2, xmm3, xmm4;
4059 for(
size_t j=jbegin; j<jend; ++j ) {
4060 const SIMDType x1(
set( x[j] ) );
4061 xmm1 += A.load(i ,j) * x1;
4062 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4063 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4064 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4067 y.store( i , y.load(i ) - xmm1*factor );
// Section with three accumulators.
4075 const size_t jbegin( ( IsUpper_v<MT1> )
4076 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4078 const size_t jend( ( IsLower_v<MT1> )
4079 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4083 SIMDType xmm1, xmm2, xmm3;
4085 for(
size_t j=jbegin; j<jend; ++j ) {
4086 const SIMDType x1(
set( x[j] ) );
4087 xmm1 += A.load(i ,j) * x1;
4088 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4089 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4092 y.store( i , y.load(i ) - xmm1*factor );
// Section with two accumulators (the xmm2 accumulation line is not in this listing).
4099 const size_t jbegin( ( IsUpper_v<MT1> )
4100 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4102 const size_t jend( ( IsLower_v<MT1> )
4103 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4107 SIMDType xmm1, xmm2;
4109 for(
size_t j=jbegin; j<jend; ++j ) {
4110 const SIMDType x1(
set( x[j] ) );
4111 xmm1 += A.load(i ,j) * x1;
4115 y.store( i , y.load(i ) - xmm1*factor );
// Section with a single accumulator.
4121 const size_t jbegin( ( IsUpper_v<MT1> )
4122 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4124 const size_t jend( ( IsLower_v<MT1> )
4125 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4131 for(
size_t j=jbegin; j<jend; ++j ) {
4132 xmm1 += A.load(i,j) *
set( x[j] );
4135 y.store( i, y.load(i) - xmm1*factor );
// Scalar tail: remaining rows up to M are processed element by element.
4138 for( ; remainder && i<M; ++i )
4140 const size_t jbegin( ( IsUpper_v<MT1> )
4141 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4143 const size_t jend( ( IsLower_v<MT1> )
4144 ?(
min( i+1UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4150 for(
size_t j=jbegin; j<jend; ++j ) {
4151 value += A(i,j) * x[j];
4154 y[i] -= value * scalar;
// Large-operand subtraction assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false: forwards to the default
// subtraction assignment kernel.
4173 template<
typename VT1
4177 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4178 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4180 selectDefaultSubAssignKernel( y, A, x, scalar );
// Large-operand subtraction assignment kernel, overload enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. The operands are traversed in
// blocks of iblock rows by jblock columns; each block's contribution, multiplied by
// `scalar`, is subtracted from the current content of y.
// NOTE(review): the loop headers over `i`, the declarations of `xmm1` (single-
// accumulator section) and `value`, the alternatives of the iend / i conditionals and
// all but the first store of each section are not present in this listing.
4199 template<
typename VT1
4203 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4204 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar tail loop is needed whenever the matrix or the target vector is not padded.
4206 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
4208 const size_t M( A.rows() );
4209 const size_t N( A.columns() );
// Row block size: the number of elements that fit into 32768 bytes.
// Column block size: 8 when N is smaller than iblock, otherwise 4.
4211 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
4212 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// The scalar broadcast into a SIMD value, applied once per store.
4216 const SIMDType factor(
set( scalar ) );
// Blocked traversal: ii walks row blocks, jj walks column blocks.
4218 for(
size_t ii=0U; ii<M; ii+=iblock ) {
4219 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block limits, clipped to the matrix dimensions and (for upper matrices) to the
// rows that can be reached from the current column block.
4221 const size_t jend(
min( jj+jblock, N ) );
4222 const size_t itmp(
min( ii+iblock, M ) );
4223 const size_t iend( ( IsUpper_v<MT1> )
4224 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
// End of the SIMD-processed rows of this block (low bits of iend masked off when a
// remainder loop exists).
4227 const size_t ipos( remainder ? ( iend &
size_t(-
SIMDSIZE) ) : iend );
// First row of the block; for lower matrices derived from jj with its low bits masked.
4230 size_t i( ( IsLower_v<MT1> )
4231 ?(
max( ii, ( IsStrictlyLower_v<MT1> ? jj+1UL : jj ) &
size_t(-
SIMDSIZE) ) )
4236 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4238 for(
size_t j=jj; j<jend; ++j ) {
4239 const SIMDType x1(
set( x[j] ) );
4240 xmm1 += A.load(i ,j) * x1;
4241 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4242 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4243 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4244 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
4245 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
4246 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
4247 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
4250 y.store( i , y.load(i ) - xmm1*factor );
4262 SIMDType xmm1, xmm2, xmm3, xmm4;
4264 for(
size_t j=jj; j<jend; ++j ) {
4265 const SIMDType x1(
set( x[j] ) );
4266 xmm1 += A.load(i ,j) * x1;
4267 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4268 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4269 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4272 y.store( i , y.load(i ) - xmm1*factor );
4280 SIMDType xmm1, xmm2, xmm3;
4282 for(
size_t j=jj; j<jend; ++j ) {
4283 const SIMDType x1(
set( x[j] ) );
4284 xmm1 += A.load(i ,j) * x1;
4285 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4286 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4289 y.store( i , y.load(i ) - xmm1*factor );
4296 SIMDType xmm1, xmm2;
4298 for(
size_t j=jj; j<jend; ++j ) {
4299 const SIMDType x1(
set( x[j] ) );
4300 xmm1 += A.load(i ,j) * x1;
4304 y.store( i , y.load(i ) - xmm1*factor );
4312 for(
size_t j=jj; j<jend; ++j ) {
4313 xmm1 += A.load(i,j) *
set( x[j] );
4316 y.store( i, y.load(i) - xmm1*factor );
// Scalar tail: remaining rows of the block are processed element by element.
4319 for( ; remainder && i<iend; ++i )
4323 for(
size_t j=jj; j<jend; ++j ) {
4324 value += A(i,j) * x[j];
4327 y[i] -= value * scalar;
// BLAS subtraction assignment kernel, overload enabled when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is false: falls back to the large-operand
// subtraction assignment kernel.
4348 template<
typename VT1
4352 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4353 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4355 selectLargeSubAssignKernel( y, A, x, scalar );
// BLAS subtraction assignment kernel, overload enabled when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is true; only compiled when both BLAZE_BLAS_MODE
// and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
4360 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4374 template<
typename VT1
4378 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4379 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4381 using ET = ElementType_t<VT1>;
// Triangular matrices: a temporary holding scalar * x (evaluated via serial()) is
// passed to trmv together with A, and the result is then subtracted from y.
4383 if( IsTriangular_v<MT1> ) {
4384 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4385 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4386 subAssign( y, tmp );
// Otherwise (the `else` line is not present in this listing): gemv is called with the
// negated scalar and a one as trailing factors, both converted to the element type of y.
4389 gemv( y, A, x,
ET(-scalar),
ET(1) );
// Multiplication assignment of the scaled product expression to a dense vector target.
// The visible statement multiply-assigns an intermediate object `tmp` to the target;
// NOTE(review): the construction of `tmp` is not present in this listing.
4411 template<
typename VT1 >
4412 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4423 multAssign( ~lhs, tmp );
// Division assignment of the scaled product expression to a dense vector target.
// The visible statement divide-assigns an intermediate object `tmp` to the target;
// NOTE(review): the construction of `tmp` is not present in this listing.
4443 template<
typename VT1 >
4444 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4455 divAssign( ~lhs, tmp );
// Function template constrained by UseSMPAssign_v<VT1>.
// NOTE(review): the declarator line (function name and parameters) is missing from
// this listing; presumably the SMP assignment to a dense vector - confirm against the
// original header.
4477 template<
typename VT1 >
4479 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Fetch the matrix and vector operands of the wrapped multiplication expression.
4485 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4486 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// An empty row dimension and an empty column dimension are handled as two separate
// special cases (their bodies are not present in this listing).
4488 if( left.rows() == 0UL ) {
4491 else if( left.columns() == 0UL ) {
// Function template constrained by UseSMPAssign_v<VT1>.
// NOTE(review): the declarator and the body are missing from this listing; presumably
// the SMP assignment to a sparse vector - confirm against the original header.
4522 template<
typename VT1 >
4524 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Function template constrained by UseSMPAssign_v<VT1>.
// NOTE(review): the declarator line is missing from this listing; presumably the SMP
// addition assignment - confirm against the original header.
4553 template<
typename VT1 >
4555 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Fetch the matrix and vector operands of the wrapped multiplication expression.
4561 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4562 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns (body not in this listing).
4564 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Function template constrained by UseSMPAssign_v<VT1>.
// NOTE(review): the declarator line is missing from this listing; presumably the SMP
// subtraction assignment - confirm against the original header.
4598 template<
typename VT1 >
4600 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Fetch the matrix and vector operands of the wrapped multiplication expression.
4606 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4607 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows or no columns (body not in this listing).
4609 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Function template constrained by UseSMPAssign_v<VT1>.
// NOTE(review): the declarator and the body are missing from this listing; presumably
// the SMP multiplication assignment - confirm against the original header.
4644 template<
typename VT1 >
4646 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Function template constrained by UseSMPAssign_v<VT1>.
// NOTE(review): the declarator and the body are missing from this listing; presumably
// the SMP division assignment - confirm against the original header.
4679 template<
typename VT1 >
4681 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Free function template returning decltype(auto).
// NOTE(review): the remaining template parameters, the declarator and the body are
// missing from this listing; presumably the multiplication operator that creates the
// TDMatDVecMultExpr expression - confirm against the original header.
4754 template<
typename MT
4756 inline decltype(
auto)
// IsAligned specialization for TDMatDVecMultExpr: the expression counts as aligned
// exactly when both the matrix operand type and the vector operand type are aligned.
4783 template<
typename MT,
typename VT >
4784 struct IsAligned< TDMatDVecMultExpr<MT,VT> >
4785 :
public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:567
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:523
Header file for auxiliary alias declarations.
Header file for the generic min algorithm.
Header file for the blaze::checked and blaze::unchecked instances.
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:207
If_t< IsExpression_v< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:214
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:162
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:295
Header file for basic type definitions.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:204
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template. The If_t alias declaration provides a convenien...
Definition: If.h:109
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDVecMultExpr.h:232
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:533
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:166
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
Header file for the serial shim.
Header file for the IsDiagonal type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:428
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:513
Header file for the DenseVector base class.
If_t< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:220
If_t< useAssign, const ResultType, const DVecScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:169
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:138
Header file for the Computation base class.
Header file for the reset shim.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
static constexpr bool evaluateVector
Compilation switch for the composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:145
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:220
Expression object for transpose dense matrix-dense vector multiplications. The TDMatDVecMultExpr class...
Definition: Forward.h:158
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:433
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Constraint on the data type.
DenseVector< This, TF > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:160
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDVecMultExpr.h:225
Header file for the generic max algorithm.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:467
MultTrait_t< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:203
Header file for the DisableIf class template.
If_t< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:217
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_t< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:131
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:159
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:372
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:362
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:208
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1147
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:261
Header file for the HasSIMDAdd type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:340
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:318
Header file for all SIMD functionality.
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDVecMultExpr.h:238
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:107
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:205
Header file for the IsAligned type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:83
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:104
Constraint on the data type.
Header file for the exception macros of the math module.
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1179
If_t< IsExpression_v< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:172
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:584
Header file for all forward declarations for expression class templates.
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:211
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:557
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template. The MultTrait_t alias declaration provid...
Definition: MultTrait.h:240
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:585
System settings for the BLAS mode.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:454
Header file for the IsSIMDCombinable type trait.
Header file for the HasSIMDMult type trait.
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:577
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
Header file for the IsContiguous type trait.
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:308
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
ResultType_t< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:129
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:161
ElementType_t< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:130
Constraints on the storage order of matrix types.
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:109
Header file for BLAS general matrix/vector multiplication functions (gemv)
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:545
ResultType_t< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:128
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:442
CompositeType_t< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:132
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:328
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:352
Header file for the IsComplex type trait.
Constraint on the data type.
Header file for the complex data type.
Header file for the IsUpper type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:384
Header file for the MatVecMultExpr base class.
Constraint on the data type.
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:206
CompositeType_t< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:133
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:247
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:385
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:191
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:175
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:423