35#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
126 :
public MatVecMultExpr< DenseVector< TDMatDVecMultExpr<MT,VT>, false > >
// Compile-time flag expression: true when MT is a computation whose element type MET equals
// VET and is BLAS compatible, or when MT requires an evaluation.
// NOTE(review): the line naming this constant is not part of this excerpt; the identical
// initializer further down in this file belongs to `evaluateMatrix` - confirm.
142 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
143 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time flag: true when the vector operand type VT is a computation or requires an
// evaluation.
148 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
157 template<
typename T1 >
// Compile-time predicate over three types T1, T2, T3; it guards the BLAS kernel overloads
// declared in this file.
// Visible requirements: each type is contiguous and offers data access (mutable for T1, const
// for T2 and T3), each type is SIMD-enabled, and each element type is BLAS compatible.
// NOTE(review): only part of the expression is included in this excerpt.
167 template<
typename T1,
typename T2,
typename T3 >
168 static constexpr bool UseBlasKernel_v =
170 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
171 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
172 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
174 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
175 IsBLASCompatible_v< ElementType_t<T1> > &&
176 IsBLASCompatible_v< ElementType_t<T2> > &&
177 IsBLASCompatible_v< ElementType_t<T3> > &&
// Compile-time predicate over three types T1, T2, T3; it guards the vectorized default kernel
// overloads declared in this file.
// Visible requirements: `useOptimizedKernels` is set, all three types are SIMD-enabled, and an
// IsSIMDCombinable_v test on the element types holds.
// NOTE(review): the expression is cut off in this excerpt; further conditions may follow.
189 template<
typename T1,
typename T2,
typename T3 >
190 static constexpr bool UseVectorizedDefaultKernel_v =
191 ( useOptimizedKernels &&
193 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
194 IsSIMDCombinable_v< ElementType_t<T1>
// Compile-time condition: MT is not diagonal, both operand types are SIMD-enabled, and SIMD
// addition and multiplication are available for the element types MET and VET.
// NOTE(review): the name of the constant initialized here is outside this excerpt - presumably
// the SIMD flag of the expression; confirm.
233 ( !IsDiagonal_v<MT> &&
234 MT::simdEnabled && VT::simdEnabled &&
235 HasSIMDAdd_v<MET,VET> &&
236 HasSIMDMult_v<MET,VET> );
// Computes the result for position `index`, with one branch per matrix structure.
// NOTE(review): the enclosing signature is not part of this excerpt; the `(*this)[index]` call
// that follows suggests this is the subscript operator - confirm.
// Diagonal matrix: only the diagonal element of the matrix and the matching vector element
// are multiplied.
271 if( IsDiagonal_v<MT> )
273 return mat_(index,index) *
vec_[index];
// Lower matrix branch, taken only while index + 8 is below the number of rows.
275 else if( IsLower_v<MT> && ( index + 8UL <
mat_.rows() ) )
// n is `index` for strictly lower matrices and index+1 otherwise.
277 const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
// Upper matrix branch, taken only when index exceeds 8.
281 else if( IsUpper_v<MT> && ( index > 8UL ) )
// `begin` skips the diagonal element for strictly upper matrices; n spans from `begin` to the
// number of columns.
283 const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
284 const size_t n (
mat_.columns() -
begin );
// Range check of `index` against the number of rows of the matrix operand.
// NOTE(review): the body of this branch and the enclosing signature are not part of this
// excerpt - presumably the checked element access; confirm.
303 if( index >=
mat_.rows() ) {
// In-range indices are forwarded to the subscript operator.
306 return (*
this)[index];
// Size query of the expression; const, noexcept, yields a size_t.
// NOTE(review): the function body is not part of this excerpt.
315 inline size_t size() const noexcept {
// Reports whether the expression can alias with the address `alias`.
// Yields true if the matrix operand or the vector operand reports being aliased with `alias`;
// note that both operands are queried through isAliased().
346 template<
typename T >
347 inline bool canAlias(
const T* alias )
const noexcept {
348 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether the expression is aliased with the address `alias`.
// Yields true if the matrix operand or the vector operand reports being aliased with `alias`.
358 template<
typename T >
359 inline bool isAliased(
const T* alias )
const noexcept {
360 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// True only if both the matrix operand and the vector operand report being aligned.
// NOTE(review): the enclosing signature is not part of this excerpt - presumably the
// alignment query of the expression; confirm.
370 return mat_.isAligned() &&
vec_.isAligned();
// Tail of a boolean expression: the last operand of a parenthesized group tests whether
// rows*columns of the matrix operand is below TDMATDVECMULT_THRESHOLD; the group is combined
// (&&) with size() exceeding SMP_TDMATDVECMULT_THRESHOLD.
// NOTE(review): the beginning of the expression and the enclosing function are not part of
// this excerpt - presumably the SMP assignability query; confirm.
384 (
mat_.rows() *
mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
385 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
// Fragment of an assignment routine taking the expression as `rhs`.
// NOTE(review): the signature line, the bodies of the special-case branches, and the setup of
// A and x are not part of this excerpt.
408 template<
typename VT1 >
// Special case: the matrix operand has no rows.
415 if( rhs.
mat_.rows() == 0UL ) {
// Special case: the matrix operand has no columns, or it is strictly triangular with a single
// column.
418 else if( rhs.
mat_.columns() == 0UL ||
419 ( IsStrictlyTriangular_v<MT> && rhs.
mat_.columns() == 1UL ) ) {
// General case: hand the target vector and the operands A and x to the kernel selection.
432 TDMatDVecMultExpr::selectAssignKernel( *lhs, A, x );
// Kernel selection for the assignment of A*x to y.
// The visible test picks the small kernel when MT1 is diagonal or when rows*columns of A is
// below TDMATDVECMULT_THRESHOLD; the BLAS kernel selection is the other call visible here.
// NOTE(review): one operand of the condition and the branch keyword between the two calls are
// not part of this excerpt.
448 template<
typename VT1
451 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
453 if( ( IsDiagonal_v<MT1> ) ||
455 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
456 selectSmallAssignKernel( y, A, x );
458 selectBlasAssignKernel( y, A, x );
// Default (non-vectorized) kernel for the assignment y = A*x, processing A column by column.
// NOTE(review): several lines of the body (branch bodies, alternative operands of the ternary
// expressions, closing braces) are not part of this excerpt.
477 template<
typename VT1
480 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
482 const size_t M( A.rows() );
483 const size_t N( A.columns() );
// Strictly lower matrices get dedicated handling first (body not shown).
485 if( IsStrictlyLower_v<MT1> ) {
// Unless A is upper, y is initialized from column 0; the first row is skipped for strictly
// lower matrices.
489 if( !IsUpper_v<MT1> )
491 for(
size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<M; ++i ) {
492 y[i] = A(i,0UL) * x[0UL];
// Column loop; it starts at column 0 only for upper matrices that are not strictly upper,
// otherwise at column 1.
496 for(
size_t j=( IsUpper_v<MT1> && !IsStrictlyUpper_v<MT1> ? 0UL : 1UL ); j<N; ++j )
// Diagonal matrix: a single product per column.
498 if( IsDiagonal_v<MT1> )
500 y[j] = A(j,j) * x[j];
// Row range [ibegin,iend) of column j, restricted according to the lower/upper structure.
504 const size_t ibegin( ( IsLower_v<MT1> )
505 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
507 const size_t iend( ( IsUpper_v<MT1> )
508 ?( IsStrictlyUpper_v<MT1> ? j-1UL : j )
// ipos marks the end of the part handled two rows at a time (prevMultiple presumably rounds
// inum down to a multiple of 2 - confirm).
512 const size_t inum( iend - ibegin );
513 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Accumulate two rows per iteration.
516 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
517 y[i ] += A(i ,j) * x[j];
518 y[i+1UL] += A(i+1UL,j) * x[j];
// Single leftover row at ipos (its guarding condition is not shown).
521 y[ipos] += A(ipos,j) * x[j];
// Upper matrices: element iend is assigned (not accumulated) in this column.
523 if( IsUpper_v<MT1> ) {
524 y[iend] = A(iend,j) * x[j];
// Strictly upper matrices get dedicated handling at the end (body not shown).
529 if( IsStrictlyUpper_v<MT1> ) {
// Small-matrix assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the
// default assignment kernel.
550 template<
typename VT1
553 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
554 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
556 selectDefaultAssignKernel( y, A, x );
// Small-matrix assignment kernel, overload guarded by
// EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: SIMD implementation of y = A*x.
// The body consists of sections working on row offsets of SIMDSIZE*8, *4, *3, *2 and *1,
// followed by a scalar remainder loop.
// NOTE(review): loop headers over i, some accumulator lines, the stores to y and the closing
// braces are not part of this excerpt.
575 template<
typename VT1
578 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
579 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loop is only active if the matrix type or the target vector type is
// not padded.
581 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
583 const size_t M( A.rows() );
584 const size_t N( A.columns() );
// Section with row offsets up to SIMDSIZE*7.
// Column range: for upper matrices it starts at i (i+1 if strictly upper); for lower matrices
// it ends at min(i+SIMDSIZE*8, N), reduced by one if strictly lower.
593 const size_t jbegin( ( IsUpper_v<MT1> )
594 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
596 const size_t jend( ( IsLower_v<MT1> )
597 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
// The accumulators are initialized from column jbegin, so the column loop starts at jbegin+1.
602 SIMDType xmm1( A.load(i ,jbegin) * x1 );
611 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
613 xmm1 += A.load(i ,j) * x1;
615 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
616 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
617 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
618 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
619 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
620 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
// Section with row offsets up to SIMDSIZE*3; same column range scheme with SIMDSIZE*4.
635 const size_t jbegin( ( IsUpper_v<MT1> )
636 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
638 const size_t jend( ( IsLower_v<MT1> )
639 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
644 SIMDType xmm1( A.load(i ,jbegin) * x1 );
649 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
651 xmm1 += A.load(i ,j) * x1;
653 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
654 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
// Section with row offsets up to SIMDSIZE*2; same column range scheme with SIMDSIZE*3.
665 const size_t jbegin( ( IsUpper_v<MT1> )
666 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
668 const size_t jend( ( IsLower_v<MT1> )
669 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
674 SIMDType xmm1( A.load(i ,jbegin) * x1 );
678 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
680 xmm1 += A.load(i ,j) * x1;
682 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
// Section with the column range scheme based on SIMDSIZE*2.
692 const size_t jbegin( ( IsUpper_v<MT1> )
693 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
695 const size_t jend( ( IsLower_v<MT1> )
696 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
701 SIMDType xmm1( A.load(i ,jbegin) * x1 );
704 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
706 xmm1 += A.load(i ,j) * x1;
// Section with a single accumulator; the vector element is broadcast with set() inline.
716 const size_t jbegin( ( IsUpper_v<MT1> )
717 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
719 const size_t jend( ( IsLower_v<MT1> )
720 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
724 SIMDType xmm1( A.load(i,jbegin) *
set( x[jbegin] ) );
726 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
727 xmm1 += A.load(i,j) *
set( x[j] );
// Scalar remainder loop over the rows not covered above; only active if `remainder` is true.
733 for( ; remainder && i<M; ++i )
735 const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
736 const size_t jend( ( IsLower_v<MT1> )?(
min( i+1UL, N ) ):( N ) );
// Accumulation starts at jbegin+1 (the initialization of `value` is not shown).
741 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
742 value += A(i,j) * x[j];
// Large-matrix assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the
// default assignment kernel.
765 template<
typename VT1
768 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
769 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
771 selectDefaultAssignKernel( y, A, x );
// Large-matrix assignment kernel, overload guarded by
// EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: blocked SIMD implementation.
// A is traversed in row blocks of `iblock` and column blocks of `jblock`; each block result is
// added to the current content of y via y.store( i, y.load(i) + ... ).
// NOTE(review): the initial handling of y, inner loop headers over i, several accumulator and
// store lines, and the closing braces are not part of this excerpt.
790 template<
typename VT1
793 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
794 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loop is only active if the matrix type or the target vector type is
// not padded.
796 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
798 const size_t M( A.rows() );
799 const size_t N( A.columns() );
// Row block size: 32768 divided by the element size. Column block size: 8 if N is smaller
// than iblock, otherwise 4.
801 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
802 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// Outer loops over the row blocks (ii) and the column blocks (jj).
808 for(
size_t ii=0U; ii<M; ii+=iblock ) {
809 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block limits, clamped to the matrix dimensions; for upper matrices the row limit is
// additionally bounded by jend (jend-1 if strictly upper).
811 const size_t jend(
min( jj+jblock, N ) );
812 const size_t itmp(
min( ii+iblock, M ) );
813 const size_t iend( ( IsUpper_v<MT1> )
814 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
820 size_t i( ( IsLower_v<MT1> )
// Section with row offsets up to SIMDSIZE*7: accumulators start from column jj, the column
// loop continues at jj+1.
827 SIMDType xmm1( A.load(i ,jj) * x1 );
836 for(
size_t j=jj+1UL; j<jend; ++j ) {
838 xmm1 += A.load(i ,j) * x1;
840 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
841 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
842 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
843 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
844 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
845 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
848 y.store( i , y.load(i ) + xmm1 );
// Section with row offsets up to SIMDSIZE*3.
861 SIMDType xmm1( A.load(i ,jj) * x1 );
866 for(
size_t j=jj+1UL; j<jend; ++j ) {
868 xmm1 += A.load(i ,j) * x1;
870 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
871 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
874 y.store( i , y.load(i ) + xmm1 );
// Section with row offsets up to SIMDSIZE*2.
883 SIMDType xmm1( A.load(i ,jj) * x1 );
887 for(
size_t j=jj+1UL; j<jend; ++j ) {
889 xmm1 += A.load(i ,j) * x1;
891 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
894 y.store( i , y.load(i ) + xmm1 );
// Next smaller section.
902 SIMDType xmm1( A.load(i ,jj) * x1 );
905 for(
size_t j=jj+1UL; j<jend; ++j ) {
907 xmm1 += A.load(i ,j) * x1;
911 y.store( i , y.load(i ) + xmm1 );
// Section with a single accumulator; the vector element is broadcast with set() inline.
919 for(
size_t j=jj+1UL; j<jend; ++j ) {
920 xmm1 += A.load(i,j) *
set( x[j] );
923 y.store( i, y.load(i) + xmm1 );
// Scalar remainder loop over the rows of the block not covered above.
926 for( ; remainder && i<iend; ++i )
930 for(
size_t j=jj+1UL; j<jend; ++j ) {
931 value += A(i,j) * x[j];
// BLAS-based assignment kernel, overload guarded by
// DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the large-matrix
// assignment kernel.
956 template<
typename VT1
959 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
960 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
962 selectLargeAssignKernel( y, A, x );
968#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based assignment kernel, overload guarded by
// EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >.
// NOTE(review): lines of the body (including whatever prepares y before trmv, the branch
// keyword, and the closing braces) are not part of this excerpt.
982 template<
typename VT1
985 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
986 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// ET is the element type of the target vector.
988 using ET = ElementType_t<VT1>;
// Triangular matrices go through trmv with the lower/upper flag matching MT1.
990 if( IsTriangular_v<MT1> ) {
992 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// Otherwise gemv is called with the scalar arguments ET(1) and ET(0).
995 gemv( y, A, x, ET(1), ET(0) );
// Assignment of the expression to a sparse vector `lhs`.
// The visible statement assigns an object `tmp` to the target.
// NOTE(review): the creation of `tmp` is not part of this excerpt - presumably the evaluated
// expression; confirm.
1015 template<
typename VT1 >
1016 friend inline void assign( SparseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
1027 assign( *lhs, tmp );
// Addition assignment of the expression `rhs` to the dense vector `lhs`.
// NOTE(review): the body of the early-exit branch and the setup of A and x are not part of
// this excerpt.
1045 template<
typename VT1 >
1046 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
// Special case: the matrix operand has no rows or no columns, or it is strictly triangular
// with a single row.
1052 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1053 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// General case: hand the target vector and the operands A and x to the kernel selection.
1065 TDMatDVecMultExpr::selectAddAssignKernel( *lhs, A, x );
// Kernel selection for the addition assignment of A*x to y.
// The visible test picks the small kernel when MT1 is diagonal or when rows*columns of A is
// below TDMATDVECMULT_THRESHOLD; the BLAS kernel selection is the other call visible here.
// NOTE(review): one operand of the condition and the branch keyword between the two calls are
// not part of this excerpt.
1081 template<
typename VT1
1084 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1086 if( ( IsDiagonal_v<MT1> ) ||
1088 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1089 selectSmallAddAssignKernel( y, A, x );
1091 selectBlasAddAssignKernel( y, A, x );
// Default (non-vectorized) kernel for the addition assignment y += A*x, processing A column
// by column.
// NOTE(review): the alternative operands of the ternary expressions, the guard of the
// leftover row, and the closing braces are not part of this excerpt.
1110 template<
typename VT1
1113 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1115 const size_t M( A.rows() );
1116 const size_t N( A.columns() );
// Loop over all columns of A.
1118 for(
size_t j=0UL; j<N; ++j )
// Diagonal matrix: a single product per column.
1120 if( IsDiagonal_v<MT1> )
1122 y[j] += A(j,j) * x[j];
// Row range [ibegin,iend) of column j, restricted according to the lower/upper structure.
1126 const size_t ibegin( ( IsLower_v<MT1> )
1127 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1129 const size_t iend( ( IsUpper_v<MT1> )
1130 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
// ipos marks the end of the part handled two rows at a time (prevMultiple presumably rounds
// inum down to a multiple of 2 - confirm).
1134 const size_t inum( iend - ibegin );
1135 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Accumulate two rows per iteration.
1138 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1139 y[i ] += A(i ,j) * x[j];
1140 y[i+1UL] += A(i+1UL,j) * x[j];
// Single leftover row at ipos (its guarding condition is not shown).
1143 y[ipos] += A(ipos,j) * x[j];
// Small-matrix addition assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the
// default addition assignment kernel.
1165 template<
typename VT1
1168 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1169 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1171 selectDefaultAddAssignKernel( y, A, x );
// Small-matrix addition assignment kernel, overload guarded by
// EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: SIMD implementation.
// The body consists of sections working on row offsets of SIMDSIZE*8, *4, *3, *2 and *1,
// followed by a scalar remainder loop. In each section the column loop starts at jbegin and
// the accumulators are stored to y afterwards.
// NOTE(review): loop headers over i, the initialization of the accumulators, most stores and
// the closing braces are not part of this excerpt.
1190 template<
typename VT1
1193 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1194 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loop is only active if the matrix type or the target vector type is
// not padded.
1196 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1198 const size_t M( A.rows() );
1199 const size_t N( A.columns() );
// Section with row offsets up to SIMDSIZE*7.
// Column range: for upper matrices it starts at i (i+1 if strictly upper); for lower matrices
// it ends at min(i+SIMDSIZE*8, N), reduced by one if strictly lower.
1208 const size_t jbegin( ( IsUpper_v<MT1> )
1209 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1211 const size_t jend( ( IsLower_v<MT1> )
1212 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1225 for(
size_t j=jbegin; j<jend; ++j ) {
1227 xmm1 += A.load(i ,j) * x1;
1228 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1229 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1230 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1231 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
1232 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
1233 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
1234 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
1237 y.store( i , xmm1 );
// Section with row offsets up to SIMDSIZE*3; same column range scheme with SIMDSIZE*4.
1249 const size_t jbegin( ( IsUpper_v<MT1> )
1250 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1252 const size_t jend( ( IsLower_v<MT1> )
1253 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1262 for(
size_t j=jbegin; j<jend; ++j ) {
1264 xmm1 += A.load(i ,j) * x1;
1265 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1266 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1267 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1270 y.store( i , xmm1 );
// Section with row offsets up to SIMDSIZE*2; same column range scheme with SIMDSIZE*3.
1278 const size_t jbegin( ( IsUpper_v<MT1> )
1279 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1281 const size_t jend( ( IsLower_v<MT1> )
1282 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1290 for(
size_t j=jbegin; j<jend; ++j ) {
1292 xmm1 += A.load(i ,j) * x1;
1293 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1294 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1297 y.store( i , xmm1 );
// Section with the column range scheme based on SIMDSIZE*2.
1304 const size_t jbegin( ( IsUpper_v<MT1> )
1305 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1307 const size_t jend( ( IsLower_v<MT1> )
1308 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1315 for(
size_t j=jbegin; j<jend; ++j ) {
1317 xmm1 += A.load(i ,j) * x1;
1321 y.store( i , xmm1 );
// Section with a single accumulator; the vector element is broadcast with set() inline.
1327 const size_t jbegin( ( IsUpper_v<MT1> )
1328 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1330 const size_t jend( ( IsLower_v<MT1> )
1331 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1337 for(
size_t j=jbegin; j<jend; ++j ) {
1338 xmm1 += A.load(i,j) *
set( x[j] );
// Scalar remainder loop over the rows not covered above; only active if `remainder` is true.
1344 for( ; remainder && i<M; ++i )
1346 const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
1347 const size_t jend( ( IsLower_v<MT1> )?(
min( i+1UL, N ) ):( N ) );
// Accumulation starts at jbegin+1 (the initialization of `value` is not shown).
1352 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
1353 value += A(i,j) * x[j];
// Large-matrix addition assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the
// default addition assignment kernel.
1376 template<
typename VT1
1379 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1380 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1382 selectDefaultAddAssignKernel( y, A, x );
// Large-matrix addition assignment kernel, overload guarded by
// EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: blocked SIMD implementation.
// A is traversed in row blocks of `iblock` and column blocks of `jblock`; each block result is
// added to the current content of y via y.store( i, y.load(i) + ... ).
// NOTE(review): inner loop headers over i, several accumulator and store lines, and the
// closing braces are not part of this excerpt.
1401 template<
typename VT1
1404 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1405 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loop is only active if the matrix type or the target vector type is
// not padded.
1407 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1409 const size_t M( A.rows() );
1410 const size_t N( A.columns() );
// Row block size: 32768 divided by the element size. Column block size: 8 if N is smaller
// than iblock, otherwise 4.
1412 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
1413 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// Outer loops over the row blocks (ii) and the column blocks (jj).
1417 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1418 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block limits, clamped to the matrix dimensions; for upper matrices the row limit is
// additionally bounded by jend (jend-1 if strictly upper).
1420 const size_t jend(
min( jj+jblock, N ) );
1421 const size_t itmp(
min( ii+iblock, M ) );
1422 const size_t iend( ( IsUpper_v<MT1> )
1423 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
1429 size_t i( ( IsLower_v<MT1> )
// Section with row offsets up to SIMDSIZE*7: accumulators start from column jj, the column
// loop continues at jj+1.
1436 SIMDType xmm1( A.load(i ,jj) * x1 );
1445 for(
size_t j=jj+1UL; j<jend; ++j ) {
1447 xmm1 += A.load(i ,j) * x1;
1448 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1449 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1450 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1451 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
1452 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
1453 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
1454 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
1457 y.store( i , y.load(i ) + xmm1 );
// Section with row offsets up to SIMDSIZE*3.
1470 SIMDType xmm1( A.load(i ,jj) * x1 );
1475 for(
size_t j=jj+1UL; j<jend; ++j ) {
1477 xmm1 += A.load(i ,j) * x1;
1478 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1479 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1480 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
1483 y.store( i , y.load(i ) + xmm1 );
// Section with row offsets up to SIMDSIZE*2.
1492 SIMDType xmm1( A.load(i ,jj) * x1 );
1496 for(
size_t j=jj+1UL; j<jend; ++j ) {
1498 xmm1 += A.load(i ,j) * x1;
1499 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
1500 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
1503 y.store( i , y.load(i ) + xmm1 );
// Next smaller section.
1511 SIMDType xmm1( A.load(i ,jj) * x1 );
1514 for(
size_t j=jj+1UL; j<jend; ++j ) {
1516 xmm1 += A.load(i ,j) * x1;
1520 y.store( i , y.load(i ) + xmm1 );
// Section with a single accumulator; the vector element is broadcast with set() inline.
1528 for(
size_t j=jj+1UL; j<jend; ++j ) {
1529 xmm1 += A.load(i,j) *
set( x[j] );
1532 y.store( i, y.load(i) + xmm1 );
// Scalar remainder loop over the rows of the block not covered above.
1535 for( ; remainder && i<iend; ++i )
1539 for(
size_t j=jj+1UL; j<jend; ++j ) {
1540 value += A(i,j) * x[j];
// BLAS-based addition assignment kernel, overload guarded by
// DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the large-matrix
// addition assignment kernel.
1565 template<
typename VT1
1568 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1569 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1571 selectLargeAddAssignKernel( y, A, x );
1577#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based addition assignment kernel, overload guarded by
// EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >.
// NOTE(review): the branch keyword between the two cases and the closing braces are not part
// of this excerpt.
1591 template<
typename VT1
1594 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1595 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// ET is the element type of the target vector.
1597 using ET = ElementType_t<VT1>;
// Triangular matrices: a temporary is created from serial( x ), passed to trmv with the
// lower/upper flag matching MT1, and then added to y.
1599 if( IsTriangular_v<MT1> ) {
1600 ResultType_t<VT1> tmp(
serial( x ) );
1601 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1602 addAssign( y, tmp );
// Otherwise gemv is called with the scalar arguments ET(1) and ET(1).
1605 gemv( y, A, x, ET(1), ET(1) );
// Subtraction assignment of the expression `rhs` to the dense vector `lhs`.
// NOTE(review): the body of the early-exit branch and the setup of A and x are not part of
// this excerpt.
1629 template<
typename VT1 >
1630 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
// Special case: the matrix operand has no rows or no columns, or it is strictly triangular
// with a single row.
1636 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1637 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// General case: hand the target vector and the operands A and x to the kernel selection.
1649 TDMatDVecMultExpr::selectSubAssignKernel( *lhs, A, x );
// Kernel selection for the subtraction assignment of A*x to y.
// The visible test picks the small kernel when MT1 is diagonal or when rows*columns of A is
// below TDMATDVECMULT_THRESHOLD; the BLAS kernel selection is the other call visible here.
// NOTE(review): one operand of the condition and the branch keyword between the two calls are
// not part of this excerpt.
1665 template<
typename VT1
1668 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1670 if( ( IsDiagonal_v<MT1> ) ||
1672 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1673 selectSmallSubAssignKernel( y, A, x );
1675 selectBlasSubAssignKernel( y, A, x );
// Default (non-vectorized) kernel for the subtraction assignment y -= A*x, processing A
// column by column.
// NOTE(review): the alternative operands of the ternary expressions, the guard of the
// leftover row, and the closing braces are not part of this excerpt.
1694 template<
typename VT1
1697 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1699 const size_t M( A.rows() );
1700 const size_t N( A.columns() );
// Loop over all columns of A.
1702 for(
size_t j=0UL; j<N; ++j )
// Diagonal matrix: a single product per column.
1704 if( IsDiagonal_v<MT1> )
1706 y[j] -= A(j,j) * x[j];
// Row range [ibegin,iend) of column j, restricted according to the lower/upper structure.
1710 const size_t ibegin( ( IsLower_v<MT1> )
1711 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1713 const size_t iend( ( IsUpper_v<MT1> )
1714 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
// ipos marks the end of the part handled two rows at a time (prevMultiple presumably rounds
// inum down to a multiple of 2 - confirm).
1718 const size_t inum( iend - ibegin );
1719 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Subtract two rows per iteration.
1722 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1723 y[i ] -= A(i ,j) * x[j];
1724 y[i+1UL] -= A(i+1UL,j) * x[j];
// Single leftover row at ipos (its guarding condition is not shown).
1727 y[ipos] -= A(ipos,j) * x[j];
// Small-matrix subtraction assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the
// default subtraction assignment kernel.
1749 template<
typename VT1
1752 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1753 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1755 selectDefaultSubAssignKernel( y, A, x );
// Small-matrix subtraction assignment kernel, overload guarded by
// EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: SIMD implementation.
// The body consists of sections working on row offsets of SIMDSIZE*8, *4, *3, *2 and *1,
// followed by a scalar remainder loop. In each section the products are subtracted from the
// accumulators, which are stored to y afterwards.
// NOTE(review): loop headers over i, the initialization of the accumulators, most stores and
// the closing braces are not part of this excerpt.
1775 template<
typename VT1
1778 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1779 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loop is only active if the matrix type or the target vector type is
// not padded.
1781 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1783 const size_t M( A.rows() );
1784 const size_t N( A.columns() );
// Section with row offsets up to SIMDSIZE*7.
// Column range: for upper matrices it starts at i (i+1 if strictly upper); for lower matrices
// it ends at min(i+SIMDSIZE*8, N), reduced by one if strictly lower.
1793 const size_t jbegin( ( IsUpper_v<MT1> )
1794 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1796 const size_t jend( ( IsLower_v<MT1> )
1797 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1810 for(
size_t j=jbegin; j<jend; ++j ) {
1812 xmm1 -= A.load(i ,j) * x1;
1813 xmm2 -= A.load(i+
SIMDSIZE ,j) * x1;
1814 xmm3 -= A.load(i+
SIMDSIZE*2UL,j) * x1;
1815 xmm4 -= A.load(i+
SIMDSIZE*3UL,j) * x1;
1816 xmm5 -= A.load(i+
SIMDSIZE*4UL,j) * x1;
1817 xmm6 -= A.load(i+
SIMDSIZE*5UL,j) * x1;
1818 xmm7 -= A.load(i+
SIMDSIZE*6UL,j) * x1;
1819 xmm8 -= A.load(i+
SIMDSIZE*7UL,j) * x1;
1822 y.store( i , xmm1 );
// Section with row offsets up to SIMDSIZE*3; same column range scheme with SIMDSIZE*4.
1834 const size_t jbegin( ( IsUpper_v<MT1> )
1835 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1837 const size_t jend( ( IsLower_v<MT1> )
1838 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1847 for(
size_t j=jbegin; j<jend; ++j ) {
1849 xmm1 -= A.load(i ,j) * x1;
1850 xmm2 -= A.load(i+
SIMDSIZE ,j) * x1;
1851 xmm3 -= A.load(i+
SIMDSIZE*2UL,j) * x1;
1852 xmm4 -= A.load(i+
SIMDSIZE*3UL,j) * x1;
1855 y.store( i , xmm1 );
// Section with row offsets up to SIMDSIZE*2; same column range scheme with SIMDSIZE*3.
1863 const size_t jbegin( ( IsUpper_v<MT1> )
1864 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1866 const size_t jend( ( IsLower_v<MT1> )
1867 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1875 for(
size_t j=jbegin; j<jend; ++j ) {
1877 xmm1 -= A.load(i ,j) * x1;
1878 xmm2 -= A.load(i+
SIMDSIZE ,j) * x1;
1879 xmm3 -= A.load(i+
SIMDSIZE*2UL,j) * x1;
1882 y.store( i , xmm1 );
// Section with the column range scheme based on SIMDSIZE*2.
1889 const size_t jbegin( ( IsUpper_v<MT1> )
1890 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1892 const size_t jend( ( IsLower_v<MT1> )
1893 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1900 for(
size_t j=jbegin; j<jend; ++j ) {
1902 xmm1 -= A.load(i ,j) * x1;
1906 y.store( i , xmm1 );
// Section with a single accumulator; the vector element is broadcast with set() inline.
1912 const size_t jbegin( ( IsUpper_v<MT1> )
1913 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1915 const size_t jend( ( IsLower_v<MT1> )
1916 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
1922 for(
size_t j=jbegin; j<jend; ++j ) {
1923 xmm1 -= A.load(i,j) *
set( x[j] );
// Scalar remainder loop over the rows not covered above; only active if `remainder` is true.
1929 for( ; remainder && i<M; ++i )
1931 const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
1932 const size_t jend( ( IsLower_v<MT1> )?(
min( i+1UL, N ) ):( N ) );
// `value` accumulates the products starting at jbegin+1; its initialization and the final
// update of y[i] are not shown.
1937 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
1938 value += A(i,j) * x[j];
// Large-matrix subtraction assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the
// default subtraction assignment kernel.
1961 template<
typename VT1
1964 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1965 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1967 selectDefaultSubAssignKernel( y, A, x );
// Large-matrix subtraction assignment kernel, overload guarded by
// EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >: blocked SIMD implementation.
// A is traversed in row blocks of `iblock` and column blocks of `jblock`; the products of a
// block are summed in the accumulators and then subtracted from the current content of y via
// y.store( i, y.load(i) - ... ).
// NOTE(review): inner loop headers over i, several accumulator and store lines, and the
// closing braces are not part of this excerpt.
1987 template<
typename VT1
1990 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1991 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loop is only active if the matrix type or the target vector type is
// not padded.
1993 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
1995 const size_t M( A.rows() );
1996 const size_t N( A.columns() );
// Row block size: 32768 divided by the element size. Column block size: 8 if N is smaller
// than iblock, otherwise 4.
1998 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
1999 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
// Outer loops over the row blocks (ii) and the column blocks (jj).
2003 for(
size_t ii=0U; ii<M; ii+=iblock ) {
2004 for(
size_t jj=0UL; jj<N; jj+=jblock )
// Block limits, clamped to the matrix dimensions; for upper matrices the row limit is
// additionally bounded by jend (jend-1 if strictly upper).
2006 const size_t jend(
min( jj+jblock, N ) );
2007 const size_t itmp(
min( ii+iblock, M ) );
2008 const size_t iend( ( IsUpper_v<MT1> )
2009 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
2015 size_t i( ( IsLower_v<MT1> )
// Section with row offsets up to SIMDSIZE*7: accumulators start from column jj, the column
// loop continues at jj+1.
2022 SIMDType xmm1( A.load(i ,jj) * x1 );
2031 for(
size_t j=jj+1UL; j<jend; ++j ) {
2033 xmm1 += A.load(i ,j) * x1;
2034 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2035 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2036 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
2037 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
2038 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
2039 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
2040 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
2043 y.store( i , y.load(i ) - xmm1 );
// Section with row offsets up to SIMDSIZE*3.
2056 SIMDType xmm1( A.load(i ,jj) * x1 );
2061 for(
size_t j=jj+1UL; j<jend; ++j ) {
2063 xmm1 += A.load(i ,j) * x1;
2064 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2065 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2066 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
2069 y.store( i , y.load(i ) - xmm1 );
// Section with row offsets up to SIMDSIZE*2.
2078 SIMDType xmm1( A.load(i ,jj) * x1 );
2082 for(
size_t j=jj+1UL; j<jend; ++j ) {
2084 xmm1 += A.load(i ,j) * x1;
2085 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
2086 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
2089 y.store( i , y.load(i ) - xmm1 );
// Next smaller section.
2097 SIMDType xmm1( A.load(i ,jj) * x1 );
2100 for(
size_t j=jj+1UL; j<jend; ++j ) {
2102 xmm1 += A.load(i ,j) * x1;
2106 y.store( i , y.load(i ) - xmm1 );
// Section with a single accumulator; the vector element is broadcast with set() inline.
2114 for(
size_t j=jj+1UL; j<jend; ++j ) {
2115 xmm1 += A.load(i,j) *
set( x[j] );
2118 y.store( i, y.load(i) - xmm1 );
// Scalar remainder loop over the rows of the block not covered above.
2121 for( ; remainder && i<iend; ++i )
2125 for(
size_t j=jj+1UL; j<jend; ++j ) {
2126 value += A(i,j) * x[j];
// BLAS-based subtraction assignment kernel, overload guarded by
// DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >: it forwards y, A and x to the large-matrix
// subtraction assignment kernel.
2151 template<
typename VT1
2154 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2155 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2157 selectLargeSubAssignKernel( y, A, x );
2163#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based subtraction assignment kernel, overload guarded by
// EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >.
// NOTE(review): the branch keyword between the two cases and the closing braces are not part
// of this excerpt.
2177 template<
typename VT1
2180 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2181 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// ET is the element type of the target vector.
2183 using ET = ElementType_t<VT1>;
// Triangular matrices: a temporary is created from serial( x ), passed to trmv with the
// lower/upper flag matching MT1, and then subtracted from y.
2185 if( IsTriangular_v<MT1> ) {
2186 ResultType_t<VT1> tmp(
serial( x ) );
2187 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2188 subAssign( y, tmp );
// Otherwise gemv is called with the scalar arguments ET(-1) and ET(1).
2191 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication assignment of the expression `rhs` to the dense vector `lhs`.
// The visible statement multiplication-assigns an object `tmp` to the target.
// NOTE(review): the creation of `tmp` is not part of this excerpt - presumably the evaluated
// expression; confirm.
2215 template<
typename VT1 >
2216 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
2227 multAssign( *lhs, tmp );
// Division assignment of the expression `rhs` to the dense vector `lhs`.
// The visible statement division-assigns an object `tmp` to the target.
// NOTE(review): the creation of `tmp` is not part of this excerpt - presumably the evaluated
// expression; confirm.
2249 template<
typename VT1 >
2250 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
2261 divAssign( *lhs, tmp );
// Fragment of an overload constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the signature line and the branch bodies are not part of this excerpt -
// presumably the SMP variant of the dense assignment; confirm.
2285 template<
typename VT1 >
2287 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Special case: the matrix operand has no rows.
2293 if( rhs.mat_.rows() == 0UL ) {
// Special case: the matrix operand has no columns, or it is strictly triangular with a single
// column.
2296 else if( rhs.mat_.columns() == 0UL ||
2297 ( IsStrictlyTriangular_v<MT> && rhs.mat_.columns() == 1UL ) ) {
// Fragment of an overload constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the signature line and the body are not part of this excerpt.
2330 template<
typename VT1 >
2332 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Fragment of an overload constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the signature line and the body are not part of this excerpt - presumably an
// SMP compound assignment; confirm.
2363 template<
typename VT1 >
2365 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Special case: the matrix operand has no rows or no columns, or it is strictly triangular
// with a single row.
2371 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2372 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Fragment of an overload constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the signature line and the body are not part of this excerpt - presumably an
// SMP compound assignment; confirm.
2408 template<
typename VT1 >
2410 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Special case: the matrix operand has no rows or no columns, or it is strictly triangular
// with a single row.
2416 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2417 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Fragment of an overload constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the signature line and the body are not part of this excerpt.
2453 template<
typename VT1 >
2455 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Fragment of an overload constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the signature line and the body are not part of this excerpt.
2490 template<
typename VT1 >
2492 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Specialization of DVecScalarMultExpr for a TDMatDVecMultExpr<MT,VT> operand combined with a
// scalar of type ST. It derives publicly from VecScalarMultExpr< DenseVector<...> > and
// privately from Computation.
// NOTE(review): the template parameter list is shown only in part in this excerpt.
2542template<
typename MT
2545class DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >
2546 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false > >
2547 ,
private Computation
// Shorthand aliases: the wrapped expression type (MVM) and its result type (RES), followed by
// the result, element and composite types of the matrix (M*) and vector (V*) operands.
2551 using MVM = TDMatDVecMultExpr<MT,VT>;
2552 using RES = ResultType_t<MVM>;
2553 using MRT = ResultType_t<MT>;
2554 using VRT = ResultType_t<VT>;
2555 using MET = ElementType_t<MRT>;
2556 using VET = ElementType_t<VRT>;
2557 using MCT = CompositeType_t<MT>;
2558 using VCT = CompositeType_t<VT>;
// Compile-time flag: true when MT is a computation whose element type MET equals VET and is
// BLAS compatible, or when MT requires an evaluation.
2563 static constexpr bool evaluateMatrix =
2564 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2565 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time flag: true when the vector operand type VT is a computation or requires an
// evaluation.
2570 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
// Compile-time predicate: true when either operand needs an intermediate evaluation
// (evaluateMatrix or evaluateVector). The template parameter T1 does not appear in the
// expression.
2578 template<
typename T1 >
2579 static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
// Compile-time predicate over four types T1..T4.
// Visible requirements: T1, T2, T3 are contiguous with data access (mutable for T1, const for
// T2 and T3), T2 is not diagonal, all three are SIMD-enabled, their element types are BLAS
// compatible and identical, and the combination "builtin element type of T1 with a complex
// T4" is excluded.
// NOTE(review): the opening of the expression is not part of this excerpt.
2586 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2587 static constexpr bool UseBlasKernel_v =
2589 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2590 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2591 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2592 !IsDiagonal_v<T2> &&
2593 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2594 IsBLASCompatible_v< ElementType_t<T1> > &&
2595 IsBLASCompatible_v< ElementType_t<T2> > &&
2596 IsBLASCompatible_v< ElementType_t<T3> > &&
2597 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2598 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2599 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time predicate over four types T1..T4; it guards the vectorized default kernel
// overloads of this class.
// Visible requirements: `useOptimizedKernels` is set, T2 is not diagonal, T1, T2, T3 are
// SIMD-enabled, an IsSIMDCombinable_v test on the element types holds, and SIMD addition and
// multiplication are available for the element types of T2 and T3.
// NOTE(review): part of the IsSIMDCombinable_v argument list is not part of this excerpt.
2607 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2608 static constexpr bool UseVectorizedDefaultKernel_v =
2609 ( useOptimizedKernels &&
2610 !IsDiagonal_v<T2> &&
2611 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2612 IsSIMDCombinable_v< ElementType_t<T1>
2616 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2617 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Type of this DVecScalarMultExpr instance.
2623 using This = DVecScalarMultExpr<MVM,ST,false>;
// Base type built from VecScalarMultExpr and DenseVector<This,false>.
2626 using BaseType = VecScalarMultExpr< DenseVector<This,false> >;
// SIMD type obtained from SIMDTrait_t for ElementType.
2631 using SIMDType = SIMDTrait_t<ElementType>;
// Left-hand side operand type: the (const) matrix/vector multiplication expression.
2636 using LeftOperand =
const TDMatDVecMultExpr<MT,VT>;
// Operand types used for the computation: the const result type if the operand must be
// evaluated, otherwise its composite type.
2642 using LT = If_t< evaluateMatrix, const MRT, MCT >;
2645 using RT = If_t< evaluateVector, const VRT, VCT >;
// Compile-time condition: MT is not diagonal, both operand types are SIMD-enabled, the
// element types MET, VET and the scalar type ST are SIMD-combinable, and SIMD addition and
// multiplication are available for MET and VET.
// NOTE(review): the name of the constant initialized here is outside this excerpt -
// presumably the SIMD flag of the expression; confirm.
2651 ( !IsDiagonal_v<MT> &&
2652 MT::simdEnabled && VT::simdEnabled &&
2653 IsSIMDCombinable_v<MET,VET,ST> &&
2654 HasSIMDAdd_v<MET,VET> &&
2655 HasSIMDMult_v<MET,VET> );
// Compile-time condition: neither operand needs an evaluation and both operand types are SMP
// assignable.
// NOTE(review): the name of the constant initialized here is outside this excerpt -
// presumably the SMP flag of the expression; confirm.
2659 ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
// Range check of `index` against the size of the wrapped vector expression.
// NOTE(review): the body of this branch and the enclosing signature are not part of this
// excerpt - presumably the checked element access; confirm.
2699 if( index >=
vector_.size() ) {
// In-range indices are forwarded to the subscript operator.
2702 return (*
this)[index];
// Size query of the expression; const, yields a size_t.
// NOTE(review): the function body is not part of this excerpt.
2711 inline size_t size()
const {
// Reports whether the expression can alias with the address `alias` by forwarding the query
// to canAlias() of the wrapped vector expression.
2742 template<
typename T >
2743 inline bool canAlias(
const T* alias )
const {
2744 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the address `alias` by forwarding the query
// to isAliased() of the wrapped vector expression.
2754 template<
typename T >
2755 inline bool isAliased(
const T* alias )
const {
2756 return vector_.isAliased( alias );
// Fragment of a boolean query on the expression.
// NOTE(review): the enclosing signature and the start of the returned expression are not part
// of this excerpt - presumably the SMP assignability query; confirm.
// A is the left operand (the matrix) of the wrapped multiplication expression.
2776 LeftOperand_t<MVM> A(
vector_.leftOperand() );
// Visible operands of a parenthesized group: MT is a computation that is not evaluated, or
// rows*columns of A is below TDMATDVECMULT_THRESHOLD. The group is combined (&&) with size()
// exceeding SMP_TDMATDVECMULT_THRESHOLD.
2780 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2781 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2782 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
// Assignment of the scaled expression `rhs` to the dense vector `lhs`.
// NOTE(review): the bodies of the special-case branches and the setup of A and x are not part
// of this excerpt.
2804 template<
typename VT1 >
2805 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the wrapped matrix/vector multiplication.
2811 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
2812 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: the matrix operand has no rows.
2814 if( left.rows() == 0UL ) {
// Special case: the matrix operand has no columns, or it is strictly triangular with a single
// column.
2817 else if( left.columns() == 0UL ||
2818 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
// General case: hand the target vector, the operands A and x, and the scalar to the kernel
// selection.
2831 DVecScalarMultExpr::selectAssignKernel( *lhs, A, x, rhs.scalar_ );
// Kernel selection for the assignment of the scaled product of A and x to y.
// The small kernel is picked when MT1 is diagonal, when MT is a computation that is not
// evaluated, or when rows*columns of A is below TDMATDVECMULT_THRESHOLD; the BLAS kernel
// selection is the other call visible here.
// NOTE(review): the branch keyword between the two calls is not part of this excerpt.
2846 template<
typename VT1
2850 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2852 if( ( IsDiagonal_v<MT1> ) ||
2853 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2854 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2855 selectSmallAssignKernel( y, A, x, scalar );
2857 selectBlasAssignKernel( y, A, x, scalar );
// Default (non-vectorized) kernel for the scaled assignment, processing A column by column.
// In the visible code the scalar is applied directly only in the diagonal case; non-diagonal
// matrices get a separate pass over y at the end.
// NOTE(review): several lines of the body (branch bodies, alternative operands of the ternary
// expressions, the body of the final loop, closing braces) are not part of this excerpt.
2875 template<
typename VT1
2879 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2881 const size_t M( A.rows() );
2882 const size_t N( A.columns() );
// Strictly lower matrices get dedicated handling first (body not shown).
2884 if( IsStrictlyLower_v<MT1> ) {
// Unless A is upper, y is initialized from column 0; the first row is skipped for strictly
// lower matrices.
2888 if( !IsUpper_v<MT1> )
2890 for(
size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<M; ++i ) {
2891 y[i] = A(i,0UL) * x[0UL];
// Column loop; it starts at column 0 only for upper matrices that are not strictly upper,
// otherwise at column 1.
2895 for(
size_t j=( IsUpper_v<MT1> && !IsStrictlyUpper_v<MT1> ? 0UL : 1UL ); j<N; ++j )
// Diagonal matrix: a single product per column, scaled immediately.
2897 if( IsDiagonal_v<MT1> )
2899 y[j] = A(j,j) * x[j] * scalar;
// Row range [ibegin,iend) of column j, restricted according to the lower/upper structure.
2903 const size_t ibegin( ( IsLower_v<MT1> )
2904 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
2906 const size_t iend( ( IsUpper_v<MT1> )
2907 ?( IsStrictlyUpper_v<MT1> ? j-1UL : j )
// ipos marks the end of the part handled two rows at a time (prevMultiple presumably rounds
// inum down to a multiple of 2 - confirm).
2911 const size_t inum( iend - ibegin );
2912 const size_t ipos( ibegin +
prevMultiple( inum, 2UL ) );
// Accumulate two rows per iteration.
2915 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2916 y[i ] += A(i ,j) * x[j];
2917 y[i+1UL] += A(i+1UL,j) * x[j];
// Single leftover row at ipos (its guarding condition is not shown).
2920 y[ipos] += A(ipos,j) * x[j];
// Upper matrices: element iend is assigned (not accumulated) in this column.
2922 if( IsUpper_v<MT1> ) {
2923 y[iend] = A(iend,j) * x[j];
// Strictly upper matrices get dedicated handling afterwards (body not shown).
2928 if( IsStrictlyUpper_v<MT1> ) {
// Final pass for non-diagonal matrices over the rows from 1 (strictly lower) or 0 up to M-1
// (strictly upper) or M; presumably this applies the scalar - the loop body is not shown.
2932 if( !IsDiagonal_v<MT1> )
2934 const size_t iend( IsStrictlyUpper_v<MT1> ? M-1UL : M );
2935 for(
size_t i=( IsStrictlyLower_v<MT1> ? 1UL : 0UL ); i<iend; ++i ) {
// Small-matrix scaled assignment kernel, overload guarded by
// DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >: it forwards y, A, x and the
// scalar to the default assignment kernel.
2956 template<
typename VT1
2960 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2961 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
2963 selectDefaultAssignKernel( y, A, x, scalar );
2981 template<
typename VT1
2985 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2986 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
2988 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
2990 const size_t M( A.rows() );
2991 const size_t N( A.columns() );
2996 const SIMDType factor(
set( scalar ) );
3002 const size_t jbegin( ( IsUpper_v<MT1> )
3003 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3005 const size_t jend( ( IsLower_v<MT1> )
3006 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3010 SIMDType x1(
set( x[jbegin] ) );
3011 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3012 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
3013 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
3014 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jbegin) * x1 );
3015 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,jbegin) * x1 );
3016 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,jbegin) * x1 );
3017 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,jbegin) * x1 );
3018 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,jbegin) * x1 );
3020 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3022 xmm1 += A.load(i ,j) * x1;
3023 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3024 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3025 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3026 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3027 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3028 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3029 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
3032 y.store( i , xmm1*factor );
3033 y.store( i+
SIMDSIZE , xmm2*factor );
3034 y.store( i+
SIMDSIZE*2UL, xmm3*factor );
3035 y.store( i+
SIMDSIZE*3UL, xmm4*factor );
3036 y.store( i+
SIMDSIZE*4UL, xmm5*factor );
3037 y.store( i+
SIMDSIZE*5UL, xmm6*factor );
3038 y.store( i+
SIMDSIZE*6UL, xmm7*factor );
3039 y.store( i+
SIMDSIZE*7UL, xmm8*factor );
3044 const size_t jbegin( ( IsUpper_v<MT1> )
3045 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3047 const size_t jend( ( IsLower_v<MT1> )
3048 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3052 SIMDType x1(
set( x[jbegin] ) );
3053 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3054 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
3055 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
3056 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jbegin) * x1 );
3058 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3060 xmm1 += A.load(i ,j) * x1;
3061 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3062 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3063 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3066 y.store( i , xmm1*factor );
3067 y.store( i+
SIMDSIZE , xmm2*factor );
3068 y.store( i+
SIMDSIZE*2UL, xmm3*factor );
3069 y.store( i+
SIMDSIZE*3UL, xmm4*factor );
3074 const size_t jbegin( ( IsUpper_v<MT1> )
3075 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3077 const size_t jend( ( IsLower_v<MT1> )
3078 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3082 SIMDType x1(
set( x[jbegin] ) );
3083 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3084 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
3085 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
3087 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3089 xmm1 += A.load(i ,j) * x1;
3090 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3091 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3094 y.store( i , xmm1*factor );
3095 y.store( i+
SIMDSIZE , xmm2*factor );
3096 y.store( i+
SIMDSIZE*2UL, xmm3*factor );
3101 const size_t jbegin( ( IsUpper_v<MT1> )
3102 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3104 const size_t jend( ( IsLower_v<MT1> )
3105 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3109 SIMDType x1(
set( x[jbegin] ) );
3110 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3111 SIMDType xmm2( A.load(i+
SIMDSIZE,jbegin) * x1 );
3113 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3115 xmm1 += A.load(i ,j) * x1;
3119 y.store( i , xmm1*factor );
3120 y.store( i+
SIMDSIZE, xmm2*factor );
3125 const size_t jbegin( ( IsUpper_v<MT1> )
3126 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3128 const size_t jend( ( IsLower_v<MT1> )
3129 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3133 SIMDType xmm1( A.load(i,jbegin) *
set( x[jbegin] ) );
3135 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3136 xmm1 += A.load(i,j) *
set( x[j] );
3139 y.store( i, xmm1*factor );
3142 for( ; remainder && i<M; ++i )
3144 const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
3145 const size_t jend( ( IsLower_v<MT1> )?(
min( i+1UL, N ) ):( N ) );
3150 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3151 value += A(i,j) * x[j];
3154 y[i] = value * scalar;
3173 template<
typename VT1
3177 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3178 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3180 selectDefaultAssignKernel( y, A, x, scalar );
3198 template<
typename VT1
3202 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3203 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3205 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3207 const size_t M( A.rows() );
3208 const size_t N( A.columns() );
3210 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3211 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3215 const SIMDType factor(
set( scalar ) );
3219 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3220 for(
size_t jj=0UL; jj<N; jj+=jblock )
3222 const size_t jend(
min( jj+jblock, N ) );
3223 const size_t itmp(
min( ii+iblock, M ) );
3224 const size_t iend( ( IsUpper_v<MT1> )
3225 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
3231 size_t i( ( IsLower_v<MT1> )
3237 SIMDType x1(
set( x[jj] ) );
3238 SIMDType xmm1( A.load(i ,jj) * x1 );
3239 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
3240 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
3241 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jj) * x1 );
3242 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,jj) * x1 );
3243 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,jj) * x1 );
3244 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,jj) * x1 );
3245 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,jj) * x1 );
3247 for(
size_t j=jj+1UL; j<jend; ++j ) {
3249 xmm1 += A.load(i ,j) * x1;
3250 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3251 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3252 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3253 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3254 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3255 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3256 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
3259 y.store( i , y.load(i ) + xmm1*factor );
3271 SIMDType x1(
set( x[jj] ) );
3272 SIMDType xmm1( A.load(i ,jj) * x1 );
3273 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
3274 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
3275 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jj) * x1 );
3277 for(
size_t j=jj+1UL; j<jend; ++j ) {
3279 xmm1 += A.load(i ,j) * x1;
3280 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3281 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3282 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3285 y.store( i , y.load(i ) + xmm1*factor );
3293 SIMDType x1(
set( x[jj] ) );
3294 SIMDType xmm1( A.load(i ,jj) * x1 );
3295 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
3296 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
3298 for(
size_t j=jj+1UL; j<jend; ++j ) {
3300 xmm1 += A.load(i ,j) * x1;
3301 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3302 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3305 y.store( i , y.load(i ) + xmm1*factor );
3312 SIMDType x1(
set( x[jj] ) );
3313 SIMDType xmm1( A.load(i ,jj) * x1 );
3314 SIMDType xmm2( A.load(i+
SIMDSIZE,jj) * x1 );
3316 for(
size_t j=jj+1UL; j<jend; ++j ) {
3318 xmm1 += A.load(i ,j) * x1;
3322 y.store( i , y.load(i ) + xmm1*factor );
3328 SIMDType xmm1( A.load(i,jj) *
set( x[jj] ) );
3330 for(
size_t j=jj+1UL; j<jend; ++j ) {
3331 xmm1 += A.load(i,j) *
set( x[j] );
3334 y.store( i, y.load(i) + xmm1*factor );
3337 for( ; remainder && i<iend; ++i )
3341 for(
size_t j=jj+1UL; j<jend; ++j ) {
3342 value += A(i,j) * x[j];
3345 y[i] += value * scalar;
3366 template<
typename VT1
3370 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3371 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3373 selectLargeAssignKernel( y, A, x, scalar );
3378#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3392 template<
typename VT1
3396 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3397 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3399 using ET = ElementType_t<VT1>;
3401 if( IsTriangular_v<MT1> ) {
3402 assign( y, scalar * x );
3403 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3406 gemv( y, A, x,
ET(scalar),
ET(0) );
3424 template<
typename VT1 >
3425 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3436 assign( *lhs, tmp );
3452 template<
typename VT1 >
3453 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3459 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3460 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
3462 if( left.rows() == 0UL || left.columns() == 0UL ||
3463 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
3475 DVecScalarMultExpr::selectAddAssignKernel( *lhs, A, x, rhs.scalar_ );
3490 template<
typename VT1
3494 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3496 if( ( IsDiagonal_v<MT1> ) ||
3497 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3498 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3499 selectSmallAddAssignKernel( y, A, x, scalar );
3501 selectBlasAddAssignKernel( y, A, x, scalar );
3519 template<
typename VT1
3523 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3525 y.addAssign( A * x * scalar );
3543 template<
typename VT1
3547 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3548 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3550 selectDefaultAddAssignKernel( y, A, x, scalar );
3569 template<
typename VT1
3573 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3574 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3576 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3578 const size_t M( A.rows() );
3579 const size_t N( A.columns() );
3584 const SIMDType factor(
set( scalar ) );
3590 const size_t jbegin( ( IsUpper_v<MT1> )
3591 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3593 const size_t jend( ( IsLower_v<MT1> )
3594 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3598 SIMDType x1(
set( x[jbegin] ) );
3599 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3600 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
3601 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
3602 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jbegin) * x1 );
3603 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,jbegin) * x1 );
3604 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,jbegin) * x1 );
3605 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,jbegin) * x1 );
3606 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,jbegin) * x1 );
3608 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3610 xmm1 += A.load(i ,j) * x1;
3611 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3612 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3613 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3614 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3615 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3616 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3617 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
3620 y.store( i , y.load(i ) + xmm1*factor );
3632 const size_t jbegin( ( IsUpper_v<MT1> )
3633 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3635 const size_t jend( ( IsLower_v<MT1> )
3636 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3640 SIMDType x1(
set( x[jbegin] ) );
3641 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3642 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
3643 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
3644 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jbegin) * x1 );
3646 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3648 xmm1 += A.load(i ,j) * x1;
3649 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3650 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3651 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3654 y.store( i , y.load(i ) + xmm1*factor );
3662 const size_t jbegin( ( IsUpper_v<MT1> )
3663 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3665 const size_t jend( ( IsLower_v<MT1> )
3666 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3670 SIMDType x1(
set( x[jbegin] ) );
3671 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3672 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
3673 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
3675 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3677 xmm1 += A.load(i ,j) * x1;
3678 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3679 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3682 y.store( i , y.load(i ) + xmm1*factor );
3689 const size_t jbegin( ( IsUpper_v<MT1> )
3690 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3692 const size_t jend( ( IsLower_v<MT1> )
3693 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3697 SIMDType x1(
set( x[jbegin] ) );
3698 SIMDType xmm1( A.load(i ,jbegin) * x1 );
3699 SIMDType xmm2( A.load(i+
SIMDSIZE,jbegin) * x1 );
3701 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3703 xmm1 += A.load(i ,j) * x1;
3707 y.store( i , y.load(i ) + xmm1*factor );
3713 const size_t jbegin( ( IsUpper_v<MT1> )
3714 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
3716 const size_t jend( ( IsLower_v<MT1> )
3717 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
3721 SIMDType xmm1( A.load(i,jbegin) *
set( x[jbegin] ) );
3723 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3724 xmm1 += A.load(i,j) *
set( x[j] );
3727 y.store( i, y.load(i) + xmm1*factor );
3730 for( ; remainder && i<M; ++i )
3732 const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
3733 const size_t jend( ( IsLower_v<MT1> )?(
min( i+1UL, N ) ):( N ) );
3738 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
3739 value += A(i,j) * x[j];
3742 y[i] += value * scalar;
3761 template<
typename VT1
3765 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3766 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3768 selectDefaultAddAssignKernel( y, A, x, scalar );
3787 template<
typename VT1
3791 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3792 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3794 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
3796 const size_t M( A.rows() );
3797 const size_t N( A.columns() );
3799 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
3800 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3804 const SIMDType factor(
set( scalar ) );
3806 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3807 for(
size_t jj=0UL; jj<N; jj+=jblock )
3809 const size_t jend(
min( jj+jblock, N ) );
3810 const size_t itmp(
min( ii+iblock, M ) );
3811 const size_t iend( ( IsUpper_v<MT1> )
3812 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
3818 size_t i( ( IsLower_v<MT1> )
3824 SIMDType x1(
set( x[jj] ) );
3825 SIMDType xmm1( A.load(i ,jj) * x1 );
3826 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
3827 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
3828 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jj) * x1 );
3829 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,jj) * x1 );
3830 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,jj) * x1 );
3831 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,jj) * x1 );
3832 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,jj) * x1 );
3834 for(
size_t j=jj+1UL; j<jend; ++j ) {
3836 xmm1 += A.load(i ,j) * x1;
3837 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3838 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3839 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3840 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
3841 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
3842 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
3843 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
3846 y.store( i , y.load(i ) + xmm1*factor );
3858 SIMDType x1(
set( x[jj] ) );
3859 SIMDType xmm1( A.load(i ,jj) * x1 );
3860 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
3861 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
3862 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jj) * x1 );
3864 for(
size_t j=jj+1UL; j<jend; ++j ) {
3866 xmm1 += A.load(i ,j) * x1;
3867 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3868 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3869 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
3872 y.store( i , y.load(i ) + xmm1*factor );
3880 SIMDType x1(
set( x[jj] ) );
3881 SIMDType xmm1( A.load(i ,jj) * x1 );
3882 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
3883 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
3885 for(
size_t j=jj+1UL; j<jend; ++j ) {
3887 xmm1 += A.load(i ,j) * x1;
3888 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
3889 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
3892 y.store( i , y.load(i ) + xmm1*factor );
3899 SIMDType x1(
set( x[jj] ) );
3900 SIMDType xmm1( A.load(i ,jj) * x1 );
3901 SIMDType xmm2( A.load(i+
SIMDSIZE,jj) * x1 );
3903 for(
size_t j=jj+1UL; j<jend; ++j ) {
3905 xmm1 += A.load(i ,j) * x1;
3909 y.store( i , y.load(i ) + xmm1*factor );
3915 SIMDType xmm1( A.load(i,jj) *
set( x[jj] ) );
3917 for(
size_t j=jj+1UL; j<jend; ++j ) {
3918 xmm1 += A.load(i,j) *
set( x[j] );
3921 y.store( i, y.load(i) + xmm1*factor );
3924 for( ; remainder && i<iend; ++i )
3928 for(
size_t j=jj+1UL; j<jend; ++j ) {
3929 value += A(i,j) * x[j];
3932 y[i] += value * scalar;
3953 template<
typename VT1
3957 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3958 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3960 selectLargeAddAssignKernel( y, A, x, scalar );
3965#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3979 template<
typename VT1
3983 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3984 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3986 using ET = ElementType_t<VT1>;
3988 if( IsTriangular_v<MT1> ) {
3989 ResultType_t<VT1> tmp(
serial( scalar * x ) );
3990 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3991 addAssign( y, tmp );
3994 gemv( y, A, x,
ET(scalar),
ET(1) );
4016 template<
typename VT1 >
4017 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4023 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4024 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4026 if( left.rows() == 0UL || left.columns() == 0UL ||
4027 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
4039 DVecScalarMultExpr::selectSubAssignKernel( *lhs, A, x, rhs.scalar_ );
4054 template<
typename VT1
4058 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4060 if( ( IsDiagonal_v<MT1> ) ||
4061 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4062 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
4063 selectSmallSubAssignKernel( y, A, x, scalar );
4065 selectBlasSubAssignKernel( y, A, x, scalar );
4083 template<
typename VT1
4087 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4089 y.subAssign( A * x * scalar );
4107 template<
typename VT1
4111 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4112 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4114 selectDefaultSubAssignKernel( y, A, x, scalar );
4133 template<
typename VT1
4137 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4138 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4140 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
4142 const size_t M( A.rows() );
4143 const size_t N( A.columns() );
4148 const SIMDType factor(
set( scalar ) );
4154 const size_t jbegin( ( IsUpper_v<MT1> )
4155 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4157 const size_t jend( ( IsLower_v<MT1> )
4158 ?(
min( i+
SIMDSIZE*8UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4162 SIMDType x1(
set( x[jbegin] ) );
4163 SIMDType xmm1( A.load(i ,jbegin) * x1 );
4164 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
4165 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
4166 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jbegin) * x1 );
4167 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,jbegin) * x1 );
4168 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,jbegin) * x1 );
4169 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,jbegin) * x1 );
4170 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,jbegin) * x1 );
4172 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
4174 xmm1 += A.load(i ,j) * x1;
4175 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4176 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4177 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4178 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
4179 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
4180 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
4181 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
4184 y.store( i , y.load(i ) - xmm1*factor );
4196 const size_t jbegin( ( IsUpper_v<MT1> )
4197 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4199 const size_t jend( ( IsLower_v<MT1> )
4200 ?(
min( i+
SIMDSIZE*4UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4204 SIMDType x1(
set( x[jbegin] ) );
4205 SIMDType xmm1( A.load(i ,jbegin) * x1 );
4206 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
4207 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
4208 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jbegin) * x1 );
4210 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
4212 xmm1 += A.load(i ,j) * x1;
4213 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4214 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4215 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4218 y.store( i , y.load(i ) - xmm1*factor );
4226 const size_t jbegin( ( IsUpper_v<MT1> )
4227 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4229 const size_t jend( ( IsLower_v<MT1> )
4230 ?(
min( i+
SIMDSIZE*3UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4234 SIMDType x1(
set( x[jbegin] ) );
4235 SIMDType xmm1( A.load(i ,jbegin) * x1 );
4236 SIMDType xmm2( A.load(i+
SIMDSIZE ,jbegin) * x1 );
4237 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jbegin) * x1 );
4239 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
4241 xmm1 += A.load(i ,j) * x1;
4242 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4243 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4246 y.store( i , y.load(i ) - xmm1*factor );
4253 const size_t jbegin( ( IsUpper_v<MT1> )
4254 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4256 const size_t jend( ( IsLower_v<MT1> )
4257 ?(
min( i+
SIMDSIZE*2UL, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4261 SIMDType x1(
set( x[jbegin] ) );
4262 SIMDType xmm1( A.load(i ,jbegin) * x1 );
4263 SIMDType xmm2( A.load(i+
SIMDSIZE,jbegin) * x1 );
4265 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
4267 xmm1 += A.load(i ,j) * x1;
4271 y.store( i , y.load(i ) - xmm1*factor );
4277 const size_t jbegin( ( IsUpper_v<MT1> )
4278 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
4280 const size_t jend( ( IsLower_v<MT1> )
4281 ?(
min( i+
SIMDSIZE, N ) - ( IsStrictlyLower_v<MT1> ? 1UL : 0UL ) )
4285 SIMDType xmm1( A.load(i,jbegin) *
set( x[jbegin] ) );
4287 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
4288 xmm1 += A.load(i,j) *
set( x[j] );
4291 y.store( i, y.load(i) - xmm1*factor );
4294 for( ; remainder && i<M; ++i )
4296 const size_t jbegin( ( IsUpper_v<MT1> )?( i ):( 0UL ) );
4297 const size_t jend( ( IsLower_v<MT1> )?(
min( i+1UL, N ) ):( N ) );
4302 for(
size_t j=jbegin+1UL; j<jend; ++j ) {
4303 value += A(i,j) * x[j];
4306 y[i] -= value * scalar;
4325 template<
typename VT1
4329 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4330 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4332 selectDefaultSubAssignKernel( y, A, x, scalar );
4351 template<
typename VT1
4355 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4356 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4358 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT1> );
4360 const size_t M( A.rows() );
4361 const size_t N( A.columns() );
4363 const size_t iblock( 32768UL /
sizeof(
ElementType ) );
4364 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4368 const SIMDType factor(
set( scalar ) );
4370 for(
size_t ii=0U; ii<M; ii+=iblock ) {
4371 for(
size_t jj=0UL; jj<N; jj+=jblock )
4373 const size_t jend(
min( jj+jblock, N ) );
4374 const size_t itmp(
min( ii+iblock, M ) );
4375 const size_t iend( ( IsUpper_v<MT1> )
4376 ?(
min( itmp, ( IsStrictlyUpper_v<MT1> ? jend-1UL : jend ) ) )
4382 size_t i( ( IsLower_v<MT1> )
4388 SIMDType x1(
set( x[jj] ) );
4389 SIMDType xmm1( A.load(i ,jj) * x1 );
4390 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
4391 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
4392 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jj) * x1 );
4393 SIMDType xmm5( A.load(i+
SIMDSIZE*4UL,jj) * x1 );
4394 SIMDType xmm6( A.load(i+
SIMDSIZE*5UL,jj) * x1 );
4395 SIMDType xmm7( A.load(i+
SIMDSIZE*6UL,jj) * x1 );
4396 SIMDType xmm8( A.load(i+
SIMDSIZE*7UL,jj) * x1 );
4398 for(
size_t j=jj+1UL; j<jend; ++j ) {
4400 xmm1 += A.load(i ,j) * x1;
4401 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4402 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4403 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4404 xmm5 += A.load(i+
SIMDSIZE*4UL,j) * x1;
4405 xmm6 += A.load(i+
SIMDSIZE*5UL,j) * x1;
4406 xmm7 += A.load(i+
SIMDSIZE*6UL,j) * x1;
4407 xmm8 += A.load(i+
SIMDSIZE*7UL,j) * x1;
4410 y.store( i , y.load(i ) - xmm1*factor );
4422 SIMDType x1(
set( x[jj] ) );
4423 SIMDType xmm1( A.load(i ,jj) * x1 );
4424 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
4425 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
4426 SIMDType xmm4( A.load(i+
SIMDSIZE*3UL,jj) * x1 );
4428 for(
size_t j=jj+1UL; j<jend; ++j ) {
4430 xmm1 += A.load(i ,j) * x1;
4431 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4432 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4433 xmm4 += A.load(i+
SIMDSIZE*3UL,j) * x1;
4436 y.store( i , y.load(i ) - xmm1*factor );
4444 SIMDType x1(
set( x[jj] ) );
4445 SIMDType xmm1( A.load(i ,jj) * x1 );
4446 SIMDType xmm2( A.load(i+
SIMDSIZE ,jj) * x1 );
4447 SIMDType xmm3( A.load(i+
SIMDSIZE*2UL,jj) * x1 );
4449 for(
size_t j=jj+1UL; j<jend; ++j ) {
4451 xmm1 += A.load(i ,j) * x1;
4452 xmm2 += A.load(i+
SIMDSIZE ,j) * x1;
4453 xmm3 += A.load(i+
SIMDSIZE*2UL,j) * x1;
4456 y.store( i , y.load(i ) - xmm1*factor );
4463 SIMDType x1(
set( x[jj] ) );
4464 SIMDType xmm1( A.load(i ,jj) * x1 );
4465 SIMDType xmm2( A.load(i+
SIMDSIZE,jj) * x1 );
4467 for(
size_t j=jj+1UL; j<jend; ++j ) {
4469 xmm1 += A.load(i ,j) * x1;
4473 y.store( i , y.load(i ) - xmm1*factor );
4479 SIMDType xmm1( A.load(i,jj) *
set( x[jj] ) );
4481 for(
size_t j=jj+1UL; j<jend; ++j ) {
4482 xmm1 += A.load(i,j) *
set( x[j] );
4485 y.store( i, y.load(i) - xmm1*factor );
4488 for( ; remainder && i<iend; ++i )
4492 for(
size_t j=jj+1UL; j<jend; ++j ) {
4493 value += A(i,j) * x[j];
4496 y[i] -= value * scalar;
4517 template<
typename VT1
4521 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4522 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4524 selectLargeSubAssignKernel( y, A, x, scalar );
4529#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4543 template<
typename VT1
4547 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4548 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4550 using ET = ElementType_t<VT1>;
4552 if( IsTriangular_v<MT1> ) {
4553 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4554 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4555 subAssign( y, tmp );
4558 gemv( y, A, x,
ET(-scalar),
ET(1) );
4580 template<
typename VT1 >
4581 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4592 multAssign( *lhs, tmp );
4612 template<
typename VT1 >
4613 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4624 divAssign( *lhs, tmp );
4646 template<
typename VT1 >
4648 -> EnableIf_t< UseSMPAssign_v<VT1> >
4654 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4655 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4657 if( left.rows() == 0UL ) {
4660 else if( left.columns() == 0UL ||
4661 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
4692 template<
typename VT1 >
4694 -> EnableIf_t< UseSMPAssign_v<VT1> >
4723 template<
typename VT1 >
4725 -> EnableIf_t< UseSMPAssign_v<VT1> >
4731 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4732 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4734 if( left.rows() == 0UL || left.columns() == 0UL ||
4735 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
4769 template<
typename VT1 >
4771 -> EnableIf_t< UseSMPAssign_v<VT1> >
4777 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4778 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4780 if( left.rows() == 0UL || left.columns() == 0UL ||
4781 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
4816 template<
typename VT1 >
4818 -> EnableIf_t< UseSMPAssign_v<VT1> >
4851 template<
typename VT1 >
4853 -> EnableIf_t< UseSMPAssign_v<VT1> >
4926template<
typename MT
4928inline decltype(
auto)
4935 if( (*mat).columns() != (*vec).size() ) {
4940 return ReturnType( *mat, *vec );
4955template<
typename MT,
typename VT >
4956struct IsAligned< TDMatDVecMultExpr<MT,VT> >
4957 :
public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Constraint on the transpose flag of vector types.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Header file for all SIMD functionality.
Constraint on the data type.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense matrix-dense vector multiplications.
Definition: TDMatDVecMultExpr.h:128
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:391
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDMatDVecMultExpr.h:239
ElementType_t< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:134
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:369
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDMatDVecMultExpr.h:245
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:379
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:254
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:213
static constexpr bool evaluateVector
Compilation switch for the composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:148
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDMatDVecMultExpr.h:232
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:268
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:218
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:302
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:215
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:347
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:325
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:212
If_t< IsExpression_v< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:221
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:335
ResultType_t< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:132
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:359
CompositeType_t< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:135
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:392
MultTrait_t< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:210
If_t< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:224
If_t< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:227
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:141
ResultType_t< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:131
CompositeType_t< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:136
ElementType_t< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:133
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:214
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:211
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:315
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the MatVecMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv).
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatVecMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.
Definition: ColumnVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all matrix/vector multiplication expression templates.
Definition: MatVecMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.