35#ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
127 :
public TVecMatMultExpr< DenseVector< TDVecDMatMultExpr<VT,MT>, true > >
142 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
148 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
149 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
158 template<
typename T1 >
168 template<
typename T1,
typename T2,
typename T3 >
169 static constexpr bool UseBlasKernel_v =
171 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
172 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
173 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
175 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
176 IsBLASCompatible_v< ElementType_t<T1> > &&
177 IsBLASCompatible_v< ElementType_t<T2> > &&
178 IsBLASCompatible_v< ElementType_t<T3> > &&
190 template<
typename T1,
typename T2,
typename T3 >
191 static constexpr bool UseVectorizedDefaultKernel_v =
192 ( useOptimizedKernels &&
194 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
195 IsSIMDCombinable_v< ElementType_t<T1>
234 ( !IsDiagonal_v<MT> &&
235 VT::simdEnabled && MT::simdEnabled &&
236 HasSIMDAdd_v<VET,MET> &&
237 HasSIMDMult_v<VET,MET> );
272 if( IsDiagonal_v<MT> )
274 return vec_[index] *
mat_(index,index);
276 else if( IsLower_v<MT> && ( index > 8UL ) )
278 const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
283 else if( IsUpper_v<MT> && ( index + 8UL <
mat_.rows() ) )
285 const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
304 if( index >=
mat_.columns() ) {
307 return (*
this)[index];
316 inline size_t size() const noexcept {
317 return mat_.columns();
347 template<
typename T >
348 inline bool canAlias(
const T* alias )
const noexcept {
349 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
359 template<
typename T >
360 inline bool isAliased(
const T* alias )
const noexcept {
361 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
371 return vec_.isAligned() &&
mat_.isAligned();
385 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
386 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
409 template<
typename VT1 >
416 if( rhs.
mat_.rows() == 0UL ||
417 ( IsStrictlyTriangular_v<MT> && rhs.
mat_.rows() == 1UL ) ) {
421 else if( rhs.
mat_.columns() == 0UL ) {
433 TDVecDMatMultExpr::selectAssignKernel( *lhs, x, A );
449 template<
typename VT1
452 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
454 if( ( IsDiagonal_v<MT1> ) ||
456 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
457 selectSmallAssignKernel( y, x, A );
459 selectBlasAssignKernel( y, x, A );
478 template<
typename VT1
481 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
483 const size_t M( A.rows() );
484 const size_t N( A.columns() );
486 if( IsStrictlyUpper_v<MT1> ) {
490 if( !IsLower_v<MT1> )
492 const size_t jbegin( IsStrictlyUpper_v<MT1> ? 1UL : 0UL );
493 for(
size_t j=jbegin; j<N; ++j ) {
494 y[j] = x[0UL] * A(0UL,j);
498 for(
size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
500 if( IsDiagonal_v<MT1> )
502 y[i] = x[i] * A(i,i);
506 const size_t jbegin( ( IsUpper_v<MT1> )
507 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
509 const size_t jend( ( IsLower_v<MT1> )
510 ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
514 const size_t jnum( jend - jbegin );
515 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
518 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
519 y[j ] += x[i] * A(i,j );
520 y[j+1UL] += x[i] * A(i,j+1UL);
523 y[jpos] += x[i] * A(i,jpos);
525 if( IsLower_v<MT1> ) {
526 y[jend] = x[i] * A(i,jend);
531 if( IsStrictlyLower_v<MT1> ) {
552 template<
typename VT1
555 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
556 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
558 selectDefaultAssignKernel( y, x, A );
577 template<
typename VT1
580 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
581 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
583 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
585 const size_t M( A.rows() );
586 const size_t N( A.columns() );
595 const size_t ibegin( ( IsLower_v<MT1> )
596 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
598 const size_t iend( ( IsUpper_v<MT1> )
599 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
604 SIMDType xmm1( x1 * A.load(ibegin,j ) );
613 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
615 xmm1 += x1 * A.load(i,j );
617 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
618 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
619 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
620 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
621 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
622 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
637 const size_t ibegin( ( IsLower_v<MT1> )
638 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
640 const size_t iend( ( IsUpper_v<MT1> )
641 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
646 SIMDType xmm1( x1 * A.load(ibegin,j ) );
651 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
653 xmm1 += x1 * A.load(i,j );
655 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
656 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
667 const size_t ibegin( ( IsLower_v<MT1> )
668 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
670 const size_t iend( ( IsUpper_v<MT1> )
671 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
676 SIMDType xmm1( x1 * A.load(ibegin,j ) );
680 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
682 xmm1 += x1 * A.load(i,j );
684 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
694 const size_t ibegin( ( IsLower_v<MT1> )
695 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
697 const size_t iend( ( IsUpper_v<MT1> )
698 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
703 SIMDType xmm1( x1 * A.load(ibegin,j ) );
706 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
708 xmm1 += x1 * A.load(i,j );
718 const size_t ibegin( ( IsLower_v<MT1> )
719 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
721 const size_t iend( ( IsUpper_v<MT1> )
722 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
726 SIMDType xmm1(
set( x[ibegin] ) * A.load(ibegin,j) );
728 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
729 xmm1 +=
set( x[i] ) * A.load(i,j);
735 for( ; remainder && j<N; ++j )
737 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
738 const size_t iend( ( IsUpper_v<MT1> )?(
min( j+1UL, M ) ):( M ) );
743 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
744 value += x[i] * A(i,j);
767 template<
typename VT1
770 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
771 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
773 selectDefaultAssignKernel( y, x, A );
792 template<
typename VT1
795 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
796 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
798 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
800 const size_t M( A.rows() );
801 const size_t N( A.columns() );
803 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
804 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
810 for(
size_t jj=0U; jj<N; jj+=jblock ) {
811 for(
size_t ii=0UL; ii<M; ii+=iblock )
813 const size_t iend(
min( ii+iblock, M ) );
814 const size_t jtmp(
min( jj+jblock, N ) );
815 const size_t jend( ( IsLower_v<MT1> )
816 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
822 size_t j( ( IsUpper_v<MT1> )
829 SIMDType xmm1( x1 * A.load(ii,j ) );
838 for(
size_t i=ii+1UL; i<iend; ++i ) {
840 xmm1 += x1 * A.load(i,j );
842 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
843 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
844 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
845 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
846 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
847 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
850 y.store( j , y.load(j ) + xmm1 );
863 SIMDType xmm1( x1 * A.load(ii,j ) );
868 for(
size_t i=ii+1UL; i<iend; ++i ) {
870 xmm1 += x1 * A.load(i,j );
872 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
873 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
876 y.store( j , y.load(j ) + xmm1 );
885 SIMDType xmm1( x1 * A.load(ii,j ) );
889 for(
size_t i=ii+1UL; i<iend; ++i ) {
891 xmm1 += x1 * A.load(i,j );
893 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
896 y.store( j , y.load(j ) + xmm1 );
904 SIMDType xmm1( x1 * A.load(ii,j ) );
907 for(
size_t i=ii+1UL; i<iend; ++i ) {
909 xmm1 += x1 * A.load(i,j );
913 y.store( j , y.load(j ) + xmm1 );
921 for(
size_t i=ii+1UL; i<iend; ++i ) {
922 xmm1 +=
set( x[i] ) * A.load(i,j);
925 y.store( j, y.load(j) + xmm1 );
928 for( ; remainder && j<jend; ++j )
932 for(
size_t i=ii+1UL; i<iend; ++i ) {
933 value += x[i] * A(i,j);
958 template<
typename VT1
961 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
962 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
964 selectLargeAssignKernel( y, x, A );
970#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
984 template<
typename VT1
987 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
988 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
990 using ET = ElementType_t<VT1>;
992 if( IsTriangular_v<MT1> ) {
994 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
997 gemv( y, x, A, ET(1), ET(0) );
1017 template<
typename VT1 >
1018 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1029 assign( *lhs, tmp );
1047 template<
typename VT1 >
1048 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1054 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1055 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
1067 TDVecDMatMultExpr::selectAddAssignKernel( *lhs, x, A );
1083 template<
typename VT1
1086 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1088 if( ( IsDiagonal_v<MT1> ) ||
1090 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1091 selectSmallAddAssignKernel( y, x, A );
1093 selectBlasAddAssignKernel( y, x, A );
1112 template<
typename VT1
1115 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1117 const size_t M( A.rows() );
1118 const size_t N( A.columns() );
1120 for(
size_t i=0UL; i<M; ++i )
1122 if( IsDiagonal_v<MT1> )
1124 y[i] += x[i] * A(i,i);
1128 const size_t jbegin( ( IsUpper_v<MT1> )
1129 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1131 const size_t jend( ( IsLower_v<MT1> )
1132 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1136 const size_t jnum( jend - jbegin );
1137 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
1140 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1141 y[j ] += x[i] * A(i,j );
1142 y[j+1UL] += x[i] * A(i,j+1UL);
1145 y[jpos] += x[i] * A(i,jpos);
1167 template<
typename VT1
1170 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1171 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1173 selectDefaultAddAssignKernel( y, x, A );
1192 template<
typename VT1
1195 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1196 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1198 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1200 const size_t M( A.rows() );
1201 const size_t N( A.columns() );
1210 const size_t ibegin( ( IsLower_v<MT1> )
1211 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1213 const size_t iend( ( IsUpper_v<MT1> )
1214 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1227 for(
size_t i=ibegin; i<iend; ++i ) {
1229 xmm1 += x1 * A.load(i,j );
1230 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1231 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1232 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1233 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1234 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1235 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1236 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1239 y.store( j , xmm1 );
1251 const size_t ibegin( ( IsLower_v<MT1> )
1252 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1254 const size_t iend( ( IsUpper_v<MT1> )
1255 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1264 for(
size_t i=ibegin; i<iend; ++i ) {
1266 xmm1 += x1 * A.load(i,j );
1267 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1268 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1269 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1272 y.store( j , xmm1 );
1280 const size_t ibegin( ( IsLower_v<MT1> )
1281 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1283 const size_t iend( ( IsUpper_v<MT1> )
1284 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1292 for(
size_t i=ibegin; i<iend; ++i ) {
1294 xmm1 += x1 * A.load(i,j );
1295 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1296 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1299 y.store( j , xmm1 );
1306 const size_t ibegin( ( IsLower_v<MT1> )
1307 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1309 const size_t iend( ( IsUpper_v<MT1> )
1310 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1317 for(
size_t i=ibegin; i<iend; ++i ) {
1319 xmm1 += x1 * A.load(i,j );
1323 y.store( j , xmm1 );
1329 const size_t ibegin( ( IsLower_v<MT1> )
1330 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1332 const size_t iend( ( IsUpper_v<MT1> )
1333 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1339 for(
size_t i=ibegin; i<iend; ++i ) {
1340 xmm1 +=
set( x[i] ) * A.load(i,j);
1346 for( ; remainder && j<N; ++j )
1348 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
1349 const size_t iend( ( IsUpper_v<MT1> )?(
min( j+1UL, M ) ):( M ) );
1354 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
1355 value += x[i] * A(i,j);
1378 template<
typename VT1
1381 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1382 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1384 selectDefaultAddAssignKernel( y, x, A );
1403 template<
typename VT1
1406 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1407 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1409 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1411 const size_t M( A.rows() );
1412 const size_t N( A.columns() );
1414 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
1415 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1419 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1420 for(
size_t ii=0UL; ii<M; ii+=iblock )
1422 const size_t iend(
min( ii+iblock, M ) );
1423 const size_t jtmp(
min( jj+jblock, N ) );
1424 const size_t jend( ( IsLower_v<MT1> )
1425 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
1431 size_t j( ( IsUpper_v<MT1> )
1438 SIMDType xmm1( x1 * A.load(ii,j ) );
1447 for(
size_t i=ii+1UL; i<iend; ++i ) {
1449 xmm1 += x1 * A.load(i,j );
1450 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1451 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1452 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1453 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
1454 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
1455 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
1456 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
1459 y.store( j , y.load(j ) + xmm1 );
1472 SIMDType xmm1( x1 * A.load(ii,j ) );
1477 for(
size_t i=ii+1UL; i<iend; ++i ) {
1479 xmm1 += x1 * A.load(i,j );
1480 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1481 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1482 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
1485 y.store( j , y.load(j ) + xmm1 );
1494 SIMDType xmm1( x1 * A.load(ii,j ) );
1498 for(
size_t i=ii+1UL; i<iend; ++i ) {
1500 xmm1 += x1 * A.load(i,j );
1501 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
1502 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
1505 y.store( j , y.load(j ) + xmm1 );
1513 SIMDType xmm1( x1 * A.load(ii,j ) );
1516 for(
size_t i=ii+1UL; i<iend; ++i ) {
1518 xmm1 += x1 * A.load(i,j );
1522 y.store( j , y.load(j ) + xmm1 );
1530 for(
size_t i=ii+1UL; i<iend; ++i ) {
1531 xmm1 +=
set( x[i] ) * A.load(i,j);
1534 y.store( j, y.load(j) + xmm1 );
1537 for( ; remainder && j<jend; ++j )
1541 for(
size_t i=ii+1UL; i<iend; ++i ) {
1542 value += x[i] * A(i,j);
1567 template<
typename VT1
1570 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1571 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1573 selectLargeAddAssignKernel( y, x, A );
1579#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1593 template<
typename VT1
1596 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1597 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1599 using ET = ElementType_t<VT1>;
1601 if( IsTriangular_v<MT1> ) {
1602 ResultType_t<VT1> tmp(
serial( x ) );
1603 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1604 addAssign( y, tmp );
1607 gemv( y, x, A, ET(1), ET(1) );
1631 template<
typename VT1 >
1632 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1638 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1639 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
1651 TDVecDMatMultExpr::selectSubAssignKernel( *lhs, x, A );
1667 template<
typename VT1
1670 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1672 if( ( IsDiagonal_v<MT1> ) ||
1674 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1675 selectSmallSubAssignKernel( y, x, A );
1677 selectBlasSubAssignKernel( y, x, A );
1696 template<
typename VT1
1699 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1701 const size_t M( A.rows() );
1702 const size_t N( A.columns() );
1704 for(
size_t i=0UL; i<M; ++i )
1706 if( IsDiagonal_v<MT1> )
1708 y[i] -= x[i] * A(i,i);
1712 const size_t jbegin( ( IsUpper_v<MT1> )
1713 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
1715 const size_t jend( ( IsLower_v<MT1> )
1716 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1720 const size_t jnum( jend - jbegin );
1721 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
1724 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1725 y[j ] -= x[i] * A(i,j );
1726 y[j+1UL] -= x[i] * A(i,j+1UL);
1729 y[jpos] -= x[i] * A(i,jpos);
1751 template<
typename VT1
1754 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1755 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1757 selectDefaultSubAssignKernel( y, x, A );
1777 template<
typename VT1
1780 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1781 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1783 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1785 const size_t M( A.rows() );
1786 const size_t N( A.columns() );
1795 const size_t ibegin( ( IsLower_v<MT1> )
1796 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1798 const size_t iend( ( IsUpper_v<MT1> )
1799 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1812 for(
size_t i=ibegin; i<iend; ++i ) {
1814 xmm1 -= x1 * A.load(i,j );
1815 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1816 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1817 xmm4 -= x1 * A.load(i,j+
SIMDSIZE*3UL);
1818 xmm5 -= x1 * A.load(i,j+
SIMDSIZE*4UL);
1819 xmm6 -= x1 * A.load(i,j+
SIMDSIZE*5UL);
1820 xmm7 -= x1 * A.load(i,j+
SIMDSIZE*6UL);
1821 xmm8 -= x1 * A.load(i,j+
SIMDSIZE*7UL);
1824 y.store( j , xmm1 );
1836 const size_t ibegin( ( IsLower_v<MT1> )
1837 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1839 const size_t iend( ( IsUpper_v<MT1> )
1840 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1849 for(
size_t i=ibegin; i<iend; ++i ) {
1851 xmm1 -= x1 * A.load(i,j );
1852 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1853 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1854 xmm4 -= x1 * A.load(i,j+
SIMDSIZE*3UL);
1857 y.store( j , xmm1 );
1865 const size_t ibegin( ( IsLower_v<MT1> )
1866 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1868 const size_t iend( ( IsUpper_v<MT1> )
1869 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1877 for(
size_t i=ibegin; i<iend; ++i ) {
1879 xmm1 -= x1 * A.load(i,j );
1880 xmm2 -= x1 * A.load(i,j+
SIMDSIZE );
1881 xmm3 -= x1 * A.load(i,j+
SIMDSIZE*2UL);
1884 y.store( j , xmm1 );
1891 const size_t ibegin( ( IsLower_v<MT1> )
1892 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1894 const size_t iend( ( IsUpper_v<MT1> )
1895 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1902 for(
size_t i=ibegin; i<iend; ++i ) {
1904 xmm1 -= x1 * A.load(i,j );
1908 y.store( j , xmm1 );
1914 const size_t ibegin( ( IsLower_v<MT1> )
1915 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
1917 const size_t iend( ( IsUpper_v<MT1> )
1918 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
1924 for(
size_t i=ibegin; i<iend; ++i ) {
1925 xmm1 -=
set( x[i] ) * A.load(i,j);
1931 for( ; remainder && j<N; ++j )
1933 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
1934 const size_t iend( ( IsUpper_v<MT1> )?(
min( j+1UL, M ) ):( M ) );
1939 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
1940 value += x[i] * A(i,j);
1963 template<
typename VT1
1966 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1967 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1969 selectDefaultSubAssignKernel( y, x, A );
1989 template<
typename VT1
1992 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1993 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1995 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
1997 const size_t M( A.rows() );
1998 const size_t N( A.columns() );
2000 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
2001 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
2005 for(
size_t jj=0U; jj<N; jj+=jblock ) {
2006 for(
size_t ii=0UL; ii<M; ii+=iblock )
2008 const size_t iend(
min( ii+iblock, M ) );
2009 const size_t jtmp(
min( jj+jblock, N ) );
2010 const size_t jend( ( IsLower_v<MT1> )
2011 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
2017 size_t j( ( IsUpper_v<MT1> )
2024 SIMDType xmm1( x1 * A.load(ii,j ) );
2033 for(
size_t i=ii+1UL; i<iend; ++i ) {
2035 xmm1 += x1 * A.load(i,j );
2036 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2037 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2038 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2039 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
2040 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
2041 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
2042 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
2045 y.store( j , y.load(j ) - xmm1 );
2058 SIMDType xmm1( x1 * A.load(ii,j ) );
2063 for(
size_t i=ii+1UL; i<iend; ++i ) {
2065 xmm1 += x1 * A.load(i,j );
2066 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2067 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2068 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
2071 y.store( j , y.load(j ) - xmm1 );
2080 SIMDType xmm1( x1 * A.load(ii,j ) );
2084 for(
size_t i=ii+1UL; i<iend; ++i ) {
2086 xmm1 += x1 * A.load(i,j );
2087 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
2088 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
2091 y.store( j , y.load(j ) - xmm1 );
2099 SIMDType xmm1( x1 * A.load(ii,j ) );
2102 for(
size_t i=ii+1UL; i<iend; ++i ) {
2104 xmm1 += x1 * A.load(i,j );
2108 y.store( j , y.load(j ) - xmm1 );
2116 for(
size_t i=ii+1UL; i<iend; ++i ) {
2117 xmm1 +=
set( x[i] ) * A.load(i,j);
2120 y.store( j, y.load(j) - xmm1 );
2123 for( ; remainder && j<jend; ++j )
2127 for(
size_t i=ii+1UL; i<iend; ++i ) {
2128 value += x[i] * A(i,j);
2153 template<
typename VT1
2156 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2157 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2159 selectLargeSubAssignKernel( y, x, A );
2165#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2179 template<
typename VT1
2182 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2183 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2185 using ET = ElementType_t<VT1>;
2187 if( IsTriangular_v<MT1> ) {
2188 ResultType_t<VT1> tmp(
serial( x ) );
2189 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2190 subAssign( y, tmp );
2193 gemv( y, x, A, ET(-1), ET(1) );
2217 template<
typename VT1 >
2218 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2229 multAssign( *lhs, tmp );
2251 template<
typename VT1 >
2252 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2263 divAssign( *lhs, tmp );
2287 template<
typename VT1 >
2289 -> EnableIf_t< UseSMPAssign_v<VT1> >
2295 if( rhs.mat_.rows() == 0UL ||
2296 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2300 else if( rhs.mat_.columns() == 0UL ) {
2332 template<
typename VT1 >
2334 -> EnableIf_t< UseSMPAssign_v<VT1> >
2365 template<
typename VT1 >
2367 -> EnableIf_t< UseSMPAssign_v<VT1> >
2373 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2374 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2410 template<
typename VT1 >
2412 -> EnableIf_t< UseSMPAssign_v<VT1> >
2418 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2419 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2455 template<
typename VT1 >
2457 -> EnableIf_t< UseSMPAssign_v<VT1> >
2492 template<
typename VT1 >
2494 -> EnableIf_t< UseSMPAssign_v<VT1> >
2543template<
typename VT
2546class DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >
2547 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true > >
2548 ,
private Computation
2552 using VMM = TDVecDMatMultExpr<VT,MT>;
2553 using RES = ResultType_t<VMM>;
2554 using VRT = ResultType_t<VT>;
2555 using MRT = ResultType_t<MT>;
2556 using VET = ElementType_t<VRT>;
2557 using MET = ElementType_t<MRT>;
2558 using VCT = CompositeType_t<VT>;
2559 using MCT = CompositeType_t<MT>;
2564 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
2569 static constexpr bool evaluateMatrix =
2570 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2571 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
2579 template<
typename T1 >
2580 static constexpr bool UseSMPAssign_v = ( evaluateVector || evaluateMatrix );
2587 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2588 static constexpr bool UseBlasKernel_v =
2590 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2591 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2592 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2593 !IsDiagonal_v<T3> &&
2594 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2595 IsBLASCompatible_v< ElementType_t<T1> > &&
2596 IsBLASCompatible_v< ElementType_t<T2> > &&
2597 IsBLASCompatible_v< ElementType_t<T3> > &&
2598 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2599 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2600 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2608 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2609 static constexpr bool UseVectorizedDefaultKernel_v =
2610 ( useOptimizedKernels &&
2611 !IsDiagonal_v<T3> &&
2612 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2613 IsSIMDCombinable_v< ElementType_t<T1>
2617 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2618 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2624 using This = DVecScalarMultExpr<VMM,ST,true>;
2627 using BaseType = VecScalarMultExpr< DenseVector<This,true> >;
2632 using SIMDType = SIMDTrait_t<ElementType>;
2637 using LeftOperand =
const TDVecDMatMultExpr<VT,MT>;
2643 using LT = If_t< evaluateVector, const VRT, VCT >;
2646 using RT = If_t< evaluateMatrix, const MRT, MCT >;
2652 ( !IsDiagonal_v<MT> &&
2653 VT::simdEnabled && MT::simdEnabled &&
2654 IsSIMDCombinable_v<VET,MET,ST> &&
2655 HasSIMDAdd_v<VET,MET> &&
2656 HasSIMDMult_v<VET,MET> );
2660 ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
2700 if( index >=
vector_.size() ) {
2703 return (*
this)[index];
2712 inline size_t size()
const {
2743 template<
typename T >
2744 inline bool canAlias(
const T* alias )
const {
2745 return vector_.canAlias( alias );
2755 template<
typename T >
2756 inline bool isAliased(
const T* alias )
const {
2757 return vector_.isAliased( alias );
2777 RightOperand_t<VMM> A(
vector_.rightOperand() );
2781 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2782 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2783 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
2805 template<
typename VT1 >
2806 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2812 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
2813 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
2815 if( right.rows() == 0UL ||
2816 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
2820 else if( right.columns() == 0UL ) {
2832 DVecScalarMultExpr::selectAssignKernel( *lhs, x, A, rhs.scalar_ );
2847 template<
typename VT1
2851 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2853 if( ( IsDiagonal_v<MT1> ) ||
2854 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2855 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2856 selectSmallAssignKernel( y, x, A, scalar );
2858 selectBlasAssignKernel( y, x, A, scalar );
2876 template<
typename VT1
2880 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2882 const size_t M( A.rows() );
2883 const size_t N( A.columns() );
2885 if( IsStrictlyUpper_v<MT1> ) {
2889 if( !IsLower_v<MT1> )
2891 for(
size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<N; ++j ) {
2892 y[j] = x[0UL] * A(0UL,j);
2896 for(
size_t i=( IsLower_v<MT1> && !IsStrictlyLower_v<MT1> ? 0UL : 1UL ); i<M; ++i )
2898 if( IsDiagonal_v<MT1> )
2900 y[i] = x[i] * A(i,i) * scalar;
2904 const size_t jbegin( ( IsUpper_v<MT1> )
2905 ?( IsStrictlyUpper_v<MT1> ? i+1UL : i )
2907 const size_t jend( ( IsLower_v<MT1> )
2908 ?( IsStrictlyLower_v<MT1> ? i-1UL : i )
2912 const size_t jnum( jend - jbegin );
2913 const size_t jpos( jbegin +
prevMultiple( jnum, 2UL ) );
2916 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2917 y[j ] += x[i] * A(i,j );
2918 y[j+1UL] += x[i] * A(i,j+1UL);
2921 y[jpos] += x[i] * A(i,jpos);
2923 if( IsLower_v<MT1> ) {
2924 y[jend] = x[i] * A(i,jend);
2929 if( IsStrictlyLower_v<MT1> ) {
2933 if( !IsDiagonal_v<MT1> )
2935 const size_t iend( IsStrictlyLower_v<MT1> ? N-1UL : N );
2936 for(
size_t j=( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ); j<iend; ++j ) {
2957 template<
typename VT1
2961 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2962 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2964 selectDefaultAssignKernel( y, x, A, scalar );
2982 template<
typename VT1
2986 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2987 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
2989 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
2991 const size_t M( A.rows() );
2992 const size_t N( A.columns() );
2997 const SIMDType factor(
set( scalar ) );
3003 const size_t ibegin( ( IsLower_v<MT1> )
3004 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3006 const size_t iend( ( IsUpper_v<MT1> )
3007 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3011 SIMDType x1(
set( x[ibegin] ) );
3012 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3013 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
3014 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
3015 SIMDType xmm4( x1 * A.load(ibegin,j+
SIMDSIZE*3UL) );
3016 SIMDType xmm5( x1 * A.load(ibegin,j+
SIMDSIZE*4UL) );
3017 SIMDType xmm6( x1 * A.load(ibegin,j+
SIMDSIZE*5UL) );
3018 SIMDType xmm7( x1 * A.load(ibegin,j+
SIMDSIZE*6UL) );
3019 SIMDType xmm8( x1 * A.load(ibegin,j+
SIMDSIZE*7UL) );
3021 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3023 xmm1 += x1 * A.load(i,j );
3024 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3025 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3026 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3027 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3028 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3029 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3030 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3033 y.store( j , xmm1*factor );
3034 y.store( j+
SIMDSIZE , xmm2*factor );
3035 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
3036 y.store( j+
SIMDSIZE*3UL, xmm4*factor );
3037 y.store( j+
SIMDSIZE*4UL, xmm5*factor );
3038 y.store( j+
SIMDSIZE*5UL, xmm6*factor );
3039 y.store( j+
SIMDSIZE*6UL, xmm7*factor );
3040 y.store( j+
SIMDSIZE*7UL, xmm8*factor );
3045 const size_t ibegin( ( IsLower_v<MT1> )
3046 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3048 const size_t iend( ( IsUpper_v<MT1> )
3049 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3053 SIMDType x1(
set( x[ibegin] ) );
3054 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3055 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
3056 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
3057 SIMDType xmm4( x1 * A.load(ibegin,j+
SIMDSIZE*3UL) );
3059 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3061 xmm1 += x1 * A.load(i,j );
3062 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3063 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3064 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3067 y.store( j , xmm1*factor );
3068 y.store( j+
SIMDSIZE , xmm2*factor );
3069 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
3070 y.store( j+
SIMDSIZE*3UL, xmm4*factor );
3075 const size_t ibegin( ( IsLower_v<MT1> )
3076 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3078 const size_t iend( ( IsUpper_v<MT1> )
3079 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3083 SIMDType x1(
set( x[ibegin] ) );
3084 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3085 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
3086 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
3088 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3090 xmm1 += x1 * A.load(i,j );
3091 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3092 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3095 y.store( j , xmm1*factor );
3096 y.store( j+
SIMDSIZE , xmm2*factor );
3097 y.store( j+
SIMDSIZE*2UL, xmm3*factor );
3102 const size_t ibegin( ( IsLower_v<MT1> )
3103 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3105 const size_t iend( ( IsUpper_v<MT1> )
3106 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3110 SIMDType x1(
set( x[ibegin] ) );
3111 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3112 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE) );
3114 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3116 xmm1 += x1 * A.load(i,j );
3120 y.store( j , xmm1*factor );
3121 y.store( j+
SIMDSIZE, xmm2*factor );
3126 const size_t ibegin( ( IsLower_v<MT1> )
3127 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3129 const size_t iend( ( IsUpper_v<MT1> )
3130 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3134 SIMDType xmm1(
set( x[ibegin] ) * A.load(ibegin,j) );
3136 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3137 xmm1 +=
set( x[i] ) * A.load(i,j);
3140 y.store( j, xmm1*factor );
3143 for( ; remainder && j<N; ++j )
3145 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
3146 const size_t iend( ( IsUpper_v<MT1> )?(
min( j+1UL, M ) ):( M ) );
3151 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3152 value += x[i] * A(i,j);
3155 y[j] = value * scalar;
3174 template<
typename VT1
3178 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3179 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3181 selectDefaultAssignKernel( y, x, A, scalar );
3199 template<
typename VT1
3203 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3204 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3206 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3208 const size_t M( A.rows() );
3209 const size_t N( A.columns() );
3211 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3212 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3214 const SIMDType factor(
set( scalar ) );
3220 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3221 for(
size_t ii=0UL; ii<M; ii+=iblock )
3223 const size_t iend(
min( ii+iblock, M ) );
3224 const size_t jtmp(
min( jj+jblock, N ) );
3225 const size_t jend( ( IsLower_v<MT1> )
3226 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3232 size_t j( ( IsUpper_v<MT1> )
3238 SIMDType x1(
set( x[ii] ) );
3239 SIMDType xmm1( x1 * A.load(ii,j ) );
3240 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
3241 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
3242 SIMDType xmm4( x1 * A.load(ii,j+
SIMDSIZE*3UL) );
3243 SIMDType xmm5( x1 * A.load(ii,j+
SIMDSIZE*4UL) );
3244 SIMDType xmm6( x1 * A.load(ii,j+
SIMDSIZE*5UL) );
3245 SIMDType xmm7( x1 * A.load(ii,j+
SIMDSIZE*6UL) );
3246 SIMDType xmm8( x1 * A.load(ii,j+
SIMDSIZE*7UL) );
3248 for(
size_t i=ii+1UL; i<iend; ++i ) {
3250 xmm1 += x1 * A.load(i,j );
3251 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3252 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3253 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3254 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3255 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3256 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3257 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3260 y.store( j , y.load(j ) + xmm1*factor );
3272 SIMDType x1(
set( x[ii] ) );
3273 SIMDType xmm1( x1 * A.load(ii,j ) );
3274 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
3275 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
3276 SIMDType xmm4( x1 * A.load(ii,j+
SIMDSIZE*3UL) );
3278 for(
size_t i=ii+1UL; i<iend; ++i ) {
3280 xmm1 += x1 * A.load(i,j );
3281 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3282 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3283 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3286 y.store( j , y.load(j ) + xmm1*factor );
3294 SIMDType x1(
set( x[ii] ) );
3295 SIMDType xmm1( x1 * A.load(ii,j ) );
3296 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
3297 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
3299 for(
size_t i=ii+1UL; i<iend; ++i ) {
3301 xmm1 += x1 * A.load(i,j );
3302 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3303 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3306 y.store( j , y.load(j ) + xmm1*factor );
3313 SIMDType x1(
set( x[ii] ) );
3314 SIMDType xmm1( x1 * A.load(ii,j ) );
3315 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE) );
3317 for(
size_t i=ii+1UL; i<iend; ++i ) {
3319 xmm1 += x1 * A.load(i,j );
3323 y.store( j , y.load(j ) + xmm1*factor );
3329 SIMDType xmm1(
set( x[ii] ) * A.load(ii,j) );
3331 for(
size_t i=ii+1UL; i<iend; ++i ) {
3332 xmm1 +=
set( x[i] ) * A.load(i,j);
3335 y.store( j, y.load(j) + xmm1*factor );
3338 for( ; remainder && j<jend; ++j )
3342 for(
size_t i=ii+1UL; i<iend; ++i ) {
3343 value += x[i] * A(i,j);
3346 y[j] += value * scalar;
3366 template<
typename VT1
3370 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3371 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3373 selectLargeAssignKernel( y, x, A, scalar );
3378#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3392 template<
typename VT1
3396 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3397 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3399 using ET = ElementType_t<VT1>;
3401 if( IsTriangular_v<MT1> ) {
3402 assign( y, scalar * x );
3403 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3406 gemv( y, x, A,
ET(scalar),
ET(0) );
3424 template<
typename VT1 >
3425 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3436 assign( *lhs, tmp );
3452 template<
typename VT1 >
3453 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3459 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3460 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
3462 if( right.rows() == 0UL || right.columns() == 0UL ||
3463 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
3475 DVecScalarMultExpr::selectAddAssignKernel( *lhs, x, A, rhs.scalar_ );
3490 template<
typename VT1
3494 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3496 if( ( IsDiagonal_v<MT1> ) ||
3497 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3498 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3499 selectSmallAddAssignKernel( y, x, A, scalar );
3501 selectBlasAddAssignKernel( y, x, A, scalar );
3519 template<
typename VT1
3523 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3525 y.addAssign( x * A * scalar );
3543 template<
typename VT1
3547 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3548 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3550 selectDefaultAddAssignKernel( y, x, A, scalar );
3569 template<
typename VT1
3573 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3574 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3576 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3578 const size_t M( A.rows() );
3579 const size_t N( A.columns() );
3584 const SIMDType factor(
set( scalar ) );
3590 const size_t ibegin( ( IsLower_v<MT1> )
3591 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3593 const size_t iend( ( IsUpper_v<MT1> )
3594 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3598 SIMDType x1(
set( x[ibegin] ) );
3599 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3600 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
3601 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
3602 SIMDType xmm4( x1 * A.load(ibegin,j+
SIMDSIZE*3UL) );
3603 SIMDType xmm5( x1 * A.load(ibegin,j+
SIMDSIZE*4UL) );
3604 SIMDType xmm6( x1 * A.load(ibegin,j+
SIMDSIZE*5UL) );
3605 SIMDType xmm7( x1 * A.load(ibegin,j+
SIMDSIZE*6UL) );
3606 SIMDType xmm8( x1 * A.load(ibegin,j+
SIMDSIZE*7UL) );
3608 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3610 xmm1 += x1 * A.load(i,j );
3611 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3612 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3613 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3614 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3615 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3616 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3617 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3620 y.store( j , y.load(j ) + xmm1*factor );
3632 const size_t ibegin( ( IsLower_v<MT1> )
3633 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3635 const size_t iend( ( IsUpper_v<MT1> )
3636 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3640 SIMDType x1(
set( x[ibegin] ) );
3641 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3642 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
3643 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
3644 SIMDType xmm4( x1 * A.load(ibegin,j+
SIMDSIZE*3UL) );
3646 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3648 xmm1 += x1 * A.load(i,j );
3649 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3650 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3651 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3654 y.store( j , y.load(j ) + xmm1*factor );
3662 const size_t ibegin( ( IsLower_v<MT1> )
3663 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3665 const size_t iend( ( IsUpper_v<MT1> )
3666 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3670 SIMDType x1(
set( x[ibegin] ) );
3671 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3672 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
3673 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
3675 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3677 xmm1 += x1 * A.load(i,j );
3678 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3679 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3682 y.store( j , y.load(j ) + xmm1*factor );
3689 const size_t ibegin( ( IsLower_v<MT1> )
3690 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3692 const size_t iend( ( IsUpper_v<MT1> )
3693 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3697 SIMDType x1(
set( x[ibegin] ) );
3698 SIMDType xmm1( x1 * A.load(ibegin,j ) );
3699 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE) );
3701 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3703 xmm1 += x1 * A.load(i,j );
3707 y.store( j , y.load(j ) + xmm1*factor );
3713 const size_t ibegin( ( IsLower_v<MT1> )
3714 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
3716 const size_t iend( ( IsUpper_v<MT1> )
3717 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
3721 SIMDType xmm1(
set( x[ibegin] ) * A.load(ibegin,j) );
3723 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3724 xmm1 +=
set( x[i] ) * A.load(i,j);
3727 y.store( j, y.load(j) + xmm1*factor );
3730 for( ; remainder && j<N; ++j )
3732 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
3733 const size_t iend( ( IsUpper_v<MT1> )?(
min( j+1UL, M ) ):( M ) );
3738 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
3739 value += x[i] * A(i,j);
3742 y[j] += value * scalar;
3761 template<
typename VT1
3765 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3766 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3768 selectDefaultAddAssignKernel( y, x, A, scalar );
3787 template<
typename VT1
3791 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3792 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3794 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
3796 const size_t M( A.rows() );
3797 const size_t N( A.columns() );
3799 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
3800 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3802 const SIMDType factor(
set( scalar ) );
3806 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3807 for(
size_t ii=0UL; ii<M; ii+=iblock )
3809 const size_t iend(
min( ii+iblock, M ) );
3810 const size_t jtmp(
min( jj+jblock, N ) );
3811 const size_t jend( ( IsLower_v<MT1> )
3812 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
3818 size_t j( ( IsUpper_v<MT1> )
3824 SIMDType x1(
set( x[ii] ) );
3825 SIMDType xmm1( x1 * A.load(ii,j ) );
3826 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
3827 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
3828 SIMDType xmm4( x1 * A.load(ii,j+
SIMDSIZE*3UL) );
3829 SIMDType xmm5( x1 * A.load(ii,j+
SIMDSIZE*4UL) );
3830 SIMDType xmm6( x1 * A.load(ii,j+
SIMDSIZE*5UL) );
3831 SIMDType xmm7( x1 * A.load(ii,j+
SIMDSIZE*6UL) );
3832 SIMDType xmm8( x1 * A.load(ii,j+
SIMDSIZE*7UL) );
3834 for(
size_t i=ii+1UL; i<iend; ++i ) {
3836 xmm1 += x1 * A.load(i,j );
3837 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3838 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3839 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3840 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
3841 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
3842 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
3843 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
3846 y.store( j , y.load(j ) + xmm1*factor );
3858 SIMDType x1(
set( x[ii] ) );
3859 SIMDType xmm1( x1 * A.load(ii,j ) );
3860 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
3861 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
3862 SIMDType xmm4( x1 * A.load(ii,j+
SIMDSIZE*3UL) );
3864 for(
size_t i=ii+1UL; i<iend; ++i ) {
3866 xmm1 += x1 * A.load(i,j );
3867 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3868 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3869 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
3872 y.store( j , y.load(j ) + xmm1*factor );
3880 SIMDType x1(
set( x[ii] ) );
3881 SIMDType xmm1( x1 * A.load(ii,j ) );
3882 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
3883 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
3885 for(
size_t i=ii+1UL; i<iend; ++i ) {
3887 xmm1 += x1 * A.load(i,j );
3888 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
3889 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
3892 y.store( j , y.load(j ) + xmm1*factor );
3899 SIMDType x1(
set( x[ii] ) );
3900 SIMDType xmm1( x1 * A.load(ii,j ) );
3901 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE) );
3903 for(
size_t i=ii+1UL; i<iend; ++i ) {
3905 xmm1 += x1 * A.load(i,j );
3909 y.store( j , y.load(j ) + xmm1*factor );
3915 SIMDType xmm1(
set( x[ii] ) * A.load(ii,j) );
3917 for(
size_t i=ii+1UL; i<iend; ++i ) {
3918 xmm1 +=
set( x[i] ) * A.load(i,j);
3921 y.store( j, y.load(j) + xmm1*factor );
3924 for( ; remainder && j<jend; ++j )
3928 for(
size_t i=ii+1UL; i<iend; ++i ) {
3929 value += x[i] * A(i,j);
3932 y[j] += value * scalar;
3953 template<
typename VT1
3957 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3958 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3960 selectLargeAddAssignKernel( y, x, A, scalar );
3965#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3979 template<
typename VT1
3983 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3984 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
3986 using ET = ElementType_t<VT1>;
3988 if( IsTriangular_v<MT1> ) {
3989 ResultType_t<VT1> tmp(
serial( scalar * x ) );
3990 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
3991 addAssign( y, tmp );
3994 gemv( y, x, A,
ET(scalar),
ET(1) );
4016 template<
typename VT1 >
4017 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4023 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4024 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4026 if( right.rows() == 0UL || right.columns() == 0UL ||
4027 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
4039 DVecScalarMultExpr::selectSubAssignKernel( *lhs, x, A, rhs.scalar_ );
4054 template<
typename VT1
4058 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4060 if( ( IsDiagonal_v<MT1> ) ||
4061 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4062 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4063 selectSmallSubAssignKernel( y, x, A, scalar );
4065 selectBlasSubAssignKernel( y, x, A, scalar );
4083 template<
typename VT1
4087 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4089 y.subAssign( x * A * scalar );
4107 template<
typename VT1
4111 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4112 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4114 selectDefaultSubAssignKernel( y, x, A, scalar );
4133 template<
typename VT1
4137 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4138 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4140 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4142 const size_t M( A.rows() );
4143 const size_t N( A.columns() );
4148 const SIMDType factor(
set( scalar ) );
4154 const size_t ibegin( ( IsLower_v<MT1> )
4155 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4157 const size_t iend( ( IsUpper_v<MT1> )
4158 ?(
min( j+
SIMDSIZE*8UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4162 SIMDType x1(
set( x[ibegin] ) );
4163 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4164 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
4165 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
4166 SIMDType xmm4( x1 * A.load(ibegin,j+
SIMDSIZE*3UL) );
4167 SIMDType xmm5( x1 * A.load(ibegin,j+
SIMDSIZE*4UL) );
4168 SIMDType xmm6( x1 * A.load(ibegin,j+
SIMDSIZE*5UL) );
4169 SIMDType xmm7( x1 * A.load(ibegin,j+
SIMDSIZE*6UL) );
4170 SIMDType xmm8( x1 * A.load(ibegin,j+
SIMDSIZE*7UL) );
4172 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
4174 xmm1 += x1 * A.load(i,j );
4175 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4176 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4177 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4178 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
4179 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
4180 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
4181 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
4184 y.store( j , y.load(j ) - xmm1*factor );
4196 const size_t ibegin( ( IsLower_v<MT1> )
4197 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4199 const size_t iend( ( IsUpper_v<MT1> )
4200 ?(
min( j+
SIMDSIZE*4UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4204 SIMDType x1(
set( x[ibegin] ) );
4205 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4206 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
4207 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
4208 SIMDType xmm4( x1 * A.load(ibegin,j+
SIMDSIZE*3UL) );
4210 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
4212 xmm1 += x1 * A.load(i,j );
4213 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4214 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4215 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4218 y.store( j , y.load(j ) - xmm1*factor );
4226 const size_t ibegin( ( IsLower_v<MT1> )
4227 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4229 const size_t iend( ( IsUpper_v<MT1> )
4230 ?(
min( j+
SIMDSIZE*3UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4234 SIMDType x1(
set( x[ibegin] ) );
4235 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4236 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE ) );
4237 SIMDType xmm3( x1 * A.load(ibegin,j+
SIMDSIZE*2UL) );
4239 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
4241 xmm1 += x1 * A.load(i,j );
4242 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4243 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4246 y.store( j , y.load(j ) - xmm1*factor );
4253 const size_t ibegin( ( IsLower_v<MT1> )
4254 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4256 const size_t iend( ( IsUpper_v<MT1> )
4257 ?(
min( j+
SIMDSIZE*2UL, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4261 SIMDType x1(
set( x[ibegin] ) );
4262 SIMDType xmm1( x1 * A.load(ibegin,j ) );
4263 SIMDType xmm2( x1 * A.load(ibegin,j+
SIMDSIZE) );
4265 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
4267 xmm1 += x1 * A.load(i,j );
4271 y.store( j , y.load(j ) - xmm1*factor );
4277 const size_t ibegin( ( IsLower_v<MT1> )
4278 ?( IsStrictlyLower_v<MT1> ? j+1UL : j )
4280 const size_t iend( ( IsUpper_v<MT1> )
4281 ?(
min( j+
SIMDSIZE, M ) - ( IsStrictlyUpper_v<MT1> ? 1UL : 0UL ) )
4285 SIMDType xmm1(
set( x[ibegin] ) * A.load(ibegin,j) );
4287 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
4288 xmm1 +=
set( x[i] ) * A.load(i,j);
4291 y.store( j, y.load(j) - xmm1*factor );
4294 for( ; remainder && j<N; ++j )
4296 const size_t ibegin( ( IsLower_v<MT1> )?( j ):( 0UL ) );
4297 const size_t iend( ( IsUpper_v<MT1> )?(
min( j+1UL, M ) ):( M ) );
4302 for(
size_t i=ibegin+1UL; i<iend; ++i ) {
4303 value += x[i] * A(i,j);
4306 y[j] -= value * scalar;
4325 template<
typename VT1
4329 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4330 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4332 selectDefaultSubAssignKernel( y, x, A, scalar );
4351 template<
typename VT1
4355 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4356 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4358 constexpr bool remainder( !IsPadded_v<VT1> || !IsPadded_v<MT1> );
4360 const size_t M( A.rows() );
4361 const size_t N( A.columns() );
4363 const size_t jblock( 32768UL /
sizeof(
ElementType ) );
4364 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4366 const SIMDType factor(
set( scalar ) );
4370 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4371 for(
size_t ii=0UL; ii<M; ii+=iblock )
4373 const size_t iend(
min( ii+iblock, M ) );
4374 const size_t jtmp(
min( jj+jblock, N ) );
4375 const size_t jend( ( IsLower_v<MT1> )
4376 ?(
min( jtmp, ( IsStrictlyLower_v<MT1> ? iend-1UL : iend ) ) )
4382 size_t j( ( IsUpper_v<MT1> )
4388 SIMDType x1(
set( x[ii] ) );
4389 SIMDType xmm1( x1 * A.load(ii,j ) );
4390 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
4391 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
4392 SIMDType xmm4( x1 * A.load(ii,j+
SIMDSIZE*3UL) );
4393 SIMDType xmm5( x1 * A.load(ii,j+
SIMDSIZE*4UL) );
4394 SIMDType xmm6( x1 * A.load(ii,j+
SIMDSIZE*5UL) );
4395 SIMDType xmm7( x1 * A.load(ii,j+
SIMDSIZE*6UL) );
4396 SIMDType xmm8( x1 * A.load(ii,j+
SIMDSIZE*7UL) );
4398 for(
size_t i=ii+1UL; i<iend; ++i ) {
4400 xmm1 += x1 * A.load(i,j );
4401 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4402 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4403 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4404 xmm5 += x1 * A.load(i,j+
SIMDSIZE*4UL);
4405 xmm6 += x1 * A.load(i,j+
SIMDSIZE*5UL);
4406 xmm7 += x1 * A.load(i,j+
SIMDSIZE*6UL);
4407 xmm8 += x1 * A.load(i,j+
SIMDSIZE*7UL);
4410 y.store( j , y.load(j ) - xmm1*factor );
4422 SIMDType x1(
set( x[ii] ) );
4423 SIMDType xmm1( x1 * A.load(ii,j ) );
4424 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
4425 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
4426 SIMDType xmm4( x1 * A.load(ii,j+
SIMDSIZE*3UL) );
4428 for(
size_t i=ii+1UL; i<iend; ++i ) {
4430 xmm1 += x1 * A.load(i,j );
4431 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4432 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4433 xmm4 += x1 * A.load(i,j+
SIMDSIZE*3UL);
4436 y.store( j , y.load(j ) - xmm1*factor );
4444 SIMDType x1(
set( x[ii] ) );
4445 SIMDType xmm1( x1 * A.load(ii,j ) );
4446 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE ) );
4447 SIMDType xmm3( x1 * A.load(ii,j+
SIMDSIZE*2UL) );
4449 for(
size_t i=ii+1UL; i<iend; ++i ) {
4451 xmm1 += x1 * A.load(i,j );
4452 xmm2 += x1 * A.load(i,j+
SIMDSIZE );
4453 xmm3 += x1 * A.load(i,j+
SIMDSIZE*2UL);
4456 y.store( j , y.load(j ) - xmm1*factor );
4463 SIMDType x1(
set( x[ii] ) );
4464 SIMDType xmm1( x1 * A.load(ii,j ) );
4465 SIMDType xmm2( x1 * A.load(ii,j+
SIMDSIZE) );
4467 for(
size_t i=ii+1UL; i<iend; ++i ) {
4469 xmm1 += x1 * A.load(i,j );
4473 y.store( j , y.load(j ) - xmm1*factor );
4479 SIMDType xmm1(
set( x[ii] ) * A.load(ii,j) );
4481 for(
size_t i=ii+1UL; i<iend; ++i ) {
4482 xmm1 +=
set( x[i] ) * A.load(i,j);
4485 y.store( j, y.load(j) - xmm1*factor );
4488 for( ; remainder && j<jend; ++j )
4492 for(
size_t i=ii+1UL; i<iend; ++i ) {
4493 value += x[i] * A(i,j);
4496 y[j] -= value * scalar;
4517 template<
typename VT1
4521 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4522 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4524 selectLargeSubAssignKernel( y, x, A, scalar );
4529#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4543 template<
typename VT1
4547 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4548 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4550 using ET = ElementType_t<VT1>;
4552 if( IsTriangular_v<MT1> ) {
4553 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4554 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4555 subAssign( y, tmp );
4558 gemv( y, x, A,
ET(-scalar),
ET(1) );
4580 template<
typename VT1 >
4581 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4592 multAssign( *lhs, tmp );
4612 template<
typename VT1 >
4613 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4624 divAssign( *lhs, tmp );
4646 template<
typename VT1 >
4648 -> EnableIf_t< UseSMPAssign_v<VT1> >
4654 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4655 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4657 if( right.rows() == 0UL ||
4658 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
4662 else if( right.columns() == 0UL ) {
4692 template<
typename VT1 >
4694 -> EnableIf_t< UseSMPAssign_v<VT1> >
4723 template<
typename VT1 >
4725 -> EnableIf_t< UseSMPAssign_v<VT1> >
4731 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4732 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4734 if( right.rows() == 0UL || right.columns() == 0UL ||
4735 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
4769 template<
typename VT1 >
4771 -> EnableIf_t< UseSMPAssign_v<VT1> >
4777 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4778 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
4780 if( right.rows() == 0UL || right.columns() == 0UL ||
4781 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
4816 template<
typename VT1 >
4818 -> EnableIf_t< UseSMPAssign_v<VT1> >
4851 template<
typename VT1 >
4853 -> EnableIf_t< UseSMPAssign_v<VT1> >
4926template<
typename VT
4928inline decltype(
auto)
4935 if( (*vec).size() != (*mat).rows() ) {
4940 return ReturnType( *vec, *mat );
4955template<
typename VT,
typename MT >
4956struct IsAligned< TDVecDMatMultExpr<VT,MT> >
4957 :
public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Constraint on the transpose flag of vector types.
Header file for all SIMD functionality.
Constraint on the data type.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense vector-dense matrix multiplications.
Definition: TDVecDMatMultExpr.h:129
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:269
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDVecDMatMultExpr.h:240
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:348
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:213
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:303
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:392
If_t< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:228
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:216
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:212
ResultType_t< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:215
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:336
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:147
MultTrait_t< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:211
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDVecDMatMultExpr.h:233
CompositeType_t< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:137
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:255
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:214
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:393
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:360
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:326
If_t< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:225
If_t< IsExpression_v< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:222
ElementType_t< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
static constexpr bool evaluateVector
Compilation switch for the composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:142
CompositeType_t< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:136
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:380
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:219
ElementType_t< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDVecDMatMultExpr.h:246
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:370
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:316
ResultType_t< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the MatMatMultExpr base class.
Header file for the TVecMatMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv)
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) min(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise minimum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1339
decltype(auto) max(const DenseMatrix< MT1, SO1 > &lhs, const DenseMatrix< MT2, SO2 > &rhs)
Computes the componentwise maximum of the dense matrices lhs and rhs.
Definition: DMatDMatMapExpr.h:1375
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: TVecMatMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.
Definition: RowVector.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
BLAZE_ALWAYS_INLINE const EnableIf_t< IsIntegral_v< T > &&HasSize_v< T, 1UL >, If_t< IsSigned_v< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:75
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all vector/matrix multiplication expression templates.
Definition: TVecMatMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.
Header file for the generic max algorithm.
Header file for the generic min algorithm.