35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_ 119 template<
typename MT
122 :
public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
138 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
139 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time flag: true when the vector operand type VT is a computation
// expression or is reported as requiring evaluation (RequiresEvaluation_v).
144 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
153 template<
typename T1 >
// Compile-time switch deciding whether a BLAS kernel may be used for the
// type triple (T1, T2, T3).
// Visible requirements:
//   - all three types are contiguous; T1 offers mutable data access, T2 and
//     T3 offer const data access;
//   - all three types have simdEnabled set;
//   - the element types of all three are BLAS compatible.
// NOTE(review): the expression ends in a trailing '&&', so further conditions
// follow outside this excerpt; the opening of the expression is also missing.
163 template<
typename T1,
typename T2,
typename T3 >
164 static constexpr
bool UseBlasKernel_v =
166 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
167 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
168 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
170 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
171 IsBLASCompatible_v< ElementType_t<T1> > &&
172 IsBLASCompatible_v< ElementType_t<T2> > &&
173 IsBLASCompatible_v< ElementType_t<T3> > &&
// Compile-time switch deciding whether the hand-vectorized default kernels
// may be used for the type triple (T1, T2, T3).
// Visible requirements: useOptimizedKernels is set, all three types have
// simdEnabled set, and the element types satisfy IsSIMDCombinable_v.
// NOTE(review): the expression is cut off inside the IsSIMDCombinable_v
// argument list; any remaining conditions are outside this excerpt.
185 template<
typename T1,
typename T2,
typename T3 >
186 static constexpr
bool UseVectorizedDefaultKernel_v =
187 ( useOptimizedKernels &&
189 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
190 IsSIMDCombinable_v< ElementType_t<T1>
225 ( !IsDiagonal_v<MT> &&
226 MT::simdEnabled && VT::simdEnabled &&
227 HasSIMDAdd_v<MET,VET> &&
228 HasSIMDMult_v<MET,VET> );
// Element access branches (presumably the body of the subscript operator;
// the function header is not visible in this excerpt).
//   - Diagonal matrix: the result element is the product of the diagonal
//     entry mat_(index,index) and vec_[index].
//   - Lower matrix with index + 8 < rows: only the first n columns are used,
//     where n excludes the diagonal for strictly lower matrices.
//   - Upper matrix with index > 8: the range starts at 'begin' (one past the
//     diagonal for strictly upper matrices) and spans the remaining n columns.
// NOTE(review): the return statements of the lower/upper branches and any
// general fallback are not visible here.
263 if( IsDiagonal_v<MT> )
265 return mat_(index,index) *
vec_[index];
267 else if( IsLower_v<MT> && ( index + 8UL <
mat_.rows() ) )
269 const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
273 else if( IsUpper_v<MT> && ( index > 8UL ) )
275 const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
276 const size_t n (
mat_.columns() -
begin );
295 if( index >=
mat_.rows() ) {
298 return (*
this)[index];
307 inline size_t size() const noexcept {
// Reports whether this expression can alias with the given address.
// \param alias  The address to check.
// \return true if either the matrix operand or the vector operand reports
//         being aliased with 'alias' (both operands are queried through
//         isAliased()).
338 template<
typename T >
339 inline bool canAlias(
const T* alias )
const noexcept {
340 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether this expression is aliased with the given address.
// \param alias  The address to check.
// \return true if either the matrix operand or the vector operand reports
//         being aliased with 'alias'.
350 template<
typename T >
351 inline bool isAliased(
const T* alias )
const noexcept {
352 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
362 return mat_.isAligned() &&
vec_.isAligned();
376 (
mat_.rows() *
mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
377 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Fragment of the assignment of the product to a dense target (the function
// header is not visible in this excerpt; presumably assign()).
// Visible steps:
//   - a matrix with zero rows and a matrix with zero columns are handled as
//     two separate special cases (their bodies are not visible here);
//   - otherwise the work is forwarded to selectAssignKernel() with the
//     unwrapped target (~lhs) and the operands A and x, which are set up in
//     lines outside this excerpt.
400 template<
typename VT1 >
407 if( rhs.
mat_.rows() == 0UL ) {
410 else if( rhs.
mat_.columns() == 0UL ) {
423 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatch for the assignment of the product of A and x to y.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
// The small kernel selector is used when the matrix type is diagonal or when
// rows*columns is below DMATDVECMULT_THRESHOLD; the BLAS kernel selector call
// follows (presumably as the else branch; the 'else' is not visible here).
// NOTE(review): the embedded numbering skips a line inside the condition, so
// an additional disjunct may exist outside this excerpt.
439 template<
typename VT1
442 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
444 if( ( IsDiagonal_v<MT1> ) ||
446 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
447 selectSmallAssignKernel( y, A, x );
449 selectBlasAssignKernel( y, A, x );
// Default (non-specialized) assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
// NOTE(review): only the signature is visible in this excerpt; the body and
// the remaining template parameters are not shown.
468 template<
typename VT1
471 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Small-matrix assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false (DisableIf_t) and simply
// forwards to the default assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
492 template<
typename VT1
495 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
496 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
498 selectDefaultAssignKernel( y, A, x );
// Small-matrix assignment kernel, vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
//
// Structure visible in this excerpt:
//   - Rows are processed in blocks of 8, 4, 3 and 2, followed by a single-row
//     section.
//   - jbegin: for upper matrices the first column is the row index (one past
//     it for strictly upper matrices) masked with size_t(-SIMDSIZE), i.e.
//     rounded down to a SIMDSIZE boundary (assuming SIMDSIZE is a power of
//     two). The alternative for non-upper matrices is not visible here.
//   - jend: for lower matrices the last column is bounded by the last row of
//     the block (one less for strictly lower matrices). The alternative for
//     non-lower matrices is not visible here.
//   - jpos: jend rounded down to a SIMDSIZE boundary when a scalar remainder
//     loop is required, otherwise jend itself.
//   - Each row keeps one SIMD accumulator (xmm1..xmm8) that collects
//     A.load(row,j) times the loaded part of x; sum() reduces the accumulator
//     into the matching element of y.
//   - The scalar loops finish the columns up to jend.
// NOTE(review): loop headers over j, accumulator declarations for the smaller
// blocks, the definition of x1 and the store for the first row of each block
// are outside this excerpt.
517 template<
typename VT1
520 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
521 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder loop is only needed if either operand lacks padding.
523 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
525 const size_t M( A.rows() );
526 const size_t N( A.columns() );
// Row block: 8 rows per iteration.
530 for( ; (i+8UL) <= M; i+=8UL )
532 const size_t jbegin( ( IsUpper_v<MT1> )
533 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
535 const size_t jend( ( IsLower_v<MT1> )
536 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
540 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
543 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
548 xmm1 += A.load(i ,j) * x1;
549 xmm2 += A.load(i+1UL,j) * x1;
550 xmm3 += A.load(i+2UL,j) * x1;
551 xmm4 += A.load(i+3UL,j) * x1;
552 xmm5 += A.load(i+4UL,j) * x1;
553 xmm6 += A.load(i+5UL,j) * x1;
554 xmm7 += A.load(i+6UL,j) * x1;
555 xmm8 += A.load(i+7UL,j) * x1;
559 y[i+1UL] =
sum( xmm2 );
560 y[i+2UL] =
sum( xmm3 );
561 y[i+3UL] =
sum( xmm4 );
562 y[i+4UL] =
sum( xmm5 );
563 y[i+5UL] =
sum( xmm6 );
564 y[i+6UL] =
sum( xmm7 );
565 y[i+7UL] =
sum( xmm8 );
// Scalar remainder for the 8-row block.
567 for( ; remainder && j<jend; ++j ) {
568 y[i ] += A(i ,j) * x[j];
569 y[i+1UL] += A(i+1UL,j) * x[j];
570 y[i+2UL] += A(i+2UL,j) * x[j];
571 y[i+3UL] += A(i+3UL,j) * x[j];
572 y[i+4UL] += A(i+4UL,j) * x[j];
573 y[i+5UL] += A(i+5UL,j) * x[j];
574 y[i+6UL] += A(i+6UL,j) * x[j];
575 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block: 4 rows per iteration.
579 for( ; (i+4UL) <= M; i+=4UL )
581 const size_t jbegin( ( IsUpper_v<MT1> )
582 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
584 const size_t jend( ( IsLower_v<MT1> )
585 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
589 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
597 xmm1 += A.load(i ,j) * x1;
598 xmm2 += A.load(i+1UL,j) * x1;
599 xmm3 += A.load(i+2UL,j) * x1;
600 xmm4 += A.load(i+3UL,j) * x1;
604 y[i+1UL] =
sum( xmm2 );
605 y[i+2UL] =
sum( xmm3 );
606 y[i+3UL] =
sum( xmm4 );
// Scalar remainder for the 4-row block.
608 for( ; remainder && j<jend; ++j ) {
609 y[i ] += A(i ,j) * x[j];
610 y[i+1UL] += A(i+1UL,j) * x[j];
611 y[i+2UL] += A(i+2UL,j) * x[j];
612 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block: 3 rows per iteration.
616 for( ; (i+3UL) <= M; i+=3UL )
618 const size_t jbegin( ( IsUpper_v<MT1> )
619 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
621 const size_t jend( ( IsLower_v<MT1> )
622 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
626 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
634 xmm1 += A.load(i ,j) * x1;
635 xmm2 += A.load(i+1UL,j) * x1;
636 xmm3 += A.load(i+2UL,j) * x1;
640 y[i+1UL] =
sum( xmm2 );
641 y[i+2UL] =
sum( xmm3 );
// Scalar remainder for the 3-row block.
643 for( ; remainder && j<jend; ++j ) {
644 y[i ] += A(i ,j) * x[j];
645 y[i+1UL] += A(i+1UL,j) * x[j];
646 y[i+2UL] += A(i+2UL,j) * x[j];
// Row block: 2 rows per iteration.
650 for( ; (i+2UL) <= M; i+=2UL )
652 const size_t jbegin( ( IsUpper_v<MT1> )
653 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
655 const size_t jend( ( IsLower_v<MT1> )
656 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
660 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
668 xmm1 += A.load(i ,j) * x1;
669 xmm2 += A.load(i+1UL,j) * x1;
673 y[i+1UL] =
sum( xmm2 );
// Scalar remainder for the 2-row block.
675 for( ; remainder && j<jend; ++j ) {
676 y[i ] += A(i ,j) * x[j];
677 y[i+1UL] += A(i+1UL,j) * x[j];
// Single-row section (its enclosing condition is not visible in this excerpt).
683 const size_t jbegin( ( IsUpper_v<MT1> )
684 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
686 const size_t jend( ( IsLower_v<MT1> )
687 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
691 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
698 xmm1 += A.load(i,j) * x.load(j);
// Scalar remainder for the single row.
703 for( ; remainder && j<jend; ++j ) {
704 y[i] += A(i,j) * x[j];
// Large-matrix assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false (DisableIf_t) and simply
// forwards to the default assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
725 template<
typename VT1
728 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
729 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
731 selectDefaultAssignKernel( y, A, x );
// Large-matrix assignment kernel, vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
//
// Structure visible in this excerpt:
//   - Rows are processed in blocks of 8, 4 and 2, followed by a single-row
//     section.
//   - jbegin / jend / jpos restrict the column range for upper / lower
//     matrices and round to SIMDSIZE boundaries via the size_t(-SIMDSIZE)
//     mask (assuming SIMDSIZE is a power of two); the alternatives for
//     general matrices are not visible here.
//   - Within a row block the columns are consumed in three stages: four SIMD
//     vectors at a time (columns j, j1, j2, j3 with x1..x4), then two at a
//     time (j, j1 with x1, x2), then one at a time (j with x1). Each stage
//     adds the horizontal sum of the products to y.
//   - The scalar loops finish the columns up to jend.
// NOTE(review): every visible update of y uses '+=', so this kernel relies on
// y holding the correct starting values; the statement that establishes them
// is outside this excerpt. Please confirm against the full file. The inner
// loop headers and the definitions of j1..j3 and x1..x4 are also not shown.
750 template<
typename VT1
753 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
754 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder loop is only needed if either operand lacks padding.
756 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
758 const size_t M( A.rows() );
759 const size_t N( A.columns() );
// Row block: 8 rows per iteration.
765 for( ; (i+8UL) <= M; i+=8UL )
767 const size_t jbegin( ( IsUpper_v<MT1> )
768 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
770 const size_t jend( ( IsLower_v<MT1> )
771 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
775 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
788 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
789 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
790 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
791 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
792 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
793 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
794 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
795 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
802 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
803 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
804 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
805 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
806 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
807 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
808 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
809 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
814 y[i ] +=
sum( A.load(i ,j) * x1 );
815 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
816 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
817 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
818 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
819 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
820 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
821 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar remainder for the 8-row block.
824 for( ; remainder && j<jend; ++j ) {
825 y[i ] += A(i ,j) * x[j];
826 y[i+1UL] += A(i+1UL,j) * x[j];
827 y[i+2UL] += A(i+2UL,j) * x[j];
828 y[i+3UL] += A(i+3UL,j) * x[j];
829 y[i+4UL] += A(i+4UL,j) * x[j];
830 y[i+5UL] += A(i+5UL,j) * x[j];
831 y[i+6UL] += A(i+6UL,j) * x[j];
832 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block: 4 rows per iteration.
836 for( ; (i+4UL) <= M; i+=4UL )
838 const size_t jbegin( ( IsUpper_v<MT1> )
839 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
841 const size_t jend( ( IsLower_v<MT1> )
842 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
846 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
859 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
860 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
861 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
862 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
869 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
870 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
871 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
872 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
877 y[i ] +=
sum( A.load(i ,j) * x1 );
878 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
879 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
880 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
// Scalar remainder for the 4-row block.
883 for( ; remainder && j<jend; ++j ) {
884 y[i ] += A(i ,j) * x[j];
885 y[i+1UL] += A(i+1UL,j) * x[j];
886 y[i+2UL] += A(i+2UL,j) * x[j];
887 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block: 2 rows per iteration.
891 for( ; (i+2UL) <= M; i+=2UL )
893 const size_t jbegin( ( IsUpper_v<MT1> )
894 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
896 const size_t jend( ( IsLower_v<MT1> )
897 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
901 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
914 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
915 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
922 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
923 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
928 y[i ] +=
sum( A.load(i ,j) * x1 );
929 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
// Scalar remainder for the 2-row block.
932 for( ; remainder && j<jend; ++j ) {
933 y[i ] += A(i ,j) * x[j];
934 y[i+1UL] += A(i+1UL,j) * x[j];
// Single-row section (its enclosing condition is not visible in this excerpt).
940 const size_t jbegin( ( IsUpper_v<MT1> )
941 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
943 const size_t jend( ( IsLower_v<MT1> )
944 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
948 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
961 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
968 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
973 y[i] +=
sum( A.load(i,j) * x1 );
// Scalar remainder for the single row.
976 for( ; remainder && j<jend; ++j ) {
977 y[i] += A(i,j) * x[j];
// BLAS assignment kernel, fallback overload.
// Participates in overload resolution only when UseBlasKernel_v<VT1,MT1,VT2>
// is false (DisableIf_t) and forwards to the large-matrix kernel selector.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
998 template<
typename VT1
1001 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1002 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1004 selectLargeAssignKernel( y, A, x );
// BLAS assignment kernel, BLAS-backed overload.
// Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled, and only selected
// when UseBlasKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
// Triangular matrices go through trmv() operating on y, with the lower/upper
// flag chosen from IsLower_v<MT1>; all other matrices go through gemv().
// NOTE(review): how y is initialised before the trmv() call is not visible in
// this excerpt. The gemv() scalars ET(1), ET(0) are presumably alpha and beta
// (y = A*x); confirm against the gemv wrapper signature.
1010 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1024 template<
typename VT1
1027 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1028 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1030 using ET = ElementType_t<VT1>;
1032 if( IsTriangular_v<MT1> ) {
1034 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1037 gemv( y, A, x, ET(1), ET(0) );
// Assignment of the product expression to a non-transposed sparse vector.
// \param lhs  Sparse target vector.
// \param rhs  The product expression to assign.
// The visible step assigns a temporary 'tmp' to the unwrapped target (~lhs).
// NOTE(review): the construction of 'tmp' is outside this excerpt.
1057 template<
typename VT1 >
1058 friend inline void assign( SparseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1069 assign( ~lhs, tmp );
// Addition assignment of the product expression to a non-transposed dense
// vector.
// \param lhs  Dense target vector.
// \param rhs  The product expression to add.
// A matrix operand with zero rows or zero columns is handled as a special
// case (its body is not visible here); otherwise the work is forwarded to
// selectAddAssignKernel() with the unwrapped target and the operands A and x,
// which are set up in lines outside this excerpt.
1087 template<
typename VT1 >
1088 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1094 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1106 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatch for the addition assignment of the product of A and x to y.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
// The small kernel selector is used when the matrix type is diagonal or when
// rows*columns is below DMATDVECMULT_THRESHOLD; the BLAS kernel selector call
// follows (presumably as the else branch; the 'else' is not visible here).
// NOTE(review): the embedded numbering skips a line inside the condition, so
// an additional disjunct may exist outside this excerpt.
1122 template<
typename VT1
1125 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1127 if( ( IsDiagonal_v<MT1> ) ||
1129 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1130 selectSmallAddAssignKernel( y, A, x );
1132 selectBlasAddAssignKernel( y, A, x );
// Default addition assignment kernel: hands the product expression A * x to
// the target's own addAssign() member.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
1151 template<
typename VT1
1154 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1156 y.addAssign( A * x );
// Small-matrix addition assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false (DisableIf_t) and simply
// forwards to the default addition assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
1175 template<
typename VT1
1178 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1179 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1181 selectDefaultAddAssignKernel( y, A, x );
// Small-matrix addition assignment kernel, vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector; the product is added to its current contents.
// \param A  Matrix operand.
// \param x  Vector operand.
//
// Structure visible in this excerpt:
//   - Rows are processed in blocks of 8, 4, 3 and 2, followed by a single-row
//     section.
//   - jbegin / jend / jpos restrict the column range for upper / lower
//     matrices and round to SIMDSIZE boundaries via the size_t(-SIMDSIZE)
//     mask (assuming SIMDSIZE is a power of two); the alternatives for
//     general matrices are not visible here.
//   - Each row keeps one SIMD accumulator (xmm1..xmm8); sum() of the
//     accumulator is added to the matching element of y.
//   - The scalar loops finish the columns up to jend.
// NOTE(review): loop headers over j, accumulator declarations for the smaller
// blocks and the definition of x1 are outside this excerpt.
1200 template<
typename VT1
1203 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1204 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder loop is only needed if either operand lacks padding.
1206 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1208 const size_t M( A.rows() );
1209 const size_t N( A.columns() );
// Row block: 8 rows per iteration.
1213 for( ; (i+8UL) <= M; i+=8UL )
1215 const size_t jbegin( ( IsUpper_v<MT1> )
1216 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1218 const size_t jend( ( IsLower_v<MT1> )
1219 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1223 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1226 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1231 xmm1 += A.load(i ,j) * x1;
1232 xmm2 += A.load(i+1UL,j) * x1;
1233 xmm3 += A.load(i+2UL,j) * x1;
1234 xmm4 += A.load(i+3UL,j) * x1;
1235 xmm5 += A.load(i+4UL,j) * x1;
1236 xmm6 += A.load(i+5UL,j) * x1;
1237 xmm7 += A.load(i+6UL,j) * x1;
1238 xmm8 += A.load(i+7UL,j) * x1;
1241 y[i ] +=
sum( xmm1 );
1242 y[i+1UL] +=
sum( xmm2 );
1243 y[i+2UL] +=
sum( xmm3 );
1244 y[i+3UL] +=
sum( xmm4 );
1245 y[i+4UL] +=
sum( xmm5 );
1246 y[i+5UL] +=
sum( xmm6 );
1247 y[i+6UL] +=
sum( xmm7 );
1248 y[i+7UL] +=
sum( xmm8 );
// Scalar remainder for the 8-row block.
1250 for( ; remainder && j<jend; ++j ) {
1251 y[i ] += A(i ,j) * x[j];
1252 y[i+1UL] += A(i+1UL,j) * x[j];
1253 y[i+2UL] += A(i+2UL,j) * x[j];
1254 y[i+3UL] += A(i+3UL,j) * x[j];
1255 y[i+4UL] += A(i+4UL,j) * x[j];
1256 y[i+5UL] += A(i+5UL,j) * x[j];
1257 y[i+6UL] += A(i+6UL,j) * x[j];
1258 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block: 4 rows per iteration.
1262 for( ; (i+4UL) <= M; i+=4UL )
1264 const size_t jbegin( ( IsUpper_v<MT1> )
1265 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1267 const size_t jend( ( IsLower_v<MT1> )
1268 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1272 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1280 xmm1 += A.load(i ,j) * x1;
1281 xmm2 += A.load(i+1UL,j) * x1;
1282 xmm3 += A.load(i+2UL,j) * x1;
1283 xmm4 += A.load(i+3UL,j) * x1;
1286 y[i ] +=
sum( xmm1 );
1287 y[i+1UL] +=
sum( xmm2 );
1288 y[i+2UL] +=
sum( xmm3 );
1289 y[i+3UL] +=
sum( xmm4 );
// Scalar remainder for the 4-row block.
1291 for( ; remainder && j<jend; ++j ) {
1292 y[i ] += A(i ,j) * x[j];
1293 y[i+1UL] += A(i+1UL,j) * x[j];
1294 y[i+2UL] += A(i+2UL,j) * x[j];
1295 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block: 3 rows per iteration.
1299 for( ; (i+3UL) <= M; i+=3UL )
1301 const size_t jbegin( ( IsUpper_v<MT1> )
1302 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1304 const size_t jend( ( IsLower_v<MT1> )
1305 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
1309 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1317 xmm1 += A.load(i ,j) * x1;
1318 xmm2 += A.load(i+1UL,j) * x1;
1319 xmm3 += A.load(i+2UL,j) * x1;
1322 y[i ] +=
sum( xmm1 );
1323 y[i+1UL] +=
sum( xmm2 );
1324 y[i+2UL] +=
sum( xmm3 );
// Scalar remainder for the 3-row block.
1326 for( ; remainder && j<jend; ++j ) {
1327 y[i ] += A(i ,j) * x[j];
1328 y[i+1UL] += A(i+1UL,j) * x[j];
1329 y[i+2UL] += A(i+2UL,j) * x[j];
// Row block: 2 rows per iteration.
1333 for( ; (i+2UL) <= M; i+=2UL )
1335 const size_t jbegin( ( IsUpper_v<MT1> )
1336 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1338 const size_t jend( ( IsLower_v<MT1> )
1339 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1343 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1351 xmm1 += A.load(i ,j) * x1;
1352 xmm2 += A.load(i+1UL,j) * x1;
1355 y[i ] +=
sum( xmm1 );
1356 y[i+1UL] +=
sum( xmm2 );
// Scalar remainder for the 2-row block.
1358 for( ; remainder && j<jend; ++j ) {
1359 y[i ] += A(i ,j) * x[j];
1360 y[i+1UL] += A(i+1UL,j) * x[j];
// Single-row section (its enclosing condition is not visible in this excerpt).
1366 const size_t jbegin( ( IsUpper_v<MT1> )
1367 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1369 const size_t jend( ( IsLower_v<MT1> )
1370 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1374 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1381 xmm1 += A.load(i,j) * x.load(j);
1384 y[i] +=
sum( xmm1 );
// Scalar remainder for the single row.
1386 for( ; remainder && j<jend; ++j ) {
1387 y[i] += A(i,j) * x[j];
// Large-matrix addition assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false (DisableIf_t) and simply
// forwards to the default addition assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
1408 template<
typename VT1
1411 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1412 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1414 selectDefaultAddAssignKernel( y, A, x );
// Large-matrix addition assignment kernel, vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector; the product is added to its current contents.
// \param A  Matrix operand.
// \param x  Vector operand.
//
// Structure visible in this excerpt:
//   - Rows are processed in blocks of 8, 4 and 2, followed by a single-row
//     section.
//   - jbegin / jend / jpos restrict the column range for upper / lower
//     matrices and round to SIMDSIZE boundaries via the size_t(-SIMDSIZE)
//     mask (assuming SIMDSIZE is a power of two); the alternatives for
//     general matrices are not visible here.
//   - Within a row block the columns are consumed in three stages: four SIMD
//     vectors at a time (columns j, j1, j2, j3 with x1..x4), then two at a
//     time (j, j1 with x1, x2), then one at a time (j with x1). Each stage
//     adds the horizontal sum of the products to y.
//   - The scalar loops finish the columns up to jend.
// NOTE(review): the inner loop headers and the definitions of j1..j3 and
// x1..x4 are outside this excerpt.
1433 template<
typename VT1
1436 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1437 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder loop is only needed if either operand lacks padding.
1439 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1441 const size_t M( A.rows() );
1442 const size_t N( A.columns() );
// Row block: 8 rows per iteration.
1446 for( ; (i+8UL) <= M; i+=8UL )
1448 const size_t jbegin( ( IsUpper_v<MT1> )
1449 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1451 const size_t jend( ( IsLower_v<MT1> )
1452 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1456 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1469 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1470 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1471 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1472 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1473 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1474 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1475 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1476 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1483 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1484 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1485 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1486 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1487 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1488 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1489 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1490 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1495 y[i ] +=
sum( A.load(i ,j) * x1 );
1496 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1497 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1498 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1499 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1500 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1501 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1502 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar remainder for the 8-row block.
1505 for( ; remainder && j<jend; ++j ) {
1506 y[i ] += A(i ,j) * x[j];
1507 y[i+1UL] += A(i+1UL,j) * x[j];
1508 y[i+2UL] += A(i+2UL,j) * x[j];
1509 y[i+3UL] += A(i+3UL,j) * x[j];
1510 y[i+4UL] += A(i+4UL,j) * x[j];
1511 y[i+5UL] += A(i+5UL,j) * x[j];
1512 y[i+6UL] += A(i+6UL,j) * x[j];
1513 y[i+7UL] += A(i+7UL,j) * x[j];
// Row block: 4 rows per iteration.
1517 for( ; (i+4UL) <= M; i+=4UL )
1519 const size_t jbegin( ( IsUpper_v<MT1> )
1520 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1522 const size_t jend( ( IsLower_v<MT1> )
1523 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1527 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1540 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1541 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1542 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1543 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1550 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1551 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1552 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1553 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1558 y[i ] +=
sum( A.load(i ,j) * x1 );
1559 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1560 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1561 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
// Scalar remainder for the 4-row block.
1564 for( ; remainder && j<jend; ++j ) {
1565 y[i ] += A(i ,j) * x[j];
1566 y[i+1UL] += A(i+1UL,j) * x[j];
1567 y[i+2UL] += A(i+2UL,j) * x[j];
1568 y[i+3UL] += A(i+3UL,j) * x[j];
// Row block: 2 rows per iteration.
1572 for( ; (i+2UL) <= M; i+=2UL )
1574 const size_t jbegin( ( IsUpper_v<MT1> )
1575 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1577 const size_t jend( ( IsLower_v<MT1> )
1578 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1582 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1595 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1596 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1603 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1604 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1609 y[i ] +=
sum( A.load(i ,j) * x1 );
1610 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
// Scalar remainder for the 2-row block.
1613 for( ; remainder && j<jend; ++j ) {
1614 y[i ] += A(i ,j) * x[j];
1615 y[i+1UL] += A(i+1UL,j) * x[j];
// Single-row section (its enclosing condition is not visible in this excerpt).
1621 const size_t jbegin( ( IsUpper_v<MT1> )
1622 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1624 const size_t jend( ( IsLower_v<MT1> )
1625 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1629 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1642 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1649 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1654 y[i] +=
sum( A.load(i,j) * x1 );
// Scalar remainder for the single row.
1657 for( ; remainder && j<jend; ++j ) {
1658 y[i] += A(i,j) * x[j];
// BLAS addition assignment kernel, fallback overload.
// Participates in overload resolution only when UseBlasKernel_v<VT1,MT1,VT2>
// is false (DisableIf_t) and forwards to the large-matrix kernel selector.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
1679 template<
typename VT1
1682 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1683 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1685 selectLargeAddAssignKernel( y, A, x );
// BLAS addition assignment kernel, BLAS-backed overload.
// Only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are enabled, and only selected
// when UseBlasKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector; the product is added to its current contents.
// \param A  Matrix operand.
// \param x  Vector operand.
// Triangular matrices: x is copied (via serial()) into a temporary of the
// target's result type, trmv() is applied to that temporary with the
// lower/upper flag chosen from IsLower_v<MT1>, and the temporary is then
// added to y. All other matrices go through gemv().
// NOTE(review): the gemv() scalars ET(1), ET(1) are presumably alpha and beta
// (y = A*x + y); confirm against the gemv wrapper signature.
1691 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 1705 template<
typename VT1
1708 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1709 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1711 using ET = ElementType_t<VT1>;
1713 if( IsTriangular_v<MT1> ) {
1714 ResultType_t<VT1> tmp(
serial( x ) );
1715 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1716 addAssign( y, tmp );
1719 gemv( y, A, x, ET(1), ET(1) );
// Subtraction assignment of the product expression to a non-transposed dense
// vector.
// \param lhs  Dense target vector.
// \param rhs  The product expression to subtract.
// A matrix operand with zero rows or zero columns is handled as a special
// case (its body is not visible here); otherwise the work is forwarded to
// selectSubAssignKernel() with the unwrapped target and the operands A and x,
// which are set up in lines outside this excerpt.
1743 template<
typename VT1 >
1744 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1750 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1762 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatch for the subtraction assignment of the product of A and x
// to y.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
// The small kernel selector is used when the matrix type is diagonal or when
// rows*columns is below DMATDVECMULT_THRESHOLD; the BLAS kernel selector call
// follows (presumably as the else branch; the 'else' is not visible here).
// NOTE(review): the embedded numbering skips a line inside the condition, so
// an additional disjunct may exist outside this excerpt.
1778 template<
typename VT1
1781 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1783 if( ( IsDiagonal_v<MT1> ) ||
1785 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1786 selectSmallSubAssignKernel( y, A, x );
1788 selectBlasSubAssignKernel( y, A, x );
// Default subtraction assignment kernel: hands the product expression A * x
// to the target's own subAssign() member.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
1807 template<
typename VT1
1810 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1812 y.subAssign( A * x );
// Small-matrix subtraction assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false (DisableIf_t) and simply
// forwards to the default subtraction assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
1831 template<
typename VT1
1834 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1835 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1837 selectDefaultSubAssignKernel( y, A, x );
// Small-matrix subtraction assignment kernel, vectorized overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
// \param y  Target vector; the product is subtracted from its current contents.
// \param A  Matrix operand.
// \param x  Vector operand.
//
// Structure visible in this excerpt:
//   - Rows are processed in blocks of 8, 4, 3 and 2, followed by a single-row
//     section.
//   - jbegin / jend / jpos restrict the column range for upper / lower
//     matrices and round to SIMDSIZE boundaries via the size_t(-SIMDSIZE)
//     mask (assuming SIMDSIZE is a power of two); the alternatives for
//     general matrices are not visible here.
//   - Each row keeps one SIMD accumulator (xmm1..xmm8); sum() of the
//     accumulator is subtracted from the matching element of y.
//   - The scalar loops finish the columns up to jend, also subtracting.
// NOTE(review): loop headers over j, accumulator declarations for the smaller
// blocks and the definition of x1 are outside this excerpt.
1856 template<
typename VT1
1859 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1860 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar remainder loop is only needed if either operand lacks padding.
1862 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1864 const size_t M( A.rows() );
1865 const size_t N( A.columns() );
// Row block: 8 rows per iteration.
1869 for( ; (i+8UL) <= M; i+=8UL )
1871 const size_t jbegin( ( IsUpper_v<MT1> )
1872 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1874 const size_t jend( ( IsLower_v<MT1> )
1875 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
1879 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1882 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1887 xmm1 += A.load(i ,j) * x1;
1888 xmm2 += A.load(i+1UL,j) * x1;
1889 xmm3 += A.load(i+2UL,j) * x1;
1890 xmm4 += A.load(i+3UL,j) * x1;
1891 xmm5 += A.load(i+4UL,j) * x1;
1892 xmm6 += A.load(i+5UL,j) * x1;
1893 xmm7 += A.load(i+6UL,j) * x1;
1894 xmm8 += A.load(i+7UL,j) * x1;
1897 y[i ] -=
sum( xmm1 );
1898 y[i+1UL] -=
sum( xmm2 );
1899 y[i+2UL] -=
sum( xmm3 );
1900 y[i+3UL] -=
sum( xmm4 );
1901 y[i+4UL] -=
sum( xmm5 );
1902 y[i+5UL] -=
sum( xmm6 );
1903 y[i+6UL] -=
sum( xmm7 );
1904 y[i+7UL] -=
sum( xmm8 );
// Scalar remainder for the 8-row block.
1906 for( ; remainder && j<jend; ++j ) {
1907 y[i ] -= A(i ,j) * x[j];
1908 y[i+1UL] -= A(i+1UL,j) * x[j];
1909 y[i+2UL] -= A(i+2UL,j) * x[j];
1910 y[i+3UL] -= A(i+3UL,j) * x[j];
1911 y[i+4UL] -= A(i+4UL,j) * x[j];
1912 y[i+5UL] -= A(i+5UL,j) * x[j];
1913 y[i+6UL] -= A(i+6UL,j) * x[j];
1914 y[i+7UL] -= A(i+7UL,j) * x[j];
// Row block: 4 rows per iteration.
1918 for( ; (i+4UL) <= M; i+=4UL )
1920 const size_t jbegin( ( IsUpper_v<MT1> )
1921 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1923 const size_t jend( ( IsLower_v<MT1> )
1924 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1928 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1936 xmm1 += A.load(i ,j) * x1;
1937 xmm2 += A.load(i+1UL,j) * x1;
1938 xmm3 += A.load(i+2UL,j) * x1;
1939 xmm4 += A.load(i+3UL,j) * x1;
1942 y[i ] -=
sum( xmm1 );
1943 y[i+1UL] -=
sum( xmm2 );
1944 y[i+2UL] -=
sum( xmm3 );
1945 y[i+3UL] -=
sum( xmm4 );
// Scalar remainder for the 4-row block.
1947 for( ; remainder && j<jend; ++j ) {
1948 y[i ] -= A(i ,j) * x[j];
1949 y[i+1UL] -= A(i+1UL,j) * x[j];
1950 y[i+2UL] -= A(i+2UL,j) * x[j];
1951 y[i+3UL] -= A(i+3UL,j) * x[j];
// Row block: 3 rows per iteration.
1955 for( ; (i+3UL) <= M; i+=3UL )
1957 const size_t jbegin( ( IsUpper_v<MT1> )
1958 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1960 const size_t jend( ( IsLower_v<MT1> )
1961 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
1965 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
1973 xmm1 += A.load(i ,j) * x1;
1974 xmm2 += A.load(i+1UL,j) * x1;
1975 xmm3 += A.load(i+2UL,j) * x1;
1978 y[i ] -=
sum( xmm1 );
1979 y[i+1UL] -=
sum( xmm2 );
1980 y[i+2UL] -=
sum( xmm3 );
// Scalar remainder for the 3-row block.
1982 for( ; remainder && j<jend; ++j ) {
1983 y[i ] -= A(i ,j) * x[j];
1984 y[i+1UL] -= A(i+1UL,j) * x[j];
1985 y[i+2UL] -= A(i+2UL,j) * x[j];
// Row block: 2 rows per iteration.
1989 for( ; (i+2UL) <= M; i+=2UL )
1991 const size_t jbegin( ( IsUpper_v<MT1> )
1992 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
1994 const size_t jend( ( IsLower_v<MT1> )
1995 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1999 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
2007 xmm1 += A.load(i ,j) * x1;
2008 xmm2 += A.load(i+1UL,j) * x1;
2011 y[i ] -=
sum( xmm1 );
2012 y[i+1UL] -=
sum( xmm2 );
// Scalar remainder for the 2-row block.
2014 for( ; remainder && j<jend; ++j ) {
2015 y[i ] -= A(i ,j) * x[j];
2016 y[i+1UL] -= A(i+1UL,j) * x[j];
// Single-row section (its enclosing condition is not visible in this excerpt).
2022 const size_t jbegin( ( IsUpper_v<MT1> )
2023 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
2025 const size_t jend( ( IsLower_v<MT1> )
2026 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2030 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
2037 xmm1 += A.load(i,j) * x.load(j);
2040 y[i] -=
sum( xmm1 );
// Scalar remainder for the single row.
2042 for( ; remainder && j<jend; ++j ) {
2043 y[i] -= A(i,j) * x[j];
// Large-matrix subtraction assignment kernel, fallback overload.
// Participates in overload resolution only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false (DisableIf_t) and simply
// forwards to the default subtraction assignment kernel.
// \param y  Target vector.
// \param A  Matrix operand.
// \param x  Vector operand.
2064 template<
typename VT1
2067 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2068 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2070 selectDefaultSubAssignKernel( y, A, x );
2089 template<
typename VT1
2092 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2093 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2095 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
2097 const size_t M( A.rows() );
2098 const size_t N( A.columns() );
2102 for( ; (i+8UL) <= M; i+=8UL )
2104 const size_t jbegin( ( IsUpper_v<MT1> )
2105 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
2107 const size_t jend( ( IsLower_v<MT1> )
2108 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
2112 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
2125 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2126 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2127 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2128 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2129 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2130 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2131 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2132 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
2139 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2140 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2141 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2142 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2143 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2144 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2145 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2146 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
2151 y[i ] -=
sum( A.load(i ,j) * x1 );
2152 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2153 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2154 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2155 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2156 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2157 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2158 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
2161 for( ; remainder && j<jend; ++j ) {
2162 y[i ] -= A(i ,j) * x[j];
2163 y[i+1UL] -= A(i+1UL,j) * x[j];
2164 y[i+2UL] -= A(i+2UL,j) * x[j];
2165 y[i+3UL] -= A(i+3UL,j) * x[j];
2166 y[i+4UL] -= A(i+4UL,j) * x[j];
2167 y[i+5UL] -= A(i+5UL,j) * x[j];
2168 y[i+6UL] -= A(i+6UL,j) * x[j];
2169 y[i+7UL] -= A(i+7UL,j) * x[j];
2173 for( ; (i+4UL) <= M; i+=4UL )
2175 const size_t jbegin( ( IsUpper_v<MT1> )
2176 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
2178 const size_t jend( ( IsLower_v<MT1> )
2179 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
2183 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
2196 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2197 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2198 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2199 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2206 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2207 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2208 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2209 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2214 y[i ] -=
sum( A.load(i ,j) * x1 );
2215 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2216 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2217 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2220 for( ; remainder && j<jend; ++j ) {
2221 y[i ] -= A(i ,j) * x[j];
2222 y[i+1UL] -= A(i+1UL,j) * x[j];
2223 y[i+2UL] -= A(i+2UL,j) * x[j];
2224 y[i+3UL] -= A(i+3UL,j) * x[j];
2228 for( ; (i+2UL) <= M; i+=2UL )
2230 const size_t jbegin( ( IsUpper_v<MT1> )
2231 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
2233 const size_t jend( ( IsLower_v<MT1> )
2234 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
2238 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
2251 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2252 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2259 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2260 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2265 y[i ] -=
sum( A.load(i ,j) * x1 );
2266 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2269 for( ; remainder && j<jend; ++j ) {
2270 y[i ] -= A(i ,j) * x[j];
2271 y[i+1UL] -= A(i+1UL,j) * x[j];
2277 const size_t jbegin( ( IsUpper_v<MT1> )
2278 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
2280 const size_t jend( ( IsLower_v<MT1> )
2281 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2285 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
2298 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2305 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2310 y[i] -=
sum( A.load(i,j) * x1 );
2313 for( ; remainder && j<jend; ++j ) {
2314 y[i] -= A(i,j) * x[j];
// Fallback overload of the BLAS-based subtraction assignment kernel.
// Enabled only when UseBlasKernel_v<VT1,MT1,VT2> is false (DisableIf_t); in that
// case the call is forwarded unchanged to selectLargeSubAssignKernel().
2335 template<
typename VT1
2338 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2339 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2341 selectLargeSubAssignKernel( y, A, x );
// BLAS-based subtraction assignment kernel.
// Compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to true, and selected only
// when UseBlasKernel_v<VT1,MT1,VT2> is true (EnableIf_t).
2347 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 2361 template<
typename VT1
2364 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2365 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
// Element type of the target vector; used to build the scalar arguments for gemv().
2367 using ET = ElementType_t<VT1>;
// Triangular matrix: x is copied (via serial()) into a temporary of y's result type,
// the temporary is handed to trmv() together with A and the lower/upper flag, and
// the temporary is then subtracted from y through subAssign().
2369 if( IsTriangular_v<MT1> ) {
2370 ResultType_t<VT1> tmp(
serial( x ) );
2371 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2372 subAssign( y, tmp );
// NOTE(review): the lines separating the two branches are not visible in this excerpt.
// gemv() receives the scalars ET(-1) and ET(1); presumably this computes
// y = -1*(A*x) + 1*y -- confirm against the gemv() wrapper.
2375 gemv( y, A, x, ET(-1), ET(1) );
2399 template<
typename VT1 >
2400 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
2411 multAssign( ~lhs, tmp );
2433 template<
typename VT1 >
2434 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
2445 divAssign( ~lhs, tmp );
2469 template<
typename VT1 >
2471 -> EnableIf_t< UseSMPAssign_v<VT1> >
2477 if( rhs.mat_.rows() == 0UL ) {
2480 else if( rhs.mat_.columns() == 0UL ) {
2513 template<
typename VT1 >
2515 -> EnableIf_t< UseSMPAssign_v<VT1> >
2546 template<
typename VT1 >
2548 -> EnableIf_t< UseSMPAssign_v<VT1> >
2554 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2590 template<
typename VT1 >
2592 -> EnableIf_t< UseSMPAssign_v<VT1> >
2598 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2634 template<
typename VT1 >
2636 -> EnableIf_t< UseSMPAssign_v<VT1> >
2671 template<
typename VT1 >
2673 -> EnableIf_t< UseSMPAssign_v<VT1> >
2722 template<
typename MT
2725 class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
2726 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
2727 ,
private Computation
// Internal shorthand types of the scaled matrix/vector multiplication expression.
// MVM: the wrapped dense matrix/dense vector multiplication expression type.
2731 using MVM = DMatDVecMultExpr<MT,VT>;
// RES: result type of the multiplication expression MVM.
2732 using RES = ResultType_t<MVM>;
// MRT / VRT: result types of the matrix operand type MT and the vector operand type VT.
2733 using MRT = ResultType_t<MT>;
2734 using VRT = ResultType_t<VT>;
// MET / VET: element types of the matrix and vector result types.
2735 using MET = ElementType_t<MRT>;
2736 using VET = ElementType_t<VRT>;
// MCT / VCT: composite types of the matrix operand type MT and the vector operand type VT.
2737 using MCT = CompositeType_t<MT>;
2738 using VCT = CompositeType_t<VT>;
// Compile-time switch for the left-hand side matrix operand.
// It is true when MT is a computation whose element type is identical to the
// vector element type and is BLAS compatible, or when MT requires an evaluation.
// The switch selects between `const MRT` and `MCT` for the LT alias of this class.
2743 static constexpr
bool evaluateMatrix =
2744 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
2745 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time switch for the right-hand side vector operand.
// It is true when the vector operand type VT is a computation or requires an
// evaluation. Both tests inspect VT (not the matrix type MT), which keeps this
// switch in agreement with the evaluateVector switch of the unscaled
// DMatDVecMultExpr class. The switch selects between `const VRT` and `VCT` for
// the RT alias of this class.
2750 static constexpr
bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
// Helper variable template for the selection of the SMP assignment overloads.
// The value is true when at least one of the two operands is flagged for
// evaluation (evaluateMatrix or evaluateVector). The template parameter T1 does
// not appear in the expression, so the value is the same for every T1.
2758 template<
typename T1 >
2759 static constexpr
bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
2766 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2767 static constexpr
bool UseBlasKernel_v =
2769 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
2770 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
2771 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
2772 !IsDiagonal_v<T2> &&
2773 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2774 IsBLASCompatible_v< ElementType_t<T1> > &&
2775 IsBLASCompatible_v< ElementType_t<T2> > &&
2776 IsBLASCompatible_v< ElementType_t<T3> > &&
2777 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
2778 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
2779 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
2787 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2788 static constexpr
bool UseVectorizedDefaultKernel_v =
2789 ( useOptimizedKernels &&
2790 !IsDiagonal_v<T2> &&
2791 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2792 IsSIMDCombinable_v< ElementType_t<T1>
2796 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
2797 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
2802 using This = DVecScalarMultExpr<MVM,ST,false>;
2803 using BaseType = DenseVector<This,false>;
2807 using SIMDType = SIMDTrait_t<ElementType>;
2812 using LeftOperand =
const DMatDVecMultExpr<MT,VT>;
2818 using LT = If_t< evaluateMatrix, const MRT, MCT >;
2821 using RT = If_t< evaluateVector, const VRT, VCT >;
2827 ( !IsDiagonal_v<MT> &&
2828 MT::simdEnabled && VT::simdEnabled &&
2829 IsSIMDCombinable_v<MET,VET,ST> &&
2830 HasSIMDAdd_v<MET,VET> &&
2831 HasSIMDMult_v<MET,VET> );
2875 if( index >=
vector_.size() ) {
2878 return (*
this)[index];
2887 inline size_t size()
const {
// Reports whether the expression can alias with the given address.
// \param alias The address to check.
// \return Whatever vector_.canAlias( alias ) returns; the query is forwarded as is.
// NOTE(review): vector_ is presumably the wrapped matrix/vector multiplication
// operand of this scaled expression -- its declaration is not visible here.
2918 template<
typename T >
2919 inline bool canAlias(
const T* alias )
const {
2920 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the given address.
// \param alias The address to check.
// \return Whatever vector_.isAliased( alias ) returns; the query is forwarded as is.
2930 template<
typename T >
2931 inline bool isAliased(
const T* alias )
const {
2932 return vector_.isAliased( alias );
2952 LeftOperand_t<MVM> A(
vector_.leftOperand() );
2956 ( IsComputation_v<MT> && !evaluateMatrix ) ||
2957 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2958 (
size() > SMP_DMATDVECMULT_THRESHOLD );
2980 template<
typename VT1 >
2981 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2987 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
2988 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
2990 if( left.rows() == 0UL ) {
2993 else if( left.columns() == 0UL ) {
3006 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for the assignment of a scaled matrix/vector product (y, A, x, scalar).
// The small kernel is called when at least one of the following holds:
//  - the matrix type MT1 is diagonal,
//  - the original matrix operand MT is a computation that is not evaluated,
//  - the element count A.rows() * A.columns() is below DMATDVECMULT_THRESHOLD.
// NOTE(review): the line between the two calls is not visible in this excerpt;
// presumably selectBlasAssignKernel() is the alternative branch -- confirm.
3021 template<
typename VT1
3025 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3027 if( ( IsDiagonal_v<MT1> ) ||
3028 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3029 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3030 selectSmallAssignKernel( y, A, x, scalar );
3032 selectBlasAssignKernel( y, A, x, scalar );
// Default assignment kernel for the scaled matrix/vector product.
// Enabled only when UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false
// (DisableIf_t). It builds the expression A * x * scalar and hands it to the
// assign() member function of the target vector y.
3050 template<
typename VT1
3054 static inline auto selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3055 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3057 y.assign( A * x * scalar );
3075 template<
typename VT1
3079 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3080 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3082 selectDefaultAssignKernel( y, A, x, scalar );
3100 template<
typename VT1
3104 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3105 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3107 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3109 const size_t M( A.rows() );
3110 const size_t N( A.columns() );
3114 for( ; (i+8UL) <= M; i+=8UL )
3116 const size_t jbegin( ( IsUpper_v<MT1> )
3117 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3119 const size_t jend( ( IsLower_v<MT1> )
3120 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3124 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3127 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3131 const SIMDType x1( x.load(j) );
3132 xmm1 += A.load(i ,j) * x1;
3133 xmm2 += A.load(i+1UL,j) * x1;
3134 xmm3 += A.load(i+2UL,j) * x1;
3135 xmm4 += A.load(i+3UL,j) * x1;
3136 xmm5 += A.load(i+4UL,j) * x1;
3137 xmm6 += A.load(i+5UL,j) * x1;
3138 xmm7 += A.load(i+6UL,j) * x1;
3139 xmm8 += A.load(i+7UL,j) * x1;
3142 y[i ] =
sum( xmm1 ) * scalar;
3143 y[i+1UL] =
sum( xmm2 ) * scalar;
3144 y[i+2UL] =
sum( xmm3 ) * scalar;
3145 y[i+3UL] =
sum( xmm4 ) * scalar;
3146 y[i+4UL] =
sum( xmm5 ) * scalar;
3147 y[i+5UL] =
sum( xmm6 ) * scalar;
3148 y[i+6UL] =
sum( xmm7 ) * scalar;
3149 y[i+7UL] =
sum( xmm8 ) * scalar;
3151 for( ; remainder && j<jend; ++j ) {
3152 y[i ] += A(i ,j) * x[j] * scalar;
3153 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3154 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3155 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3156 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3157 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3158 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3159 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3163 for( ; (i+4UL) <= M; i+=4UL )
3165 const size_t jbegin( ( IsUpper_v<MT1> )
3166 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3168 const size_t jend( ( IsLower_v<MT1> )
3169 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3173 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3176 SIMDType xmm1, xmm2, xmm3, xmm4;
3180 const SIMDType x1( x.load(j) );
3181 xmm1 += A.load(i ,j) * x1;
3182 xmm2 += A.load(i+1UL,j) * x1;
3183 xmm3 += A.load(i+2UL,j) * x1;
3184 xmm4 += A.load(i+3UL,j) * x1;
3187 y[i ] =
sum( xmm1 ) * scalar;
3188 y[i+1UL] =
sum( xmm2 ) * scalar;
3189 y[i+2UL] =
sum( xmm3 ) * scalar;
3190 y[i+3UL] =
sum( xmm4 ) * scalar;
3192 for( ; remainder && j<jend; ++j ) {
3193 y[i ] += A(i ,j) * x[j] * scalar;
3194 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3195 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3196 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3200 for( ; (i+3UL) <= M; i+=3UL )
3202 const size_t jbegin( ( IsUpper_v<MT1> )
3203 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3205 const size_t jend( ( IsLower_v<MT1> )
3206 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
3210 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3213 SIMDType xmm1, xmm2, xmm3;
3217 const SIMDType x1( x.load(j) );
3218 xmm1 += A.load(i ,j) * x1;
3219 xmm2 += A.load(i+1UL,j) * x1;
3220 xmm3 += A.load(i+2UL,j) * x1;
3223 y[i ] =
sum( xmm1 ) * scalar;
3224 y[i+1UL] =
sum( xmm2 ) * scalar;
3225 y[i+2UL] =
sum( xmm3 ) * scalar;
3227 for( ; remainder && j<jend; ++j ) {
3228 y[i ] += A(i ,j) * x[j] * scalar;
3229 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3230 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3234 for( ; (i+2UL) <= M; i+=2UL )
3236 const size_t jbegin( ( IsUpper_v<MT1> )
3237 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3239 const size_t jend( ( IsLower_v<MT1> )
3240 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3244 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3247 SIMDType xmm1, xmm2;
3251 const SIMDType x1( x.load(j) );
3252 xmm1 += A.load(i ,j) * x1;
3253 xmm2 += A.load(i+1UL,j) * x1;
3256 y[i ] =
sum( xmm1 ) * scalar;
3257 y[i+1UL] =
sum( xmm2 ) * scalar;
3259 for( ; remainder && j<jend; ++j ) {
3260 y[i ] += A(i ,j) * x[j] * scalar;
3261 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3267 const size_t jbegin( ( IsUpper_v<MT1> )
3268 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3270 const size_t jend( ( IsLower_v<MT1> )
3271 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3275 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3282 xmm1 += A.load(i,j) * x.load(j);
3285 y[i] =
sum( xmm1 ) * scalar;
3287 for( ; remainder && j<jend; ++j ) {
3288 y[i] += A(i,j) * x[j] * scalar;
3308 template<
typename VT1
3312 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3313 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3315 selectDefaultAssignKernel( y, A, x, scalar );
3333 template<
typename VT1
3337 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3338 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3340 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3342 const size_t M( A.rows() );
3343 const size_t N( A.columns() );
3349 for( ; (i+8UL) <= M; i+=8UL )
3351 const size_t jbegin( ( IsUpper_v<MT1> )
3352 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3354 const size_t jend( ( IsLower_v<MT1> )
3355 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3359 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3368 const SIMDType x1( x.load(j ) );
3369 const SIMDType x2( x.load(j1) );
3370 const SIMDType x3( x.load(j2) );
3371 const SIMDType x4( x.load(j3) );
3372 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3373 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3374 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3375 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3376 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3377 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3378 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3379 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
3384 const SIMDType x1( x.load(j ) );
3385 const SIMDType x2( x.load(j1) );
3386 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3387 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3388 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3389 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3390 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3391 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3392 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3393 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
3397 const SIMDType x1( x.load(j) );
3398 y[i ] +=
sum( A.load(i ,j) * x1 );
3399 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3400 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3401 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3402 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3403 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3404 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3405 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
3408 for( ; remainder && j<jend; ++j ) {
3409 y[i ] += A(i ,j) * x[j];
3410 y[i+1UL] += A(i+1UL,j) * x[j];
3411 y[i+2UL] += A(i+2UL,j) * x[j];
3412 y[i+3UL] += A(i+3UL,j) * x[j];
3413 y[i+4UL] += A(i+4UL,j) * x[j];
3414 y[i+5UL] += A(i+5UL,j) * x[j];
3415 y[i+6UL] += A(i+6UL,j) * x[j];
3416 y[i+7UL] += A(i+7UL,j) * x[j];
3429 for( ; (i+4UL) <= M; i+=4UL )
3431 const size_t jbegin( ( IsUpper_v<MT1> )
3432 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3434 const size_t jend( ( IsLower_v<MT1> )
3435 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3439 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3448 const SIMDType x1( x.load(j ) );
3449 const SIMDType x2( x.load(j1) );
3450 const SIMDType x3( x.load(j2) );
3451 const SIMDType x4( x.load(j3) );
3452 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3453 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3454 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3455 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3460 const SIMDType x1( x.load(j ) );
3461 const SIMDType x2( x.load(j1) );
3462 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3463 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3464 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3465 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3469 const SIMDType x1( x.load(j) );
3470 y[i ] +=
sum( A.load(i ,j) * x1 );
3471 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3472 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3473 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3476 for( ; remainder && j<jend; ++j ) {
3477 y[i ] += A(i ,j) * x[j];
3478 y[i+1UL] += A(i+1UL,j) * x[j];
3479 y[i+2UL] += A(i+2UL,j) * x[j];
3480 y[i+3UL] += A(i+3UL,j) * x[j];
3489 for( ; (i+2UL) <= M; i+=2UL )
3491 const size_t jbegin( ( IsUpper_v<MT1> )
3492 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3494 const size_t jend( ( IsLower_v<MT1> )
3495 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3499 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3508 const SIMDType x1( x.load(j ) );
3509 const SIMDType x2( x.load(j1) );
3510 const SIMDType x3( x.load(j2) );
3511 const SIMDType x4( x.load(j3) );
3512 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3513 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3518 const SIMDType x1( x.load(j ) );
3519 const SIMDType x2( x.load(j1) );
3520 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3521 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3525 const SIMDType x1( x.load(j) );
3526 y[i ] +=
sum( A.load(i ,j) * x1 );
3527 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3530 for( ; remainder && j<jend; ++j ) {
3531 y[i ] += A(i ,j) * x[j];
3532 y[i+1UL] += A(i+1UL,j) * x[j];
3541 const size_t jbegin( ( IsUpper_v<MT1> )
3542 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3544 const size_t jend( ( IsLower_v<MT1> )
3545 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3549 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3558 const SIMDType x1( x.load(j ) );
3559 const SIMDType x2( x.load(j1) );
3560 const SIMDType x3( x.load(j2) );
3561 const SIMDType x4( x.load(j3) );
3562 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3567 const SIMDType x1( x.load(j ) );
3568 const SIMDType x2( x.load(j1) );
3569 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3573 const SIMDType x1( x.load(j) );
3574 y[i] +=
sum( A.load(i,j) * x1 );
3577 for( ; remainder && j<jend; ++j ) {
3578 y[i] += A(i,j) * x[j];
// Fallback overload of the BLAS-based assignment kernel for the scaled product.
// Enabled only when UseBlasKernel_v<VT1,MT1,VT2,ST2> is false (DisableIf_t); the
// call is then forwarded unchanged to selectLargeAssignKernel().
3600 template<
typename VT1
3604 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3605 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
3607 selectLargeAssignKernel( y, A, x, scalar );
// BLAS-based assignment kernel for the scaled matrix/vector product.
// Compiled only when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION evaluate to true, and selected only
// when UseBlasKernel_v<VT1,MT1,VT2,ST2> is true (EnableIf_t).
3612 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 3626 template<
typename VT1
3630 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3631 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
// Element type of the target vector; the scaling factor is converted to it for gemv().
3633 using ET = ElementType_t<VT1>;
// Triangular matrix: the scaled vector (scalar * x) is first assigned to y, then y
// is handed to trmv() together with A and the lower/upper flag.
3635 if( IsTriangular_v<MT1> ) {
3636 assign( y, scalar * x );
3637 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// NOTE(review): the lines separating the two branches are not visible in this excerpt.
// gemv() receives ET(scalar) and ET(0); presumably this computes
// y = scalar*(A*x) + 0*y -- confirm against the gemv() wrapper.
3640 gemv( y, A, x,
ET(scalar),
ET(0) );
3658 template<
typename VT1 >
3659 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3670 assign( ~lhs, tmp );
3686 template<
typename VT1 >
3687 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3693 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3694 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
3696 if( left.rows() == 0UL || left.columns() == 0UL ) {
3708 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for the addition assignment of a scaled matrix/vector product.
// The small kernel is called when at least one of the following holds:
//  - the matrix type MT1 is diagonal,
//  - the original matrix operand MT is a computation that is not evaluated,
//  - the element count A.rows() * A.columns() is below DMATDVECMULT_THRESHOLD.
// NOTE(review): the line between the two calls is not visible in this excerpt;
// presumably selectBlasAddAssignKernel() is the alternative branch -- confirm.
3723 template<
typename VT1
3727 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3729 if( ( IsDiagonal_v<MT1> ) ||
3730 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3731 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3732 selectSmallAddAssignKernel( y, A, x, scalar );
3734 selectBlasAddAssignKernel( y, A, x, scalar );
// Default addition assignment kernel for the scaled matrix/vector product.
// It builds the expression A * x * scalar and hands it to the addAssign() member
// function of the target vector y. Unlike the default assignment kernel, the
// visible signature carries no enable/disable condition.
3752 template<
typename VT1
3756 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3758 y.addAssign( A * x * scalar );
3776 template<
typename VT1
3780 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3781 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3783 selectDefaultAddAssignKernel( y, A, x, scalar );
3801 template<
typename VT1
3805 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3806 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3808 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3810 const size_t M( A.rows() );
3811 const size_t N( A.columns() );
3815 for( ; (i+8UL) <= M; i+=8UL )
3817 const size_t jbegin( ( IsUpper_v<MT1> )
3818 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3820 const size_t jend( ( IsLower_v<MT1> )
3821 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
3825 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3828 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3832 const SIMDType x1( x.load(j) );
3833 xmm1 += A.load(i ,j) * x1;
3834 xmm2 += A.load(i+1UL,j) * x1;
3835 xmm3 += A.load(i+2UL,j) * x1;
3836 xmm4 += A.load(i+3UL,j) * x1;
3837 xmm5 += A.load(i+4UL,j) * x1;
3838 xmm6 += A.load(i+5UL,j) * x1;
3839 xmm7 += A.load(i+6UL,j) * x1;
3840 xmm8 += A.load(i+7UL,j) * x1;
3843 y[i ] +=
sum( xmm1 ) * scalar;
3844 y[i+1UL] +=
sum( xmm2 ) * scalar;
3845 y[i+2UL] +=
sum( xmm3 ) * scalar;
3846 y[i+3UL] +=
sum( xmm4 ) * scalar;
3847 y[i+4UL] +=
sum( xmm5 ) * scalar;
3848 y[i+5UL] +=
sum( xmm6 ) * scalar;
3849 y[i+6UL] +=
sum( xmm7 ) * scalar;
3850 y[i+7UL] +=
sum( xmm8 ) * scalar;
3852 for( ; remainder && j<jend; ++j ) {
3853 y[i ] += A(i ,j) * x[j] * scalar;
3854 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3855 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3856 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3857 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3858 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3859 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3860 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
3864 for( ; (i+4UL) <= M; i+=4UL )
3866 const size_t jbegin( ( IsUpper_v<MT1> )
3867 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3869 const size_t jend( ( IsLower_v<MT1> )
3870 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3874 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3877 SIMDType xmm1, xmm2, xmm3, xmm4;
3881 const SIMDType x1( x.load(j) );
3882 xmm1 += A.load(i ,j) * x1;
3883 xmm2 += A.load(i+1UL,j) * x1;
3884 xmm3 += A.load(i+2UL,j) * x1;
3885 xmm4 += A.load(i+3UL,j) * x1;
3888 y[i ] +=
sum( xmm1 ) * scalar;
3889 y[i+1UL] +=
sum( xmm2 ) * scalar;
3890 y[i+2UL] +=
sum( xmm3 ) * scalar;
3891 y[i+3UL] +=
sum( xmm4 ) * scalar;
3893 for( ; remainder && j<jend; ++j ) {
3894 y[i ] += A(i ,j) * x[j] * scalar;
3895 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3896 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3897 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3901 for( ; (i+3UL) <= M; i+=3UL )
3903 const size_t jbegin( ( IsUpper_v<MT1> )
3904 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3906 const size_t jend( ( IsLower_v<MT1> )
3907 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
3911 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3914 SIMDType xmm1, xmm2, xmm3;
3918 const SIMDType x1( x.load(j) );
3919 xmm1 += A.load(i ,j) * x1;
3920 xmm2 += A.load(i+1UL,j) * x1;
3921 xmm3 += A.load(i+2UL,j) * x1;
3924 y[i ] +=
sum( xmm1 ) * scalar;
3925 y[i+1UL] +=
sum( xmm2 ) * scalar;
3926 y[i+2UL] +=
sum( xmm3 ) * scalar;
3928 for( ; remainder && j<jend; ++j ) {
3929 y[i ] += A(i ,j) * x[j] * scalar;
3930 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3931 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3935 for( ; (i+2UL) <= M; i+=2UL )
3937 const size_t jbegin( ( IsUpper_v<MT1> )
3938 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3940 const size_t jend( ( IsLower_v<MT1> )
3941 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3945 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3948 SIMDType xmm1, xmm2;
3952 const SIMDType x1( x.load(j) );
3953 xmm1 += A.load(i ,j) * x1;
3954 xmm2 += A.load(i+1UL,j) * x1;
3957 y[i ] +=
sum( xmm1 ) * scalar;
3958 y[i+1UL] +=
sum( xmm2 ) * scalar;
3960 for( ; remainder && j<jend; ++j ) {
3961 y[i ] += A(i ,j) * x[j] * scalar;
3962 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3968 const size_t jbegin( ( IsUpper_v<MT1> )
3969 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
3971 const size_t jend( ( IsLower_v<MT1> )
3972 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3976 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
3983 xmm1 += A.load(i,j) * x.load(j);
3986 y[i] +=
sum( xmm1 ) * scalar;
3988 for( ; remainder && j<jend; ++j ) {
3989 y[i] += A(i,j) * x[j] * scalar;
4009 template<
typename VT1
4013 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4014 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4016 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized overload of the large add-assign kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. For every row i it adds
// sum( A(i,:) * x ) * scalar to y[i], using SIMD loads of A and x. Rows are handled in
// blocks of 8, 4, 2 and finally 1; within a row block the columns are consumed 4, 2 and 1
// SIMD vectors at a time.
// NOTE(review): the inner column-loop headers, the definitions of i, j, j1, j2, j3 and the
// braces are on lines that this listing does not show.
4034 template<
typename VT1
4038 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4039 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// 'remainder' is true when the matrix or the vector operand is not padded; it enables the
// scalar tail loops and the rounding of 'jpos' below.
4041 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4043 const size_t M( A.rows() );
4044 const size_t N( A.columns() );
// ---- Blocks of 8 rows ----
4048 for( ; (i+8UL) <= M; i+=8UL )
// First column to process: for upper matrices the first column of row i that can be
// non-zero, with the low bits cleared by '& size_t(-SIMDSIZE)' (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two). The alternative branch is not shown here.
4050 const size_t jbegin( ( IsUpper_v<MT1> )
4051 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
// One-past-last column to process: for lower matrices it is bounded by the last row of the
// block (i+8, or i+7 for strictly lower matrices). The alternative branch is not shown here.
4053 const size_t jend( ( IsLower_v<MT1> )
4054 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// End of the SIMD-processed column range: 'jend' rounded down when a scalar remainder loop
// is needed, otherwise 'jend' itself.
4058 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
// Four SIMD vectors of x per step, applied to all 8 rows.
4067 const SIMDType x1( x.load(j ) );
4068 const SIMDType x2( x.load(j1) );
4069 const SIMDType x3( x.load(j2) );
4070 const SIMDType x4( x.load(j3) );
4071 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4072 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4073 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4074 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4075 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4076 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4077 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4078 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD vectors of x per step.
4083 const SIMDType x1( x.load(j ) );
4084 const SIMDType x2( x.load(j1) );
4085 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4086 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4087 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4088 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4089 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4090 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4091 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4092 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD vector of x per step.
4096 const SIMDType x1( x.load(j) );
4097 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4098 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4099 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4100 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4101 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4102 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4103 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4104 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar tail for the columns up to 'jend'; only active when 'remainder' is true.
4107 for( ; remainder && j<jend; ++j ) {
4108 y[i ] += A(i ,j) * x[j] * scalar;
4109 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4110 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4111 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4112 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4113 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4114 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4115 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// ---- Blocks of 4 rows (same scheme; lower bound uses i+3 / i+4) ----
4119 for( ; (i+4UL) <= M; i+=4UL )
4121 const size_t jbegin( ( IsUpper_v<MT1> )
4122 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4124 const size_t jend( ( IsLower_v<MT1> )
4125 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4129 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4138 const SIMDType x1( x.load(j ) );
4139 const SIMDType x2( x.load(j1) );
4140 const SIMDType x3( x.load(j2) );
4141 const SIMDType x4( x.load(j3) );
4142 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4143 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4144 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4145 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4150 const SIMDType x1( x.load(j ) );
4151 const SIMDType x2( x.load(j1) );
4152 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4153 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4154 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4155 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4159 const SIMDType x1( x.load(j) );
4160 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4161 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4162 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4163 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4166 for( ; remainder && j<jend; ++j ) {
4167 y[i ] += A(i ,j) * x[j] * scalar;
4168 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4169 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4170 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// ---- Blocks of 2 rows (lower bound uses i+1 / i+2) ----
4174 for( ; (i+2UL) <= M; i+=2UL )
4176 const size_t jbegin( ( IsUpper_v<MT1> )
4177 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4179 const size_t jend( ( IsLower_v<MT1> )
4180 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4184 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4193 const SIMDType x1( x.load(j ) );
4194 const SIMDType x2( x.load(j1) );
4195 const SIMDType x3( x.load(j2) );
4196 const SIMDType x4( x.load(j3) );
4197 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4198 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4203 const SIMDType x1( x.load(j ) );
4204 const SIMDType x2( x.load(j1) );
4205 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4206 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4210 const SIMDType x1( x.load(j) );
4211 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4212 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4215 for( ; remainder && j<jend; ++j ) {
4216 y[i ] += A(i ,j) * x[j] * scalar;
4217 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// ---- Single-row section (lower bound uses i / i+1) ----
// NOTE(review): the enclosing row loop/condition for this section is not visible in the listing.
4223 const size_t jbegin( ( IsUpper_v<MT1> )
4224 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4226 const size_t jend( ( IsLower_v<MT1> )
4227 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4231 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4240 const SIMDType x1( x.load(j ) );
4241 const SIMDType x2( x.load(j1) );
4242 const SIMDType x3( x.load(j2) );
4243 const SIMDType x4( x.load(j3) );
4244 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4249 const SIMDType x1( x.load(j ) );
4250 const SIMDType x2( x.load(j1) );
4251 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4255 const SIMDType x1( x.load(j) );
4256 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4259 for( ; remainder && j<jend; ++j ) {
4260 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of the BLAS add-assign kernel. It is selected (DisableIf_t) when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is false and forwards all arguments to
// selectLargeAddAssignKernel().
4280 template<
typename VT1
4284 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4285 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4287 selectLargeAddAssignKernel( y, A, x, scalar );
// BLAS-based add-assign kernel, enabled when UseBlasKernel_v<VT1,MT1,VT2,ST2> is true and
// compiled only when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
// NOTE(review): in this listing the '#if' directive and the following 'template<' share one
// line; the matching '#endif', the 'else' and the braces are not visible.
4292 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4306 template<
typename VT1
4310 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4311 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4313 using ET = ElementType_t<VT1>;
// Triangular matrices: scale x into a temporary (evaluated via serial()), multiply it with A
// through trmv() using the lower/upper flag, then add the temporary to y.
4315 if( IsTriangular_v<MT1> ) {
4316 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4317 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4318 addAssign( y, tmp );
// Otherwise: gemv() is called with the factors ET(scalar) and ET(1)
// (presumably y = scalar*A*x + 1*y; verify against the gemv wrapper's signature).
4321 gemv( y, A, x,
ET(scalar),
ET(1) );
// Subtraction assignment of a scaled matrix-vector product (DVecScalarMultExpr 'rhs') to the
// dense vector 'lhs'. The operands of the wrapped multiplication are read from rhs.vector_,
// an early-out check is made when the matrix has no rows or no columns, and the work is then
// handed to selectSubAssignKernel() together with rhs.scalar_.
// NOTE(review): the body of the early-out branch and the definitions of 'A' and 'x' are on
// lines that this listing does not show.
4343 template<
typename VT1 >
4344 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4350 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4351 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
4353 if( left.rows() == 0UL || left.columns() == 0UL ) {
4365 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Chooses the kernel for y -= A * x * scalar.
// The small kernel is used when MT1 is diagonal, when the matrix operand MT is a computation
// that is not evaluated up front (!evaluateMatrix), or when rows*columns of A is below
// DMATDVECMULT_THRESHOLD; the BLAS kernel selection is called as the alternative.
// NOTE(review): the 'else' between the two calls is on a line not shown in this listing.
4380 template<
typename VT1
4384 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4386 if( ( IsDiagonal_v<MT1> ) ||
4387 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4388 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4389 selectSmallSubAssignKernel( y, A, x, scalar );
4391 selectBlasSubAssignKernel( y, A, x, scalar );
// Default (non-vectorized) subtraction kernel: builds the expression A * x * scalar and
// passes it to y.subAssign().
4409 template<
typename VT1
4413 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4415 y.subAssign( A * x * scalar );
// Fallback overload of the small sub-assign kernel. It is selected (DisableIf_t) when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false and forwards all arguments to
// selectDefaultSubAssignKernel().
4433 template<
typename VT1
4437 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4438 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4440 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of the small sub-assign kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. For every row i it subtracts
// sum( A(i,:) * x ) * scalar from y[i]. Rows are handled in blocks of 8, 4, 3, 2 and finally
// 1; each row of a block has its own SIMD accumulator (xmm1..xmm8) that is reduced with sum()
// after the column loop.
// NOTE(review): the inner column-loop headers, the definitions of i and j and the braces are
// on lines that this listing does not show.
4458 template<
typename VT1
4462 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4463 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// 'remainder' is true when the matrix or the vector operand is not padded; it enables the
// scalar tail loops and the rounding of 'jpos' below.
4465 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4467 const size_t M( A.rows() );
4468 const size_t N( A.columns() );
// ---- Blocks of 8 rows ----
4472 for( ; (i+8UL) <= M; i+=8UL )
// First column to process: for upper matrices the first potentially non-zero column of row
// i with the low bits cleared by '& size_t(-SIMDSIZE)' (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two). The alternative branch is not shown here.
4474 const size_t jbegin( ( IsUpper_v<MT1> )
4475 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
// One-past-last column: for lower matrices bounded by the last row of the block
// (i+8, or i+7 for strictly lower matrices). The alternative branch is not shown here.
4477 const size_t jend( ( IsLower_v<MT1> )
4478 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// End of the SIMD-processed column range: 'jend' rounded down when a scalar remainder loop
// is needed, otherwise 'jend' itself.
4482 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4485 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// One SIMD vector of x is loaded once and multiplied into all 8 row accumulators.
4489 const SIMDType x1( x.load(j) );
4490 xmm1 += A.load(i ,j) * x1;
4491 xmm2 += A.load(i+1UL,j) * x1;
4492 xmm3 += A.load(i+2UL,j) * x1;
4493 xmm4 += A.load(i+3UL,j) * x1;
4494 xmm5 += A.load(i+4UL,j) * x1;
4495 xmm6 += A.load(i+5UL,j) * x1;
4496 xmm7 += A.load(i+6UL,j) * x1;
4497 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, scaled and subtracted from y.
4500 y[i ] -=
sum( xmm1 ) * scalar;
4501 y[i+1UL] -=
sum( xmm2 ) * scalar;
4502 y[i+2UL] -=
sum( xmm3 ) * scalar;
4503 y[i+3UL] -=
sum( xmm4 ) * scalar;
4504 y[i+4UL] -=
sum( xmm5 ) * scalar;
4505 y[i+5UL] -=
sum( xmm6 ) * scalar;
4506 y[i+6UL] -=
sum( xmm7 ) * scalar;
4507 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Scalar tail for the columns up to 'jend'; only active when 'remainder' is true.
4509 for( ; remainder && j<jend; ++j ) {
4510 y[i ] -= A(i ,j) * x[j] * scalar;
4511 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4512 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4513 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4514 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4515 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4516 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4517 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// ---- Blocks of 4 rows (lower bound uses i+3 / i+4) ----
4521 for( ; (i+4UL) <= M; i+=4UL )
4523 const size_t jbegin( ( IsUpper_v<MT1> )
4524 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4526 const size_t jend( ( IsLower_v<MT1> )
4527 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4531 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4534 SIMDType xmm1, xmm2, xmm3, xmm4;
4538 const SIMDType x1( x.load(j) );
4539 xmm1 += A.load(i ,j) * x1;
4540 xmm2 += A.load(i+1UL,j) * x1;
4541 xmm3 += A.load(i+2UL,j) * x1;
4542 xmm4 += A.load(i+3UL,j) * x1;
4545 y[i ] -=
sum( xmm1 ) * scalar;
4546 y[i+1UL] -=
sum( xmm2 ) * scalar;
4547 y[i+2UL] -=
sum( xmm3 ) * scalar;
4548 y[i+3UL] -=
sum( xmm4 ) * scalar;
4550 for( ; remainder && j<jend; ++j ) {
4551 y[i ] -= A(i ,j) * x[j] * scalar;
4552 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4553 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4554 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// ---- Blocks of 3 rows (lower bound uses i+2 / i+3) ----
4558 for( ; (i+3UL) <= M; i+=3UL )
4560 const size_t jbegin( ( IsUpper_v<MT1> )
4561 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4563 const size_t jend( ( IsLower_v<MT1> )
4564 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
4568 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4571 SIMDType xmm1, xmm2, xmm3;
4575 const SIMDType x1( x.load(j) );
4576 xmm1 += A.load(i ,j) * x1;
4577 xmm2 += A.load(i+1UL,j) * x1;
4578 xmm3 += A.load(i+2UL,j) * x1;
4581 y[i ] -=
sum( xmm1 ) * scalar;
4582 y[i+1UL] -=
sum( xmm2 ) * scalar;
4583 y[i+2UL] -=
sum( xmm3 ) * scalar;
4585 for( ; remainder && j<jend; ++j ) {
4586 y[i ] -= A(i ,j) * x[j] * scalar;
4587 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4588 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
// ---- Blocks of 2 rows (lower bound uses i+1 / i+2) ----
4592 for( ; (i+2UL) <= M; i+=2UL )
4594 const size_t jbegin( ( IsUpper_v<MT1> )
4595 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4597 const size_t jend( ( IsLower_v<MT1> )
4598 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4602 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4605 SIMDType xmm1, xmm2;
4609 const SIMDType x1( x.load(j) );
4610 xmm1 += A.load(i ,j) * x1;
4611 xmm2 += A.load(i+1UL,j) * x1;
4614 y[i ] -=
sum( xmm1 ) * scalar;
4615 y[i+1UL] -=
sum( xmm2 ) * scalar;
4617 for( ; remainder && j<jend; ++j ) {
4618 y[i ] -= A(i ,j) * x[j] * scalar;
4619 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// ---- Single-row section (lower bound uses i / i+1) ----
// NOTE(review): the enclosing row condition and the declaration of 'xmm1' for this section
// are not visible in the listing.
4625 const size_t jbegin( ( IsUpper_v<MT1> )
4626 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4628 const size_t jend( ( IsLower_v<MT1> )
4629 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4633 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4640 xmm1 += A.load(i,j) * x.load(j);
4643 y[i] -=
sum( xmm1 ) * scalar;
4645 for( ; remainder && j<jend; ++j ) {
4646 y[i] -= A(i,j) * x[j] * scalar;
// Fallback overload of the large sub-assign kernel. It is selected (DisableIf_t) when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false and forwards all arguments to
// selectDefaultSubAssignKernel().
4666 template<
typename VT1
4670 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4671 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4673 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of the large sub-assign kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true. For every row i it subtracts
// sum( A(i,:) * x ) * scalar from y[i], using SIMD loads of A and x. Rows are handled in
// blocks of 8, 4, 2 and finally 1; within a row block the columns are consumed 4, 2 and 1
// SIMD vectors at a time.
// NOTE(review): the inner column-loop headers, the definitions of i, j, j1, j2, j3 and the
// braces are on lines that this listing does not show.
4691 template<
typename VT1
4695 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4696 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// 'remainder' is true when the matrix or the vector operand is not padded; it enables the
// scalar tail loops and the rounding of 'jpos' below.
4698 constexpr
bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4700 const size_t M( A.rows() );
4701 const size_t N( A.columns() );
// ---- Blocks of 8 rows ----
4705 for( ; (i+8UL) <= M; i+=8UL )
// First column to process: for upper matrices the first potentially non-zero column of row
// i with the low bits cleared by '& size_t(-SIMDSIZE)' (rounds down to a multiple of
// SIMDSIZE, assuming SIMDSIZE is a power of two). The alternative branch is not shown here.
4707 const size_t jbegin( ( IsUpper_v<MT1> )
4708 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
// One-past-last column: for lower matrices bounded by the last row of the block
// (i+8, or i+7 for strictly lower matrices). The alternative branch is not shown here.
4710 const size_t jend( ( IsLower_v<MT1> )
4711 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// End of the SIMD-processed column range: 'jend' rounded down when a scalar remainder loop
// is needed, otherwise 'jend' itself.
4715 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
// Four SIMD vectors of x per step, applied to all 8 rows.
4724 const SIMDType x1( x.load(j ) );
4725 const SIMDType x2( x.load(j1) );
4726 const SIMDType x3( x.load(j2) );
4727 const SIMDType x4( x.load(j3) );
4728 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4729 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4730 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4731 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4732 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4733 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4734 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4735 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD vectors of x per step.
4740 const SIMDType x1( x.load(j ) );
4741 const SIMDType x2( x.load(j1) );
4742 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4743 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4744 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4745 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4746 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4747 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4748 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4749 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD vector of x per step.
4753 const SIMDType x1( x.load(j) );
4754 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4755 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4756 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4757 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4758 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4759 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4760 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4761 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar tail for the columns up to 'jend'; only active when 'remainder' is true.
4764 for( ; remainder && j<jend; ++j ) {
4765 y[i ] -= A(i ,j) * x[j] * scalar;
4766 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4767 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4768 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4769 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4770 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4771 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4772 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// ---- Blocks of 4 rows (lower bound uses i+3 / i+4) ----
4776 for( ; (i+4UL) <= M; i+=4UL )
4778 const size_t jbegin( ( IsUpper_v<MT1> )
4779 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4781 const size_t jend( ( IsLower_v<MT1> )
4782 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4786 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4795 const SIMDType x1( x.load(j ) );
4796 const SIMDType x2( x.load(j1) );
4797 const SIMDType x3( x.load(j2) );
4798 const SIMDType x4( x.load(j3) );
4799 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4800 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4801 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4802 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4807 const SIMDType x1( x.load(j ) );
4808 const SIMDType x2( x.load(j1) );
4809 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4810 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4811 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4812 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4816 const SIMDType x1( x.load(j) );
4817 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4818 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4819 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4820 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4823 for( ; remainder && j<jend; ++j ) {
4824 y[i ] -= A(i ,j) * x[j] * scalar;
4825 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4826 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4827 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// ---- Blocks of 2 rows (lower bound uses i+1 / i+2) ----
4831 for( ; (i+2UL) <= M; i+=2UL )
4833 const size_t jbegin( ( IsUpper_v<MT1> )
4834 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4836 const size_t jend( ( IsLower_v<MT1> )
4837 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4841 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4850 const SIMDType x1( x.load(j ) );
4851 const SIMDType x2( x.load(j1) );
4852 const SIMDType x3( x.load(j2) );
4853 const SIMDType x4( x.load(j3) );
4854 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4855 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4860 const SIMDType x1( x.load(j ) );
4861 const SIMDType x2( x.load(j1) );
4862 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4863 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4867 const SIMDType x1( x.load(j) );
4868 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4869 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4872 for( ; remainder && j<jend; ++j ) {
4873 y[i ] -= A(i ,j) * x[j] * scalar;
4874 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// ---- Single-row section (lower bound uses i / i+1) ----
// NOTE(review): the enclosing row loop/condition for this section is not visible in the listing.
4880 const size_t jbegin( ( IsUpper_v<MT1> )
4881 ?( ( IsStrictlyUpper_v<MT1> ? i+1UL : i ) &
size_t(-
SIMDSIZE) )
4883 const size_t jend( ( IsLower_v<MT1> )
4884 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4888 const size_t jpos( remainder ? ( jend &
size_t(-
SIMDSIZE) ) : jend );
4897 const SIMDType x1( x.load(j ) );
4898 const SIMDType x2( x.load(j1) );
4899 const SIMDType x3( x.load(j2) );
4900 const SIMDType x4( x.load(j3) );
4901 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4906 const SIMDType x1( x.load(j ) );
4907 const SIMDType x2( x.load(j1) );
4908 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4912 const SIMDType x1( x.load(j) );
4913 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4916 for( ; remainder && j<jend; ++j ) {
4917 y[i] -= A(i,j) * x[j] * scalar;
// Fallback overload of the BLAS sub-assign kernel. It is selected (DisableIf_t) when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is false and forwards all arguments to
// selectLargeSubAssignKernel().
4937 template<
typename VT1
4941 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4942 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4944 selectLargeSubAssignKernel( y, A, x, scalar );
// BLAS-based sub-assign kernel, enabled when UseBlasKernel_v<VT1,MT1,VT2,ST2> is true and
// compiled only when BLAZE_BLAS_MODE and BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set.
// NOTE(review): in this listing the '#if' directive and the following 'template<' share one
// line; the matching '#endif', the 'else' and the braces are not visible.
4949 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION 4963 template<
typename VT1
4967 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4968 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4970 using ET = ElementType_t<VT1>;
// Triangular matrices: scale x into a temporary (evaluated via serial()), multiply it with A
// through trmv() using the lower/upper flag, then subtract the temporary from y.
4972 if( IsTriangular_v<MT1> ) {
4973 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4974 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4975 subAssign( y, tmp );
// Otherwise: gemv() is called with the negated scalar ET(-scalar) and ET(1), so the product
// is subtracted (presumably y = -scalar*A*x + 1*y; verify against the gemv wrapper).
4978 gemv( y, A, x,
ET(-scalar),
ET(1) );
// Multiplication assignment of the scaled matrix-vector product 'rhs' to the dense vector
// 'lhs'. The visible part forwards to multAssign( ~lhs, tmp ).
// NOTE(review): 'tmp' is defined on a line not shown in this listing (presumably the
// evaluated right-hand side expression; confirm in the full source).
5000 template<
typename VT1 >
5001 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5012 multAssign( ~lhs, tmp );
// Division assignment of the scaled matrix-vector product 'rhs' to the dense vector 'lhs'.
// The visible part forwards to divAssign( ~lhs, tmp ).
// NOTE(review): 'tmp' is defined on a line not shown in this listing (presumably the
// evaluated right-hand side expression; confirm in the full source).
5032 template<
typename VT1 >
5033 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5044 divAssign( ~lhs, tmp );
// Six function templates that are all constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): their declarator lines (function names and parameters) are missing from this
// listing, so the names cannot be confirmed here; they look like the SMP counterparts of the
// assignment functions of this class. Check the full source.

// First overload: reads both operands of the wrapped multiplication and checks for an empty
// matrix separately by rows and by columns (the branch bodies are not shown).
5066 template<
typename VT1 >
5068 -> EnableIf_t< UseSMPAssign_v<VT1> >
5074 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5075 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5077 if( left.rows() == 0UL ) {
5080 else if( left.columns() == 0UL ) {
// Second overload: only the signature fragment is visible.
5111 template<
typename VT1 >
5113 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Third overload: reads both operands and has a combined empty-matrix check
// (branch body not shown).
5142 template<
typename VT1 >
5144 -> EnableIf_t< UseSMPAssign_v<VT1> >
5150 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5151 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5153 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Fourth overload: same visible structure as the third.
5187 template<
typename VT1 >
5189 -> EnableIf_t< UseSMPAssign_v<VT1> >
5195 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5196 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5198 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Fifth overload: only the signature fragment is visible.
5232 template<
typename VT1 >
5234 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Sixth overload: only the signature fragment is visible.
5267 template<
typename VT1 >
5269 -> EnableIf_t< UseSMPAssign_v<VT1> >
// NOTE(review): the declarator and body of this first function template are not part of the
// listing; only its 'template<' head and 'inline decltype(auto)' return type are visible.
5341 template<
typename MT
5343 inline decltype(
auto)
// Multiplication of a matrix-matrix multiplication expression with a dense column vector.
// The product (A*B)*v is restructured as A*(B*v): the right operand of 'mat' is multiplied
// with 'vec' first, and the left operand is applied to that result.
5382 template<
typename MT
5384 inline decltype(
auto)
5385 operator*( const MatMatMultExpr<MT>& mat, const DenseVector<VT,false>& vec )
5389 return (~mat).leftOperand() * ( (~mat).rightOperand() * vec );
// IsAligned specialization for DMatDVecMultExpr: the expression counts as aligned exactly
// when both the matrix type MT and the vector type VT are aligned.
5405 template<
typename MT,
typename VT >
5406 struct IsAligned< DMatDVecMultExpr<MT,VT> >
5407 :
public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:329
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:567
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:523
Header file for auxiliary alias declarations.
Header file for the blaze::checked and blaze::unchecked instances.
If_t< IsExpression_v< MT >, const MT, const MT &> LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:210
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:163
ResultType_t< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:127
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:162
Header file for basic type definitions.
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:204
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:260
typename If< Condition, T1, T2 >::Type If_t
Auxiliary alias declaration for the If class template. The If_t alias declaration provides a convenien...
Definition: If.h:109
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:533
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:166
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions. The ResultType_t alias declaration provides ...
Definition: Aliases.h:390
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
static constexpr bool evaluateVector
Compilation switch for the composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:144
Header file for the serial shim.
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:121
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:372
MultTrait_t< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:202
If_t< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:216
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:591
static constexpr bool smpAssignable
Compilation flag for SMP assignments.
Definition: CompressedMatrix.h:3113
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:428
constexpr Unchecked unchecked
Global Unchecked instance. The blaze::unchecked instance is an optional token for the creation of view...
Definition: Check.h:138
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template. The SIMDTrait_t alias declaration provid...
Definition: SIMDTrait.h:315
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:513
Header file for the DenseVector base class.
If_t< useAssign, const ResultType, const DVecScalarMultExpr &> CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:169
Header file for the Computation base class.
Header file for the MatMatMultExpr base class.
Header file for the reset shim.
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:339
System settings for performance optimizations.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:351
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
constexpr size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:514
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:80
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions. The ElementType_t alias declaration provide...
Definition: Aliases.h:170
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:220
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:433
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:371
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Constraint on the data type.
DenseVector< This, TF > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:160
ElementType_t< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:129
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:467
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:58
Header file for the IsDouble type trait.
Header file for the If class template.
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:159
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:307
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bl...
Definition: Exception.h:331
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2146
Header file for the HasSIMDAdd type trait.
Header file for all SIMD functionality.
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:107
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:76
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:67
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type. In case the given data type T is a matrix/matrix multiplication expressio...
Definition: MatMatMultExpr.h:83
If_t< IsExpression_v< VT >, const VT, const VT &> RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:213
ResultType_t< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:128
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:104
Constraint on the data type.
Header file for the exception macros of the math module.
If_t< IsExpression_v< VT >, const VT, const VT &> LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:172
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:327
Constraint on the data type.
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:584
Header file for all forward declarations for expression class templates.
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:203
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:383
CompositeType_t< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:131
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the IsPadded type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:557
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:207
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDVecMultExpr.h:237
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template. The MultTrait_t alias declaration provid...
Definition: MultTrait.h:240
Header file for the HasConstDataAccess type trait.
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:585
System settings for the BLAS mode.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:454
ElementType_t< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:130
Header file for the IsSIMDCombinable type trait.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:361
If_t< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:219
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions. The TransposeType_t alias declaration pro...
Definition: Aliases.h:470
Header file for run time assertion macros.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:577
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions. The CompositeType_t alias declaration pro...
Definition: Aliases.h:90
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDVecMultExpr.h:231
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:384
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:246
Header file for the IsContiguous type trait.
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:133
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
SIMD characteristics of data types. The SIMDTrait class template provides the SIMD characteristics of ...
Definition: SIMDTrait.h:295
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:94
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:530
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:161
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:808
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:93
Header file for the HasMutableDataAccess type trait.
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type. In case the given data type T requires an intermediate evaluation within ...
Definition: RequiresEvaluation.h:81
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the BLAS matrix/matrix multiplication kernels (gemv). This compilation switch e...
Definition: BLAS.h:152
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:101
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:109
CompositeType_t< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:132
Header file for BLAS general matrix/vector multiplication functions (gemv)
const Type & ReturnType
Return type for expression template evaluations.
Definition: CompressedMatrix.h:3081
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:545
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:64
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:66
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:442
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:205
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDVecMultExpr.h:224
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Constraint on the data type.
Header file for the complex data type.
Header file for the IsUpper type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:317
Header file for the MatVecMultExpr base class.
Constraint on the data type.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:294
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:206
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:191
Header file for the IsExpression type trait class.
Header file for the function trace functionality.
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:175
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:423