35#ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
// Base-class list fragment: public inheritance from MatVecMultExpr,
// instantiated on DenseVector< DMatDVecMultExpr<MT,VT>, false >.
125 :
public MatVecMultExpr< DenseVector< DMatDVecMultExpr<MT,VT>, false > >
// Tail of a compile-time condition whose declaration line is not visible here:
// it holds when MT is a computation, MET and VET are the same type and MET is
// BLAS compatible, or when MT requires an evaluation.
// NOTE(review): presumably the matrix-operand counterpart of evaluateVector - confirm.
141 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
142 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// The vector operand is flagged for evaluation when it is itself a computation
// or when it requires an evaluation.
147 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
156 template<
typename T1 >
// Compile-time switch deciding whether a BLAS kernel may be used for the type
// triple (T1, T2, T3). The visible part of the condition requires, for all
// three types: contiguous storage with data access (mutable for T1, const for
// T2 and T3), SIMD being enabled, and a BLAS-compatible element type.
// The condition starts and continues on lines that are not visible here.
166 template<
typename T1,
typename T2,
typename T3 >
167 static constexpr bool UseBlasKernel_v =
169 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
170 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
171 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
173 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
174 IsBLASCompatible_v< ElementType_t<T1> > &&
175 IsBLASCompatible_v< ElementType_t<T2> > &&
176 IsBLASCompatible_v< ElementType_t<T3> > &&
// Compile-time switch for the vectorized default kernels for (T1, T2, T3).
// Visible requirements: useOptimizedKernels is set, SIMD is enabled for all
// three types, and an IsSIMDCombinable_v check involving the element types
// (the rest of the condition is not visible here).
188 template<
typename T1,
typename T2,
typename T3 >
189 static constexpr bool UseVectorizedDefaultKernel_v =
190 ( useOptimizedKernels &&
192 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
193 IsSIMDCombinable_v< ElementType_t<T1>
// Tail of a compile-time flag (declaration line not visible here): it holds
// only when MT is not diagonal, both MT and VT are SIMD enabled, and SIMD
// addition and multiplication are available for the pair MET/VET.
// NOTE(review): presumably the initializer of this class's simdEnabled flag - confirm.
232 ( !IsDiagonal_v<MT> &&
233 MT::simdEnabled && VT::simdEnabled &&
234 HasSIMDAdd_v<MET,VET> &&
235 HasSIMDMult_v<MET,VET> );
// Element access fragment: the computation for element `index` is specialised
// by the structure of the matrix type MT.
// Diagonal matrix: the result is the diagonal entry times the vector entry.
270 if( IsDiagonal_v<MT> )
272 return mat_(index,index) *
vec_[index];
// Lower matrix, taken only when index + 8 is below the number of rows:
// n is index for a strictly lower matrix, index+1 otherwise.
// NOTE(review): presumably the length of the row prefix that is multiplied - confirm.
274 else if( IsLower_v<MT> && ( index + 8UL <
mat_.rows() ) )
276 const size_t n( IsStrictlyLower_v<MT> ? index : index+1UL );
// Upper matrix, taken only when index is greater than 8: `begin` is index+1
// for a strictly upper matrix, index otherwise; n is the number of columns
// from `begin` to the end of the row.
280 else if( IsUpper_v<MT> && ( index > 8UL ) )
282 const size_t begin( IsStrictlyUpper_v<MT> ? index+1UL : index );
283 const size_t n (
mat_.columns() -
begin );
// Checked access fragment: an index at or beyond the number of matrix rows
// enters the branch below (its body is not visible here); otherwise the
// access is forwarded to the subscript operator of this object.
302 if( index >=
mat_.rows() ) {
305 return (*
this)[index];
// Returns the size of the expression as size_t; declared noexcept.
// The body is not visible here.
314 inline size_t size() const noexcept {
// Reports whether the expression can alias with the given address: true if
// either the matrix operand or the vector operand reports isAliased( alias ).
345 template<
typename T >
346 inline bool canAlias(
const T* alias )
const noexcept {
347 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Reports whether the expression is aliased with the given address: true if
// either the matrix operand or the vector operand reports isAliased( alias ).
357 template<
typename T >
358 inline bool isAliased(
const T* alias )
const noexcept {
359 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Alignment query fragment (signature not visible here): the result is true
// only if both the matrix operand and the vector operand report isAligned().
369 return mat_.isAligned() &&
vec_.isAligned();
// Tail of a boolean condition (its beginning is not visible here): one
// alternative is that the matrix element count rows()*columns() is below
// DMATDVECMULT_THRESHOLD; the whole group is then combined with the
// requirement that size() exceeds SMP_DMATDVECMULT_THRESHOLD.
// NOTE(review): presumably the SMP assignment predicate - confirm.
383 (
mat_.rows() *
mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
384 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Fragment of the assignment of the matrix/vector product to a target vector
// (the function declaration line is not visible here).
407 template<
typename VT1 >
// Special case: the matrix has no rows (branch body not visible here).
414 if( rhs.
mat_.rows() == 0UL ) {
// Special case: the matrix has no columns, or it is strictly triangular and
// has exactly one column (branch body not visible here).
417 else if( rhs.
mat_.columns() == 0UL ||
418 ( IsStrictlyTriangular_v<MT> && rhs.
mat_.columns() == 1UL ) ) {
// General case: dispatch to the kernel selection with the target and the
// operands A and x (their definitions are not visible here).
431 DMatDVecMultExpr::selectAssignKernel( *lhs, A, x );
// Kernel selection for the assignment of A * x to y.
// Diagonal matrix types and products whose matrix has fewer than
// DMATDVECMULT_THRESHOLD elements use the small kernel; the other visible
// call dispatches to the BLAS kernel selection (part of the condition and the
// `else` line are not visible here).
447 template<
typename VT1
450 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
452 if( ( IsDiagonal_v<MT1> ) ||
454 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
455 selectSmallAssignKernel( y, A, x );
457 selectBlasAssignKernel( y, A, x );
// Default assignment kernel taking the target vector y and the operands A
// and x. Only the signature is visible here; the body is not shown.
476 template<
typename VT1
479 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Small-matrix assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false: it simply forwards to
// the default assignment kernel.
500 template<
typename VT1
503 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
504 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
506 selectDefaultAssignKernel( y, A, x );
// Small-matrix assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true.
// Rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by single
// rows. In each block the column range [jbegin,jend) depends on whether MT1
// is upper/lower (the expressions are only partly visible here).
525 template<
typename VT1
528 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
529 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loops only run if A or x is not padded.
531 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
533 const size_t M( A.rows() );
534 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
538 for( ; (i+8UL) <= M; i+=8UL )
// For lower matrices the column range ends at i+7 (strictly lower) or i+8.
540 const size_t jbegin( ( IsUpper_v<MT1> )
543 const size_t jend( ( IsLower_v<MT1> )
544 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// One SIMD accumulator per row, initialised with the product of a loaded
// chunk of the row and x1 (x1 and xmm1 are defined on lines not visible here).
557 SIMDType xmm2( A.load(i+1UL,j) * x1 );
558 SIMDType xmm3( A.load(i+2UL,j) * x1 );
559 SIMDType xmm4( A.load(i+3UL,j) * x1 );
560 SIMDType xmm5( A.load(i+4UL,j) * x1 );
561 SIMDType xmm6( A.load(i+5UL,j) * x1 );
562 SIMDType xmm7( A.load(i+6UL,j) * x1 );
563 SIMDType xmm8( A.load(i+7UL,j) * x1 );
// Accumulate further chunks into the per-row accumulators.
567 xmm1 += A.load(i ,j) * x1;
568 xmm2 += A.load(i+1UL,j) * x1;
569 xmm3 += A.load(i+2UL,j) * x1;
570 xmm4 += A.load(i+3UL,j) * x1;
571 xmm5 += A.load(i+4UL,j) * x1;
572 xmm6 += A.load(i+5UL,j) * x1;
573 xmm7 += A.load(i+6UL,j) * x1;
574 xmm8 += A.load(i+7UL,j) * x1;
// Reduce each accumulator with sum() and store it (plain assignment) in y.
578 y[i+1UL] =
sum( xmm2 );
579 y[i+2UL] =
sum( xmm3 );
580 y[i+3UL] =
sum( xmm4 );
581 y[i+4UL] =
sum( xmm5 );
582 y[i+5UL] =
sum( xmm6 );
583 y[i+6UL] =
sum( xmm7 );
584 y[i+7UL] =
sum( xmm8 );
// Scalar remainder: add the contributions of the remaining columns up to jend.
586 for( ; remainder && j<jend; ++j ) {
587 y[i ] += A(i ,j) * x[j];
588 y[i+1UL] += A(i+1UL,j) * x[j];
589 y[i+2UL] += A(i+2UL,j) * x[j];
590 y[i+3UL] += A(i+3UL,j) * x[j];
591 y[i+4UL] += A(i+4UL,j) * x[j];
592 y[i+5UL] += A(i+5UL,j) * x[j];
593 y[i+6UL] += A(i+6UL,j) * x[j];
594 y[i+7UL] += A(i+7UL,j) * x[j];
// Scalar accumulation into value1..value8, starting at the column after j
// (the enclosing branch and the final stores are not visible here).
608 for( ++j; j<jend; ++j ) {
609 value1 += A(i ,j) * x[j];
610 value2 += A(i+1UL,j) * x[j];
611 value3 += A(i+2UL,j) * x[j];
612 value4 += A(i+3UL,j) * x[j];
613 value5 += A(i+4UL,j) * x[j];
614 value6 += A(i+5UL,j) * x[j];
615 value7 += A(i+6UL,j) * x[j];
616 value8 += A(i+7UL,j) * x[j];
// --- Blocks of 4 rows (same scheme as above) ---
630 for( ; (i+4UL) <= M; i+=4UL )
632 const size_t jbegin( ( IsUpper_v<MT1> )
635 const size_t jend( ( IsLower_v<MT1> )
636 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
649 SIMDType xmm2( A.load(i+1UL,j) * x1 );
650 SIMDType xmm3( A.load(i+2UL,j) * x1 );
651 SIMDType xmm4( A.load(i+3UL,j) * x1 );
655 xmm1 += A.load(i ,j) * x1;
656 xmm2 += A.load(i+1UL,j) * x1;
657 xmm3 += A.load(i+2UL,j) * x1;
658 xmm4 += A.load(i+3UL,j) * x1;
662 y[i+1UL] =
sum( xmm2 );
663 y[i+2UL] =
sum( xmm3 );
664 y[i+3UL] =
sum( xmm4 );
666 for( ; remainder && j<jend; ++j ) {
667 y[i ] += A(i ,j) * x[j];
668 y[i+1UL] += A(i+1UL,j) * x[j];
669 y[i+2UL] += A(i+2UL,j) * x[j];
670 y[i+3UL] += A(i+3UL,j) * x[j];
680 for( ++j; j<jend; ++j ) {
681 value1 += A(i ,j) * x[j];
682 value2 += A(i+1UL,j) * x[j];
683 value3 += A(i+2UL,j) * x[j];
684 value4 += A(i+3UL,j) * x[j];
// --- Blocks of 3 rows ---
694 for( ; (i+3UL) <= M; i+=3UL )
696 const size_t jbegin( ( IsUpper_v<MT1> )
699 const size_t jend( ( IsLower_v<MT1> )
700 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
713 SIMDType xmm2( A.load(i+1UL,j) * x1 );
714 SIMDType xmm3( A.load(i+2UL,j) * x1 );
718 xmm1 += A.load(i ,j) * x1;
719 xmm2 += A.load(i+1UL,j) * x1;
720 xmm3 += A.load(i+2UL,j) * x1;
724 y[i+1UL] =
sum( xmm2 );
725 y[i+2UL] =
sum( xmm3 );
727 for( ; remainder && j<jend; ++j ) {
728 y[i ] += A(i ,j) * x[j];
729 y[i+1UL] += A(i+1UL,j) * x[j];
730 y[i+2UL] += A(i+2UL,j) * x[j];
739 for( ++j; j<jend; ++j ) {
740 value1 += A(i ,j) * x[j];
741 value2 += A(i+1UL,j) * x[j];
742 value3 += A(i+2UL,j) * x[j];
// --- Blocks of 2 rows ---
751 for( ; (i+2UL) <= M; i+=2UL )
753 const size_t jbegin( ( IsUpper_v<MT1> )
756 const size_t jend( ( IsLower_v<MT1> )
757 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
770 SIMDType xmm2( A.load(i+1UL,j) * x1 );
774 xmm1 += A.load(i ,j) * x1;
775 xmm2 += A.load(i+1UL,j) * x1;
779 y[i+1UL] =
sum( xmm2 );
781 for( ; remainder && j<jend; ++j ) {
782 y[i ] += A(i ,j) * x[j];
783 y[i+1UL] += A(i+1UL,j) * x[j];
791 for( ++j; j<jend; ++j ) {
792 value1 += A(i ,j) * x[j];
793 value2 += A(i+1UL,j) * x[j];
// --- Single row (the enclosing loop/branch header is not visible here) ---
803 const size_t jbegin( ( IsUpper_v<MT1> )
806 const size_t jend( ( IsLower_v<MT1> )
807 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
// Here the vector chunk is loaded directly in the product expression.
818 SIMDType xmm1( A.load(i,j) * x.load(j) );
821 xmm1 += A.load(i,j) * x.load(j);
826 for( ; remainder && j<jend; ++j ) {
827 y[i] += A(i,j) * x[j];
834 for( ++j; j<jend; ++j ) {
835 value += A(i,j) * x[j];
// Large-matrix assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false: it simply forwards to
// the default assignment kernel.
859 template<
typename VT1
862 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
863 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
865 selectDefaultAssignKernel( y, A, x );
// Large-matrix assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true.
// Rows of A are processed in blocks of 8, 4 and 2 rows, followed by single
// rows. Within a block, contributions are added to y using four, two and then
// one SIMD column position(s) per step, followed by a scalar remainder loop.
// All visible updates of y use `+=`.
// NOTE(review): this presumably relies on y being reset on a line not visible
// here - confirm.
884 template<
typename VT1
887 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
888 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loops only run if A or x is not padded.
890 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
892 const size_t M( A.rows() );
893 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
899 for( ; (i+8UL) <= M; i+=8UL )
// For lower matrices the column range ends at i+7 (strictly lower) or i+8.
901 const size_t jbegin( ( IsUpper_v<MT1> )
904 const size_t jend( ( IsLower_v<MT1> )
905 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Four column positions (j, j1, j2, j3) per step; x1..x4 and j1..j3 are
// defined on lines not visible here.
922 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
923 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
924 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
925 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
926 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
927 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
928 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
929 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two column positions (j, j1) per step.
936 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
937 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
938 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
939 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
940 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
941 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
942 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
943 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One column position per step.
948 y[i ] +=
sum( A.load(i ,j) * x1 );
949 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
950 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
951 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
952 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
953 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
954 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
955 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar remainder for the remaining columns up to jend.
958 for( ; remainder && j<jend; ++j ) {
959 y[i ] += A(i ,j) * x[j];
960 y[i+1UL] += A(i+1UL,j) * x[j];
961 y[i+2UL] += A(i+2UL,j) * x[j];
962 y[i+3UL] += A(i+3UL,j) * x[j];
963 y[i+4UL] += A(i+4UL,j) * x[j];
964 y[i+5UL] += A(i+5UL,j) * x[j];
965 y[i+6UL] += A(i+6UL,j) * x[j];
966 y[i+7UL] += A(i+7UL,j) * x[j];
// --- Blocks of 4 rows (same scheme as above) ---
970 for( ; (i+4UL) <= M; i+=4UL )
972 const size_t jbegin( ( IsUpper_v<MT1> )
975 const size_t jend( ( IsLower_v<MT1> )
976 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
993 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
994 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
995 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
996 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1003 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1004 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1005 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1006 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1011 y[i ] +=
sum( A.load(i ,j) * x1 );
1012 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1013 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1014 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1017 for( ; remainder && j<jend; ++j ) {
1018 y[i ] += A(i ,j) * x[j];
1019 y[i+1UL] += A(i+1UL,j) * x[j];
1020 y[i+2UL] += A(i+2UL,j) * x[j];
1021 y[i+3UL] += A(i+3UL,j) * x[j];
// --- Blocks of 2 rows ---
1025 for( ; (i+2UL) <= M; i+=2UL )
1027 const size_t jbegin( ( IsUpper_v<MT1> )
1030 const size_t jend( ( IsLower_v<MT1> )
1031 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1048 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1049 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1056 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1057 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1062 y[i ] +=
sum( A.load(i ,j) * x1 );
1063 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1066 for( ; remainder && j<jend; ++j ) {
1067 y[i ] += A(i ,j) * x[j];
1068 y[i+1UL] += A(i+1UL,j) * x[j];
// --- Single row (the enclosing loop/branch header is not visible here) ---
1074 const size_t jbegin( ( IsUpper_v<MT1> )
1077 const size_t jend( ( IsLower_v<MT1> )
1078 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1095 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1102 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1107 y[i] +=
sum( A.load(i,j) * x1 );
1110 for( ; remainder && j<jend; ++j ) {
1111 y[i] += A(i,j) * x[j];
// BLAS assignment kernel, overload selected when
// UseBlasKernel_v<VT1,MT1,VT2> is false: it falls back to the large-matrix
// assignment kernel.
1132 template<
typename VT1
1135 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1136 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1138 selectLargeAssignKernel( y, A, x );
// The BLAS-based overload below is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set (the matching #endif is
// not visible here).
1144#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS assignment kernel, overload selected when
// UseBlasKernel_v<VT1,MT1,VT2> is true.
1158 template<
typename VT1
1161 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1162 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1164 using ET = ElementType_t<VT1>;
// Triangular matrices go through trmv, passing CblasLower or CblasUpper
// depending on IsLower_v<MT1>. trmv receives only y and A.
// NOTE(review): y presumably holds a copy of x at this point (done on a line
// not visible here) - confirm.
1166 if( IsTriangular_v<MT1> ) {
1168 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// Otherwise gemv is called with the factors ET(1) and ET(0).
// NOTE(review): presumably alpha=1 and beta=0, i.e. y = A*x - confirm against
// the gemv wrapper.
1171 gemv( y, A, x, ET(1), ET(0) );
// Assignment of the matrix/vector product to a sparse vector. The visible part
// assigns a temporary `tmp` (created on a line not visible here) to the target.
1191 template<
typename VT1 >
1192 friend inline void assign( SparseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1203 assign( *lhs, tmp );
// Addition assignment of the matrix/vector product to a dense vector.
1221 template<
typename VT1 >
1222 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
// Special case: the matrix has no rows or no columns, or it is strictly
// triangular with exactly one row (branch body not visible here).
1228 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1229 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// General case: dispatch to the kernel selection with the target and the
// operands A and x (their definitions are not visible here).
1241 DMatDVecMultExpr::selectAddAssignKernel( *lhs, A, x );
// Kernel selection for the addition assignment of A * x to y.
// Diagonal matrix types and products whose matrix has fewer than
// DMATDVECMULT_THRESHOLD elements use the small kernel; the other visible
// call dispatches to the BLAS kernel selection (part of the condition and the
// `else` line are not visible here).
1257 template<
typename VT1
1260 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1262 if( ( IsDiagonal_v<MT1> ) ||
1264 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1265 selectSmallAddAssignKernel( y, A, x );
1267 selectBlasAddAssignKernel( y, A, x );
// Default addition assignment kernel: hands the product expression A * x to
// the addAssign member of the target vector y.
1286 template<
typename VT1
1289 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1291 y.addAssign( A * x );
// Small-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false: it simply forwards to
// the default addition assignment kernel.
1310 template<
typename VT1
1313 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1314 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1316 selectDefaultAddAssignKernel( y, A, x );
// Small-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true.
// Rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by single
// rows. Per block, one SIMD accumulator per row is built up over the column
// range, reduced with sum() and added to y; a scalar loop handles the
// remaining columns.
1335 template<
typename VT1
1338 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1339 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loops only run if A or x is not padded.
1341 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1343 const size_t M( A.rows() );
1344 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
1348 for( ; (i+8UL) <= M; i+=8UL )
// For lower matrices the column range ends at i+7 (strictly lower) or i+8.
1350 const size_t jbegin( ( IsUpper_v<MT1> )
1353 const size_t jend( ( IsLower_v<MT1> )
1354 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Initialise one accumulator per row from a loaded chunk times x1
// (x1 is defined on a line not visible here).
1366 SIMDType xmm1( A.load(i ,j) * x1 );
1367 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1368 SIMDType xmm3( A.load(i+2UL,j) * x1 );
1369 SIMDType xmm4( A.load(i+3UL,j) * x1 );
1370 SIMDType xmm5( A.load(i+4UL,j) * x1 );
1371 SIMDType xmm6( A.load(i+5UL,j) * x1 );
1372 SIMDType xmm7( A.load(i+6UL,j) * x1 );
1373 SIMDType xmm8( A.load(i+7UL,j) * x1 );
// Accumulate further chunks.
1377 xmm1 += A.load(i ,j) * x1;
1378 xmm2 += A.load(i+1UL,j) * x1;
1379 xmm3 += A.load(i+2UL,j) * x1;
1380 xmm4 += A.load(i+3UL,j) * x1;
1381 xmm5 += A.load(i+4UL,j) * x1;
1382 xmm6 += A.load(i+5UL,j) * x1;
1383 xmm7 += A.load(i+6UL,j) * x1;
1384 xmm8 += A.load(i+7UL,j) * x1;
// Add the reduced accumulators to the existing values in y.
1387 y[i ] +=
sum( xmm1 );
1388 y[i+1UL] +=
sum( xmm2 );
1389 y[i+2UL] +=
sum( xmm3 );
1390 y[i+3UL] +=
sum( xmm4 );
1391 y[i+4UL] +=
sum( xmm5 );
1392 y[i+5UL] +=
sum( xmm6 );
1393 y[i+6UL] +=
sum( xmm7 );
1394 y[i+7UL] +=
sum( xmm8 );
// Scalar remainder for the remaining columns up to jend.
1396 for( ; remainder && j<jend; ++j ) {
1397 y[i ] += A(i ,j) * x[j];
1398 y[i+1UL] += A(i+1UL,j) * x[j];
1399 y[i+2UL] += A(i+2UL,j) * x[j];
1400 y[i+3UL] += A(i+3UL,j) * x[j];
1401 y[i+4UL] += A(i+4UL,j) * x[j];
1402 y[i+5UL] += A(i+5UL,j) * x[j];
1403 y[i+6UL] += A(i+6UL,j) * x[j];
1404 y[i+7UL] += A(i+7UL,j) * x[j];
// Scalar accumulation into value1..value8, starting at the column after j
// (the enclosing branch and the final updates of y are not visible here).
1418 for( ++j; j<jend; ++j ) {
1419 value1 += A(i ,j) * x[j];
1420 value2 += A(i+1UL,j) * x[j];
1421 value3 += A(i+2UL,j) * x[j];
1422 value4 += A(i+3UL,j) * x[j];
1423 value5 += A(i+4UL,j) * x[j];
1424 value6 += A(i+5UL,j) * x[j];
1425 value7 += A(i+6UL,j) * x[j];
1426 value8 += A(i+7UL,j) * x[j];
// --- Blocks of 4 rows (same scheme as above) ---
1440 for( ; (i+4UL) <= M; i+=4UL )
1442 const size_t jbegin( ( IsUpper_v<MT1> )
1445 const size_t jend( ( IsLower_v<MT1> )
1446 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1458 SIMDType xmm1( A.load(i ,j) * x1 );
1459 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1460 SIMDType xmm3( A.load(i+2UL,j) * x1 );
1461 SIMDType xmm4( A.load(i+3UL,j) * x1 );
1465 xmm1 += A.load(i ,j) * x1;
1466 xmm2 += A.load(i+1UL,j) * x1;
1467 xmm3 += A.load(i+2UL,j) * x1;
1468 xmm4 += A.load(i+3UL,j) * x1;
1471 y[i ] +=
sum( xmm1 );
1472 y[i+1UL] +=
sum( xmm2 );
1473 y[i+2UL] +=
sum( xmm3 );
1474 y[i+3UL] +=
sum( xmm4 );
1476 for( ; remainder && j<jend; ++j ) {
1477 y[i ] += A(i ,j) * x[j];
1478 y[i+1UL] += A(i+1UL,j) * x[j];
1479 y[i+2UL] += A(i+2UL,j) * x[j];
1480 y[i+3UL] += A(i+3UL,j) * x[j];
1490 for( ++j; j<jend; ++j ) {
1491 value1 += A(i ,j) * x[j];
1492 value2 += A(i+1UL,j) * x[j];
1493 value3 += A(i+2UL,j) * x[j];
1494 value4 += A(i+3UL,j) * x[j];
// --- Blocks of 3 rows ---
1504 for( ; (i+3UL) <= M; i+=3UL )
1506 const size_t jbegin( ( IsUpper_v<MT1> )
1509 const size_t jend( ( IsLower_v<MT1> )
1510 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
1522 SIMDType xmm1( A.load(i ,j) * x1 );
1523 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1524 SIMDType xmm3( A.load(i+2UL,j) * x1 );
1528 xmm1 += A.load(i ,j) * x1;
1529 xmm2 += A.load(i+1UL,j) * x1;
1530 xmm3 += A.load(i+2UL,j) * x1;
1533 y[i ] +=
sum( xmm1 );
1534 y[i+1UL] +=
sum( xmm2 );
1535 y[i+2UL] +=
sum( xmm3 );
1537 for( ; remainder && j<jend; ++j ) {
1538 y[i ] += A(i ,j) * x[j];
1539 y[i+1UL] += A(i+1UL,j) * x[j];
1540 y[i+2UL] += A(i+2UL,j) * x[j];
1549 for( ++j; j<jend; ++j ) {
1550 value1 += A(i ,j) * x[j];
1551 value2 += A(i+1UL,j) * x[j];
1552 value3 += A(i+2UL,j) * x[j];
// --- Blocks of 2 rows ---
1561 for( ; (i+2UL) <= M; i+=2UL )
1563 const size_t jbegin( ( IsUpper_v<MT1> )
1566 const size_t jend( ( IsLower_v<MT1> )
1567 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1579 SIMDType xmm1( A.load(i ,j) * x1 );
1580 SIMDType xmm2( A.load(i+1UL,j) * x1 );
1584 xmm1 += A.load(i ,j) * x1;
1585 xmm2 += A.load(i+1UL,j) * x1;
1588 y[i ] +=
sum( xmm1 );
1589 y[i+1UL] +=
sum( xmm2 );
1591 for( ; remainder && j<jend; ++j ) {
1592 y[i ] += A(i ,j) * x[j];
1593 y[i+1UL] += A(i+1UL,j) * x[j];
1601 for( ++j; j<jend; ++j ) {
1602 value1 += A(i ,j) * x[j];
1603 value2 += A(i+1UL,j) * x[j];
// --- Single row (the enclosing loop/branch header is not visible here) ---
1613 const size_t jbegin( ( IsUpper_v<MT1> )
1616 const size_t jend( ( IsLower_v<MT1> )
1617 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
// Here the vector chunk is loaded directly in the product expression.
1628 SIMDType xmm1( A.load(i,j) * x.load(j) );
1631 xmm1 += A.load(i,j) * x.load(j);
1634 y[i] +=
sum( xmm1 );
1636 for( ; remainder && j<jend; ++j ) {
1637 y[i] += A(i,j) * x[j];
1644 for( ++j; j<jend; ++j ) {
1645 value += A(i,j) * x[j];
// Large-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false: it simply forwards to
// the default addition assignment kernel.
1669 template<
typename VT1
1672 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1673 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
1675 selectDefaultAddAssignKernel( y, A, x );
// Large-matrix addition assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true.
// Rows of A are processed in blocks of 8, 4 and 2 rows, followed by single
// rows. Within a block, contributions are added to y using four, two and then
// one SIMD column position(s) per step, followed by a scalar remainder loop.
1694 template<
typename VT1
1697 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1698 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loops only run if A or x is not padded.
1700 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
1702 const size_t M( A.rows() );
1703 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
1707 for( ; (i+8UL) <= M; i+=8UL )
// For lower matrices the column range ends at i+7 (strictly lower) or i+8.
1709 const size_t jbegin( ( IsUpper_v<MT1> )
1712 const size_t jend( ( IsLower_v<MT1> )
1713 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Four column positions (j, j1, j2, j3) per step; x1..x4 and j1..j3 are
// defined on lines not visible here.
1730 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1731 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1732 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1733 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1734 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1735 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1736 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1737 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two column positions (j, j1) per step.
1744 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1745 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1746 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1747 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1748 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1749 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1750 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1751 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One column position per step.
1756 y[i ] +=
sum( A.load(i ,j) * x1 );
1757 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1758 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1759 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1760 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1761 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1762 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1763 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar remainder for the remaining columns up to jend.
1766 for( ; remainder && j<jend; ++j ) {
1767 y[i ] += A(i ,j) * x[j];
1768 y[i+1UL] += A(i+1UL,j) * x[j];
1769 y[i+2UL] += A(i+2UL,j) * x[j];
1770 y[i+3UL] += A(i+3UL,j) * x[j];
1771 y[i+4UL] += A(i+4UL,j) * x[j];
1772 y[i+5UL] += A(i+5UL,j) * x[j];
1773 y[i+6UL] += A(i+6UL,j) * x[j];
1774 y[i+7UL] += A(i+7UL,j) * x[j];
// --- Blocks of 4 rows (same scheme as above) ---
1778 for( ; (i+4UL) <= M; i+=4UL )
1780 const size_t jbegin( ( IsUpper_v<MT1> )
1783 const size_t jend( ( IsLower_v<MT1> )
1784 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
1801 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1802 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1803 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1804 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1811 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1812 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1813 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1814 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1819 y[i ] +=
sum( A.load(i ,j) * x1 );
1820 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1821 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1822 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1825 for( ; remainder && j<jend; ++j ) {
1826 y[i ] += A(i ,j) * x[j];
1827 y[i+1UL] += A(i+1UL,j) * x[j];
1828 y[i+2UL] += A(i+2UL,j) * x[j];
1829 y[i+3UL] += A(i+3UL,j) * x[j];
// --- Blocks of 2 rows ---
1833 for( ; (i+2UL) <= M; i+=2UL )
1835 const size_t jbegin( ( IsUpper_v<MT1> )
1838 const size_t jend( ( IsLower_v<MT1> )
1839 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
1856 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1857 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1864 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1865 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1870 y[i ] +=
sum( A.load(i ,j) * x1 );
1871 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1874 for( ; remainder && j<jend; ++j ) {
1875 y[i ] += A(i ,j) * x[j];
1876 y[i+1UL] += A(i+1UL,j) * x[j];
// --- Single row (the enclosing loop/branch header is not visible here) ---
1882 const size_t jbegin( ( IsUpper_v<MT1> )
1885 const size_t jend( ( IsLower_v<MT1> )
1886 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
1903 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1910 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1915 y[i] +=
sum( A.load(i,j) * x1 );
1918 for( ; remainder && j<jend; ++j ) {
1919 y[i] += A(i,j) * x[j];
// BLAS addition assignment kernel, overload selected when
// UseBlasKernel_v<VT1,MT1,VT2> is false: it falls back to the large-matrix
// addition assignment kernel.
1940 template<
typename VT1
1943 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1944 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1946 selectLargeAddAssignKernel( y, A, x );
// The BLAS-based overload below is only compiled when both BLAZE_BLAS_MODE and
// BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION are set (the matching #endif is
// not visible here).
1952#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS addition assignment kernel, overload selected when
// UseBlasKernel_v<VT1,MT1,VT2> is true.
1966 template<
typename VT1
1969 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1970 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
1972 using ET = ElementType_t<VT1>;
// Triangular matrices: build a temporary from serial( x ), run trmv on that
// temporary (CblasLower or CblasUpper depending on IsLower_v<MT1>), and add
// the result to y.
1974 if( IsTriangular_v<MT1> ) {
1975 ResultType_t<VT1> tmp(
serial( x ) );
1976 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1977 addAssign( y, tmp );
// Otherwise gemv is called with the factors ET(1) and ET(1).
// NOTE(review): presumably alpha=1 and beta=1, i.e. y += A*x - confirm against
// the gemv wrapper.
1980 gemv( y, A, x, ET(1), ET(1) );
// Subtraction assignment of the matrix/vector product to a dense vector.
2004 template<
typename VT1 >
2005 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
// Special case: the matrix has no rows or no columns, or it is strictly
// triangular with exactly one row (branch body not visible here).
2011 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2012 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// General case: dispatch to the kernel selection with the target and the
// operands A and x (their definitions are not visible here).
2024 DMatDVecMultExpr::selectSubAssignKernel( *lhs, A, x );
// Kernel selection for the subtraction assignment of A * x to y.
// Diagonal matrix types and products whose matrix has fewer than
// DMATDVECMULT_THRESHOLD elements use the small kernel; the other visible
// call dispatches to the BLAS kernel selection (part of the condition and the
// `else` line are not visible here).
2040 template<
typename VT1
2043 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2045 if( ( IsDiagonal_v<MT1> ) ||
2047 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
2048 selectSmallSubAssignKernel( y, A, x );
2050 selectBlasSubAssignKernel( y, A, x );
// Default subtraction assignment kernel: hands the product expression A * x to
// the subAssign member of the target vector y.
2069 template<
typename VT1
2072 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2074 y.subAssign( A * x );
// Small-matrix subtraction assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false: it simply forwards to
// the default subtraction assignment kernel.
2093 template<
typename VT1
2096 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2097 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2099 selectDefaultSubAssignKernel( y, A, x );
// Small-matrix subtraction assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true.
// Rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by single
// rows. Per block, one SIMD accumulator per row is built up over the column
// range, reduced with sum() and subtracted from y; a scalar loop handles the
// remaining columns.
2118 template<
typename VT1
2121 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2122 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// The scalar remainder loops only run if A or x is not padded.
2124 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
2126 const size_t M( A.rows() );
2127 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
2131 for( ; (i+8UL) <= M; i+=8UL )
// For lower matrices the column range ends at i+7 (strictly lower) or i+8.
2133 const size_t jbegin( ( IsUpper_v<MT1> )
2136 const size_t jend( ( IsLower_v<MT1> )
2137 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Initialise one accumulator per row from a loaded chunk times x1
// (x1 is defined on a line not visible here).
2149 SIMDType xmm1( A.load(i ,j) * x1 );
2150 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2151 SIMDType xmm3( A.load(i+2UL,j) * x1 );
2152 SIMDType xmm4( A.load(i+3UL,j) * x1 );
2153 SIMDType xmm5( A.load(i+4UL,j) * x1 );
2154 SIMDType xmm6( A.load(i+5UL,j) * x1 );
2155 SIMDType xmm7( A.load(i+6UL,j) * x1 );
2156 SIMDType xmm8( A.load(i+7UL,j) * x1 );
// Accumulate further chunks.
2160 xmm1 += A.load(i ,j) * x1;
2161 xmm2 += A.load(i+1UL,j) * x1;
2162 xmm3 += A.load(i+2UL,j) * x1;
2163 xmm4 += A.load(i+3UL,j) * x1;
2164 xmm5 += A.load(i+4UL,j) * x1;
2165 xmm6 += A.load(i+5UL,j) * x1;
2166 xmm7 += A.load(i+6UL,j) * x1;
2167 xmm8 += A.load(i+7UL,j) * x1;
// Subtract the reduced accumulators from the existing values in y.
2170 y[i ] -=
sum( xmm1 );
2171 y[i+1UL] -=
sum( xmm2 );
2172 y[i+2UL] -=
sum( xmm3 );
2173 y[i+3UL] -=
sum( xmm4 );
2174 y[i+4UL] -=
sum( xmm5 );
2175 y[i+5UL] -=
sum( xmm6 );
2176 y[i+6UL] -=
sum( xmm7 );
2177 y[i+7UL] -=
sum( xmm8 );
// Scalar remainder for the remaining columns up to jend.
2179 for( ; remainder && j<jend; ++j ) {
2180 y[i ] -= A(i ,j) * x[j];
2181 y[i+1UL] -= A(i+1UL,j) * x[j];
2182 y[i+2UL] -= A(i+2UL,j) * x[j];
2183 y[i+3UL] -= A(i+3UL,j) * x[j];
2184 y[i+4UL] -= A(i+4UL,j) * x[j];
2185 y[i+5UL] -= A(i+5UL,j) * x[j];
2186 y[i+6UL] -= A(i+6UL,j) * x[j];
2187 y[i+7UL] -= A(i+7UL,j) * x[j];
// Scalar accumulation (with +=) into value1..value8, starting at the column
// after j; the enclosing branch and the final updates of y are not visible here.
2201 for( ++j; j<jend; ++j ) {
2202 value1 += A(i ,j) * x[j];
2203 value2 += A(i+1UL,j) * x[j];
2204 value3 += A(i+2UL,j) * x[j];
2205 value4 += A(i+3UL,j) * x[j];
2206 value5 += A(i+4UL,j) * x[j];
2207 value6 += A(i+5UL,j) * x[j];
2208 value7 += A(i+6UL,j) * x[j];
2209 value8 += A(i+7UL,j) * x[j];
// --- Blocks of 4 rows (same scheme as above) ---
2223 for( ; (i+4UL) <= M; i+=4UL )
2225 const size_t jbegin( ( IsUpper_v<MT1> )
2228 const size_t jend( ( IsLower_v<MT1> )
2229 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
2241 SIMDType xmm1( A.load(i ,j) * x1 );
2242 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2243 SIMDType xmm3( A.load(i+2UL,j) * x1 );
2244 SIMDType xmm4( A.load(i+3UL,j) * x1 );
2248 xmm1 += A.load(i ,j) * x1;
2249 xmm2 += A.load(i+1UL,j) * x1;
2250 xmm3 += A.load(i+2UL,j) * x1;
2251 xmm4 += A.load(i+3UL,j) * x1;
2254 y[i ] -=
sum( xmm1 );
2255 y[i+1UL] -=
sum( xmm2 );
2256 y[i+2UL] -=
sum( xmm3 );
2257 y[i+3UL] -=
sum( xmm4 );
2259 for( ; remainder && j<jend; ++j ) {
2260 y[i ] -= A(i ,j) * x[j];
2261 y[i+1UL] -= A(i+1UL,j) * x[j];
2262 y[i+2UL] -= A(i+2UL,j) * x[j];
2263 y[i+3UL] -= A(i+3UL,j) * x[j];
2273 for( ++j; j<jend; ++j ) {
2274 value1 += A(i ,j) * x[j];
2275 value2 += A(i+1UL,j) * x[j];
2276 value3 += A(i+2UL,j) * x[j];
2277 value4 += A(i+3UL,j) * x[j];
// --- Blocks of 3 rows ---
2287 for( ; (i+3UL) <= M; i+=3UL )
2289 const size_t jbegin( ( IsUpper_v<MT1> )
2292 const size_t jend( ( IsLower_v<MT1> )
2293 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
2305 SIMDType xmm1( A.load(i ,j) * x1 );
2306 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2307 SIMDType xmm3( A.load(i+2UL,j) * x1 );
2311 xmm1 += A.load(i ,j) * x1;
2312 xmm2 += A.load(i+1UL,j) * x1;
2313 xmm3 += A.load(i+2UL,j) * x1;
2316 y[i ] -=
sum( xmm1 );
2317 y[i+1UL] -=
sum( xmm2 );
2318 y[i+2UL] -=
sum( xmm3 );
2320 for( ; remainder && j<jend; ++j ) {
2321 y[i ] -= A(i ,j) * x[j];
2322 y[i+1UL] -= A(i+1UL,j) * x[j];
2323 y[i+2UL] -= A(i+2UL,j) * x[j];
2332 for( ++j; j<jend; ++j ) {
2333 value1 += A(i ,j) * x[j];
2334 value2 += A(i+1UL,j) * x[j];
2335 value3 += A(i+2UL,j) * x[j];
// --- Blocks of 2 rows ---
2344 for( ; (i+2UL) <= M; i+=2UL )
2346 const size_t jbegin( ( IsUpper_v<MT1> )
2349 const size_t jend( ( IsLower_v<MT1> )
2350 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
2362 SIMDType xmm1( A.load(i ,j) * x1 );
2363 SIMDType xmm2( A.load(i+1UL,j) * x1 );
2367 xmm1 += A.load(i ,j) * x1;
2368 xmm2 += A.load(i+1UL,j) * x1;
2371 y[i ] -=
sum( xmm1 );
2372 y[i+1UL] -=
sum( xmm2 );
2374 for( ; remainder && j<jend; ++j ) {
2375 y[i ] -= A(i ,j) * x[j];
2376 y[i+1UL] -= A(i+1UL,j) * x[j];
2384 for( ++j; j<jend; ++j ) {
2385 value1 += A(i ,j) * x[j];
2386 value2 += A(i+1UL,j) * x[j];
// --- Single row (the enclosing loop/branch header is not visible here) ---
2396 const size_t jbegin( ( IsUpper_v<MT1> )
2399 const size_t jend( ( IsLower_v<MT1> )
2400 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
// Here the vector chunk is loaded directly in the product expression
// (the declaration of xmm1 is on a line not visible here).
2415 xmm1 += A.load(i,j) * x.load(j);
2418 y[i] -=
sum( xmm1 );
2420 for( ; remainder && j<jend; ++j ) {
2421 y[i] -= A(i,j) * x[j];
2428 for( ++j; j<jend; ++j ) {
2429 value += A(i,j) * x[j];
// Large-matrix subtraction assignment kernel, overload selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is false: it simply forwards to
// the default subtraction assignment kernel.
2453 template<
typename VT1
2456 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2457 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
2459 selectDefaultSubAssignKernel( y, A, x );
// Vectorized "large" subtraction-assignment kernel, enabled only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2> is true.
// Rows of A are handled in blocks of 8, 4 and 2 (plus a single-row section);
// every visible update subtracts a SIMD-reduced sum of A.load(...) * x-chunk
// products from the matching element of y.
//   y : vector that is modified, A : matrix operand, x : vector operand.
2478 template<
typename VT1
2481 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2482 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2> >
// A scalar clean-up loop is only needed if the matrix or the vector is unpadded.
2484 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
2486 const size_t M( A.rows() );
2487 const size_t N( A.columns() );
// --- blocks of eight rows ---
2491 for( ; (i+8UL) <= M; i+=8UL )
// Column range is narrowed for upper/lower matrices; a strictly lower matrix
// ends one column earlier than a (non-strictly) lower one.
2493 const size_t jbegin( ( IsUpper_v<MT1> )
2496 const size_t jend( ( IsLower_v<MT1> )
2497 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Four column offsets (j, j1, j2, j3) combined per statement.
2514 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2515 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2516 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2517 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2518 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2519 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2520 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2521 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two column offsets (j, j1) combined per statement.
2528 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2529 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2530 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2531 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2532 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2533 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2534 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2535 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// Single column offset per statement.
2540 y[i ] -=
sum( A.load(i ,j) * x1 );
2541 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2542 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2543 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2544 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2545 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2546 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2547 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
// Element-wise clean-up of the remaining columns (only when 'remainder' is true).
2550 for( ; remainder && j<jend; ++j ) {
2551 y[i ] -= A(i ,j) * x[j];
2552 y[i+1UL] -= A(i+1UL,j) * x[j];
2553 y[i+2UL] -= A(i+2UL,j) * x[j];
2554 y[i+3UL] -= A(i+3UL,j) * x[j];
2555 y[i+4UL] -= A(i+4UL,j) * x[j];
2556 y[i+5UL] -= A(i+5UL,j) * x[j];
2557 y[i+6UL] -= A(i+6UL,j) * x[j];
2558 y[i+7UL] -= A(i+7UL,j) * x[j];
// --- blocks of four rows ---
2562 for( ; (i+4UL) <= M; i+=4UL )
2564 const size_t jbegin( ( IsUpper_v<MT1> )
2567 const size_t jend( ( IsLower_v<MT1> )
2568 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
2585 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2586 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2587 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2588 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2595 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2596 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2597 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2598 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2603 y[i ] -=
sum( A.load(i ,j) * x1 );
2604 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2605 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2606 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2609 for( ; remainder && j<jend; ++j ) {
2610 y[i ] -= A(i ,j) * x[j];
2611 y[i+1UL] -= A(i+1UL,j) * x[j];
2612 y[i+2UL] -= A(i+2UL,j) * x[j];
2613 y[i+3UL] -= A(i+3UL,j) * x[j];
// --- blocks of two rows ---
2617 for( ; (i+2UL) <= M; i+=2UL )
2619 const size_t jbegin( ( IsUpper_v<MT1> )
2622 const size_t jend( ( IsLower_v<MT1> )
2623 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
2640 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2641 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2648 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2649 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2654 y[i ] -=
sum( A.load(i ,j) * x1 );
2655 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2658 for( ; remainder && j<jend; ++j ) {
2659 y[i ] -= A(i ,j) * x[j];
2660 y[i+1UL] -= A(i+1UL,j) * x[j];
// --- single row ---
// NOTE(review): the statement guarding this section is not visible here;
// presumably it handles one leftover row - confirm.
2666 const size_t jbegin( ( IsUpper_v<MT1> )
2669 const size_t jend( ( IsLower_v<MT1> )
2670 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
2687 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2694 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2699 y[i] -=
sum( A.load(i,j) * x1 );
2702 for( ; remainder && j<jend; ++j ) {
2703 y[i] -= A(i,j) * x[j];
// Fallback for the BLAS-based subtraction-assignment kernel.
// Selected when UseBlasKernel_v<VT1,MT1,VT2> is false; forwards to the
// "large" subtraction-assignment kernel instead.
//   y : vector that is modified, A : matrix operand, x : vector operand.
2724 template<
typename VT1
2727 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2728 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2730 selectLargeSubAssignKernel( y, A, x );
2736#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based subtraction-assignment kernel, enabled only when
// UseBlasKernel_v<VT1,MT1,VT2> is true.
//   y : vector that is modified, A : matrix operand, x : vector operand.
2750 template<
typename VT1
2753 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2754 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2> >
2756 using ET = ElementType_t<VT1>;
// Triangular matrices: copy x into a temporary, let trmv() transform the
// temporary with A (lower/upper chosen at compile time), then subtract it from y.
2758 if( IsTriangular_v<MT1> ) {
2759 ResultType_t<VT1> tmp(
serial( x ) );
2760 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2761 subAssign( y, tmp );
// Otherwise a single gemv() call with the scalar factors ET(-1) and ET(1).
// NOTE(review): presumably this evaluates y = -1*A*x + 1*y - confirm against the gemv wrapper.
2764 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication assignment of the matrix/vector product expression to a
// dense (non-transposed) vector.
//   lhs : target dense vector, rhs : the multiplication expression.
// The final visible step applies multAssign() to the dereferenced target and 'tmp'.
// NOTE(review): the construction of 'tmp' is not visible here; presumably it
// holds the evaluated right-hand side - confirm.
2788 template<
typename VT1 >
2789 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
2800 multAssign( *lhs, tmp );
// Division assignment of the matrix/vector product expression to a
// dense (non-transposed) vector.
//   lhs : target dense vector, rhs : the multiplication expression.
// The final visible step applies divAssign() to the dereferenced target and 'tmp'.
// NOTE(review): the construction of 'tmp' is not visible here; presumably it
// holds the evaluated right-hand side - confirm.
2822 template<
typename VT1 >
2823 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
2834 divAssign( *lhs, tmp );
// Overload that only participates when UseSMPAssign_v<VT1> is true.
// NOTE(review): the function name/signature line is not visible here; the
// checks below mirror a plain assignment, so this is presumably the SMP
// assignment to a dense vector - confirm.
2858 template<
typename VT1 >
2860 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Special case 1: the matrix has no rows.
2866 if( rhs.mat_.rows() == 0UL ) {
// Special case 2: the matrix has no columns, or it is strictly triangular
// with exactly one column.
2869 else if( rhs.mat_.columns() == 0UL ||
2870 ( IsStrictlyTriangular_v<MT> && rhs.mat_.columns() == 1UL ) ) {
// Overload that only participates when UseSMPAssign_v<VT1> is true.
// NOTE(review): name, parameters and body are not visible here - confirm
// which SMP operation this declares.
2903 template<
typename VT1 >
2905 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Overload that only participates when UseSMPAssign_v<VT1> is true.
// NOTE(review): the function name/signature line is not visible here;
// presumably a compound SMP assignment - confirm.
2936 template<
typename VT1 >
2938 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Special case: the matrix has no rows or no columns, or it is strictly
// triangular with exactly one row.
2944 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2945 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Overload that only participates when UseSMPAssign_v<VT1> is true.
// NOTE(review): the function name/signature line is not visible here;
// presumably a compound SMP assignment - confirm.
2981 template<
typename VT1 >
2983 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Special case: the matrix has no rows or no columns, or it is strictly
// triangular with exactly one row.
2989 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2990 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Overload that only participates when UseSMPAssign_v<VT1> is true.
// NOTE(review): name, parameters and body are not visible here - confirm
// which SMP operation this declares.
3026 template<
typename VT1 >
3028 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Overload that only participates when UseSMPAssign_v<VT1> is true.
// NOTE(review): name, parameters and body are not visible here - confirm
// which SMP operation this declares.
3063 template<
typename VT1 >
3065 -> EnableIf_t< UseSMPAssign_v<VT1> >
3114template<
typename MT
3117class DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >
3118 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false > >
3119 ,
private Computation
// Shorthand type aliases used throughout this specialization.
// MVM: the wrapped dense matrix/dense vector multiplication expression.
3123 using MVM = DMatDVecMultExpr<MT,VT>;
// RES: result type of the multiplication expression.
3124 using RES = ResultType_t<MVM>;
// MRT / VRT: result types of the matrix and vector operand types.
3125 using MRT = ResultType_t<MT>;
3126 using VRT = ResultType_t<VT>;
// MET / VET: element types of those result types.
3127 using MET = ElementType_t<MRT>;
3128 using VET = ElementType_t<VRT>;
// MCT / VCT: composite types of the matrix and vector operand types.
3129 using MCT = CompositeType_t<MT>;
3130 using VCT = CompositeType_t<VT>;
// Compile-time switch for the matrix operand. It is true when the matrix type
// is a computation whose element type equals the vector element type and is
// BLAS compatible, or when the matrix type requires an evaluation.
// It selects between 'const MRT' and 'MCT' for the LT alias.
3135 static constexpr bool evaluateMatrix =
3136 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
3137 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// Compile-time switch for the vector operand. It is true when the vector type
// is a computation or when the vector type itself requires an evaluation, and
// it selects between 'const VRT' and 'VCT' for the RT alias.
// The second test must inspect VT (the vector type), not MT: whether the
// matrix needs an evaluation is already covered by evaluateMatrix and says
// nothing about the vector operand.
3142 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
// Helper flag for the SMP overloads: true whenever the matrix or the vector
// operand has to be evaluated. The template parameter T1 does not influence
// the value.
3150 template<
typename T1 >
3151 static constexpr bool UseSMPAssign_v = ( evaluateMatrix || evaluateVector );
// Helper flag deciding whether the BLAS kernel may be used for the type
// combination T1 (target vector), T2 (matrix), T3 (vector), T4 (scalar).
// NOTE(review): the first line(s) of the initializer are not visible here;
// the conditions below are only the visible part of the conjunction.
3158 template<
typename T1,
typename T2,
typename T3,
typename T4 >
3159 static constexpr bool UseBlasKernel_v =
// All three operands need contiguous storage with direct data access
// (mutable for the target, const for the sources).
3161 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
3162 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
3163 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
// Diagonal matrices are excluded.
3164 !IsDiagonal_v<T2> &&
3165 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
// Element types must be BLAS compatible and identical across all operands.
3166 IsBLASCompatible_v< ElementType_t<T1> > &&
3167 IsBLASCompatible_v< ElementType_t<T2> > &&
3168 IsBLASCompatible_v< ElementType_t<T3> > &&
3169 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
3170 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
// A complex scalar combined with builtin element types is excluded.
3171 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Helper flag deciding whether the hand-vectorized default kernels may be used
// for the type combination T1 (target vector), T2 (matrix), T3 (vector),
// T4 (scalar). Requires the optimized-kernel switch, a non-diagonal matrix,
// SIMD-enabled operands and SIMD addition/multiplication support for the
// matrix/vector element types.
// NOTE(review): the remaining arguments of IsSIMDCombinable_v are not visible here.
3179 template<
typename T1,
typename T2,
typename T3,
typename T4 >
3180 static constexpr bool UseVectorizedDefaultKernel_v =
3181 ( useOptimizedKernels &&
3182 !IsDiagonal_v<T2> &&
3183 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3184 IsSIMDCombinable_v< ElementType_t<T1>
3188 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
3189 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// This: the type of this scaled-multiplication expression specialization.
3195 using This = DVecScalarMultExpr<MVM,ST,false>;
// BaseType: the base class this specialization derives from.
3198 using BaseType = VecScalarMultExpr< DenseVector<This,false> >;
// SIMDType: SIMD type associated with ElementType.
3203 using SIMDType = SIMDTrait_t<ElementType>;
// LeftOperand: the (const) matrix/vector multiplication expression being scaled.
3208 using LeftOperand =
const DMatDVecMultExpr<MT,VT>;
// LT / RT: operand types used inside the kernels; the evaluated result type
// is chosen when the corresponding evaluate* switch is set, otherwise the
// composite type.
3214 using LT = If_t< evaluateMatrix, const MRT, MCT >;
3217 using RT = If_t< evaluateVector, const VRT, VCT >;
// NOTE(review): the declarators of the two initializers below are not visible
// here. From their operands they look like the SIMD-enable and SMP-assignable
// compile-time flags of the expression - confirm.
// First initializer: non-diagonal matrix, SIMD-enabled operands, SIMD-combinable
// element/scalar types, and SIMD add/mult support for the element types.
3223 ( !IsDiagonal_v<MT> &&
3224 MT::simdEnabled && VT::simdEnabled &&
3225 IsSIMDCombinable_v<MET,VET,ST> &&
3226 HasSIMDAdd_v<MET,VET> &&
3227 HasSIMDMult_v<MET,VET> );
// Second initializer: neither operand needs evaluation and both operand types
// are SMP assignable.
3231 ( !evaluateMatrix && MT::smpAssignable && !evaluateVector && VT::smpAssignable );
// Bounds-checked element access: compares 'index' against vector_.size() and,
// on the in-range path, delegates to the subscript operator.
// NOTE(review): the enclosing signature and the action taken for an
// out-of-range index are not visible here.
3271 if( index >=
vector_.size() ) {
3274 return (*
this)[index];
// Returns the size of the expression as a size_t (const member).
// NOTE(review): the body is not visible here.
3283 inline size_t size()
const {
// Reports whether the expression can alias with the given address.
//   alias : address to check.
// The query is forwarded unchanged to the wrapped multiplication expression.
3314 template<
typename T >
3315 inline bool canAlias(
const T* alias )
const {
3316 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the given address.
//   alias : address to check.
// The query is forwarded unchanged to the wrapped multiplication expression.
3326 template<
typename T >
3327 inline bool isAliased(
const T* alias )
const {
3328 return vector_.isAliased( alias );
// NOTE(review): the enclosing signature and the start of the returned
// expression are not visible here; presumably this is the SMP-assignment
// predicate of the expression - confirm.
// Fetch the matrix operand of the wrapped multiplication expression.
3348 LeftOperand_t<MVM> A(
vector_.leftOperand() );
// Visible conditions: (unevaluated computation matrix OR the matrix element
// count is below DMATDVECMULT_THRESHOLD) AND the expression size exceeds
// SMP_DMATDVECMULT_THRESHOLD.
3352 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3353 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
3354 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Assignment of the scaled matrix/vector product to a dense (non-transposed) vector.
//   lhs : target dense vector, rhs : the scaled multiplication expression.
3376 template<
typename VT1 >
3377 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Extract the matrix and vector operands of the wrapped multiplication.
3383 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
3384 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case 1: the matrix has no rows.
3386 if( left.rows() == 0UL ) {
// Special case 2: the matrix has no columns, or it is strictly triangular
// with exactly one column.
3389 else if( left.columns() == 0UL ||
3390 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
// General case: dispatch to the kernel selection with the scalar factor.
// NOTE(review): the definitions of 'A' and 'x' are not visible here;
// presumably they are the (possibly evaluated) operands - confirm.
3403 DVecScalarMultExpr::selectAssignKernel( *lhs, A, x, rhs.scalar_ );
// Kernel selection for the scaled assignment.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// The "small" kernel is chosen for diagonal matrices, for computation matrices
// that are not evaluated, or when rows*columns is below DMATDVECMULT_THRESHOLD;
// the BLAS kernel selection is called on the other visible path.
3418 template<
typename VT1
3422 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3424 if( ( IsDiagonal_v<MT1> ) ||
3425 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3426 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3427 selectSmallAssignKernel( y, A, x, scalar );
3429 selectBlasAssignKernel( y, A, x, scalar );
// Default (non-vectorized) scaled assignment kernel, selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// Assigns the expression A * x * scalar to y via y.assign().
3447 template<
typename VT1
3451 static inline auto selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3452 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3454 y.assign( A * x * scalar );
// Fallback "small" scaled assignment kernel, selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false; forwards to the
// default assignment kernel.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
3472 template<
typename VT1
3476 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3477 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3479 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized "small" scaled assignment kernel, enabled only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// Rows are handled in blocks of 8, 4, 3 and 2 (plus a single-row section).
// In each block there are two visible paths: a SIMD path that accumulates
// A.load(...) * x.load(...) products in xmm registers and stores the reduced,
// scaled sums, and a scalar path that accumulates into value* variables and
// stores the scaled result.
// NOTE(review): the condition choosing between the two paths is not visible here.
3497 template<
typename VT1
3501 static inline auto selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3502 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar clean-up loop is only needed if the matrix or the vector is unpadded.
3504 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3506 const size_t M( A.rows() );
3507 const size_t N( A.columns() );
// --- blocks of eight rows ---
3511 for( ; (i+8UL) <= M; i+=8UL )
// Column range is narrowed for upper/lower matrices.
3513 const size_t jbegin( ( IsUpper_v<MT1> )
3516 const size_t jend( ( IsLower_v<MT1> )
3517 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// SIMD path: initialize the eight accumulators from the first chunk of x.
3528 SIMDType x1( x.load(j) );
3529 SIMDType xmm1( A.load(i ,j) * x1 );
3530 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3531 SIMDType xmm3( A.load(i+2UL,j) * x1 );
3532 SIMDType xmm4( A.load(i+3UL,j) * x1 );
3533 SIMDType xmm5( A.load(i+4UL,j) * x1 );
3534 SIMDType xmm6( A.load(i+5UL,j) * x1 );
3535 SIMDType xmm7( A.load(i+6UL,j) * x1 );
3536 SIMDType xmm8( A.load(i+7UL,j) * x1 );
// Accumulate further chunks.
3540 xmm1 += A.load(i ,j) * x1;
3541 xmm2 += A.load(i+1UL,j) * x1;
3542 xmm3 += A.load(i+2UL,j) * x1;
3543 xmm4 += A.load(i+3UL,j) * x1;
3544 xmm5 += A.load(i+4UL,j) * x1;
3545 xmm6 += A.load(i+5UL,j) * x1;
3546 xmm7 += A.load(i+6UL,j) * x1;
3547 xmm8 += A.load(i+7UL,j) * x1;
// Reduce each accumulator, scale it and store (plain assignment).
3550 y[i ] =
sum( xmm1 ) * scalar;
3551 y[i+1UL] =
sum( xmm2 ) * scalar;
3552 y[i+2UL] =
sum( xmm3 ) * scalar;
3553 y[i+3UL] =
sum( xmm4 ) * scalar;
3554 y[i+4UL] =
sum( xmm5 ) * scalar;
3555 y[i+5UL] =
sum( xmm6 ) * scalar;
3556 y[i+6UL] =
sum( xmm7 ) * scalar;
3557 y[i+7UL] =
sum( xmm8 ) * scalar;
// Element-wise clean-up of the remaining columns, added on top of the stored sums.
3559 for( ; remainder && j<jend; ++j ) {
3560 y[i ] += A(i ,j) * x[j] * scalar;
3561 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3562 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3563 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3564 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3565 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3566 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3567 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// Scalar path: accumulate unscaled products, then scale once when storing.
3581 for( ++j; j<jend; ++j ) {
3582 value1 += A(i ,j) * x[j];
3583 value2 += A(i+1UL,j) * x[j];
3584 value3 += A(i+2UL,j) * x[j];
3585 value4 += A(i+3UL,j) * x[j];
3586 value5 += A(i+4UL,j) * x[j];
3587 value6 += A(i+5UL,j) * x[j];
3588 value7 += A(i+6UL,j) * x[j];
3589 value8 += A(i+7UL,j) * x[j];
3592 y[i ] = value1 * scalar;
3593 y[i+1UL] = value2 * scalar;
3594 y[i+2UL] = value3 * scalar;
3595 y[i+3UL] = value4 * scalar;
3596 y[i+4UL] = value5 * scalar;
3597 y[i+5UL] = value6 * scalar;
3598 y[i+6UL] = value7 * scalar;
3599 y[i+7UL] = value8 * scalar;
// --- blocks of four rows ---
3603 for( ; (i+4UL) <= M; i+=4UL )
3605 const size_t jbegin( ( IsUpper_v<MT1> )
3608 const size_t jend( ( IsLower_v<MT1> )
3609 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3620 SIMDType x1( x.load(j) );
3621 SIMDType xmm1( A.load(i ,j) * x1 );
3622 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3623 SIMDType xmm3( A.load(i+2UL,j) * x1 );
3624 SIMDType xmm4( A.load(i+3UL,j) * x1 );
3628 xmm1 += A.load(i ,j) * x1;
3629 xmm2 += A.load(i+1UL,j) * x1;
3630 xmm3 += A.load(i+2UL,j) * x1;
3631 xmm4 += A.load(i+3UL,j) * x1;
3634 y[i ] =
sum( xmm1 ) * scalar;
3635 y[i+1UL] =
sum( xmm2 ) * scalar;
3636 y[i+2UL] =
sum( xmm3 ) * scalar;
3637 y[i+3UL] =
sum( xmm4 ) * scalar;
3639 for( ; remainder && j<jend; ++j ) {
3640 y[i ] += A(i ,j) * x[j] * scalar;
3641 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3642 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3643 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3653 for( ++j; j<jend; ++j ) {
3654 value1 += A(i ,j) * x[j];
3655 value2 += A(i+1UL,j) * x[j];
3656 value3 += A(i+2UL,j) * x[j];
3657 value4 += A(i+3UL,j) * x[j];
3660 y[i ] = value1 * scalar;
3661 y[i+1UL] = value2 * scalar;
3662 y[i+2UL] = value3 * scalar;
3663 y[i+3UL] = value4 * scalar;
// --- blocks of three rows ---
3667 for( ; (i+3UL) <= M; i+=3UL )
3669 const size_t jbegin( ( IsUpper_v<MT1> )
3672 const size_t jend( ( IsLower_v<MT1> )
3673 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
3684 SIMDType x1( x.load(j) );
3685 SIMDType xmm1( A.load(i ,j) * x1 );
3686 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3687 SIMDType xmm3( A.load(i+2UL,j) * x1 );
3691 xmm1 += A.load(i ,j) * x1;
3692 xmm2 += A.load(i+1UL,j) * x1;
3693 xmm3 += A.load(i+2UL,j) * x1;
3696 y[i ] =
sum( xmm1 ) * scalar;
3697 y[i+1UL] =
sum( xmm2 ) * scalar;
3698 y[i+2UL] =
sum( xmm3 ) * scalar;
3700 for( ; remainder && j<jend; ++j ) {
3701 y[i ] += A(i ,j) * x[j] * scalar;
3702 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3703 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3712 for( ++j; j<jend; ++j ) {
3713 value1 += A(i ,j) * x[j];
3714 value2 += A(i+1UL,j) * x[j];
3715 value3 += A(i+2UL,j) * x[j];
3718 y[i ] = value1 * scalar;
3719 y[i+1UL] = value2 * scalar;
3720 y[i+2UL] = value3 * scalar;
// --- blocks of two rows ---
3724 for( ; (i+2UL) <= M; i+=2UL )
3726 const size_t jbegin( ( IsUpper_v<MT1> )
3729 const size_t jend( ( IsLower_v<MT1> )
3730 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
3741 SIMDType x1( x.load(j) );
3742 SIMDType xmm1( A.load(i ,j) * x1 );
3743 SIMDType xmm2( A.load(i+1UL,j) * x1 );
3747 xmm1 += A.load(i ,j) * x1;
3748 xmm2 += A.load(i+1UL,j) * x1;
3751 y[i ] =
sum( xmm1 ) * scalar;
3752 y[i+1UL] =
sum( xmm2 ) * scalar;
3754 for( ; remainder && j<jend; ++j ) {
3755 y[i ] += A(i ,j) * x[j] * scalar;
3756 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3764 for( ++j; j<jend; ++j ) {
3765 value1 += A(i ,j) * x[j];
3766 value2 += A(i+1UL,j) * x[j];
3769 y[i ] = value1 * scalar;
3770 y[i+1UL] = value2 * scalar;
// --- single row ---
// NOTE(review): the statement guarding this section is not visible here;
// presumably it handles one leftover row - confirm.
3776 const size_t jbegin( ( IsUpper_v<MT1> )
3779 const size_t jend( ( IsLower_v<MT1> )
3780 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
3791 SIMDType xmm1( A.load(i,j) * x.load(j) );
3794 xmm1 += A.load(i,j) * x.load(j);
3797 y[i] =
sum( xmm1 ) * scalar;
3799 for( ; remainder && j<jend; ++j ) {
3800 y[i] += A(i,j) * x[j] * scalar;
3807 for( ++j; j<jend; ++j ) {
3808 value += A(i,j) * x[j];
3811 y[i] = value * scalar;
// Fallback "large" scaled assignment kernel, selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false; forwards to the
// default assignment kernel.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
3831 template<
typename VT1
3835 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3836 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
3838 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized "large" scaled assignment kernel, enabled only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// Rows are handled in blocks of 8, 4 and 2 (plus a single-row section); every
// visible update ADDS a SIMD-reduced sum of A.load(...) * x-chunk products to y.
// NOTE(review): none of the visible statements initializes y or applies
// 'scalar'; presumably both happen in statements not visible here - confirm.
3856 template<
typename VT1
3860 static inline auto selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3861 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar clean-up loop is only needed if the matrix or the vector is unpadded.
3863 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
3865 const size_t M( A.rows() );
3866 const size_t N( A.columns() );
// --- blocks of eight rows ---
3872 for( ; (i+8UL) <= M; i+=8UL )
// Column range is narrowed for upper/lower matrices.
3874 const size_t jbegin( ( IsUpper_v<MT1> )
3877 const size_t jend( ( IsLower_v<MT1> )
3878 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Four column offsets (j, j1, j2, j3) combined per statement.
3891 const SIMDType x1( x.load(j ) );
3892 const SIMDType x2( x.load(j1) );
3893 const SIMDType x3( x.load(j2) );
3894 const SIMDType x4( x.load(j3) );
3895 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3896 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3897 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3898 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3899 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3900 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3901 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3902 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two column offsets (j, j1) combined per statement.
3907 const SIMDType x1( x.load(j ) );
3908 const SIMDType x2( x.load(j1) );
3909 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3910 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3911 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3912 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3913 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3914 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3915 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3916 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// Single column offset per statement.
3920 const SIMDType x1( x.load(j) );
3921 y[i ] +=
sum( A.load(i ,j) * x1 );
3922 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3923 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3924 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3925 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3926 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3927 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3928 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Element-wise clean-up of the remaining columns (only when 'remainder' is true).
3931 for( ; remainder && j<jend; ++j ) {
3932 y[i ] += A(i ,j) * x[j];
3933 y[i+1UL] += A(i+1UL,j) * x[j];
3934 y[i+2UL] += A(i+2UL,j) * x[j];
3935 y[i+3UL] += A(i+3UL,j) * x[j];
3936 y[i+4UL] += A(i+4UL,j) * x[j];
3937 y[i+5UL] += A(i+5UL,j) * x[j];
3938 y[i+6UL] += A(i+6UL,j) * x[j];
3939 y[i+7UL] += A(i+7UL,j) * x[j];
// --- blocks of four rows ---
3952 for( ; (i+4UL) <= M; i+=4UL )
3954 const size_t jbegin( ( IsUpper_v<MT1> )
3957 const size_t jend( ( IsLower_v<MT1> )
3958 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
3971 const SIMDType x1( x.load(j ) );
3972 const SIMDType x2( x.load(j1) );
3973 const SIMDType x3( x.load(j2) );
3974 const SIMDType x4( x.load(j3) );
3975 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3976 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3977 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3978 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3983 const SIMDType x1( x.load(j ) );
3984 const SIMDType x2( x.load(j1) );
3985 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3986 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3987 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3988 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3992 const SIMDType x1( x.load(j) );
3993 y[i ] +=
sum( A.load(i ,j) * x1 );
3994 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3995 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3996 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3999 for( ; remainder && j<jend; ++j ) {
4000 y[i ] += A(i ,j) * x[j];
4001 y[i+1UL] += A(i+1UL,j) * x[j];
4002 y[i+2UL] += A(i+2UL,j) * x[j];
4003 y[i+3UL] += A(i+3UL,j) * x[j];
// --- blocks of two rows ---
4012 for( ; (i+2UL) <= M; i+=2UL )
4014 const size_t jbegin( ( IsUpper_v<MT1> )
4017 const size_t jend( ( IsLower_v<MT1> )
4018 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4031 const SIMDType x1( x.load(j ) );
4032 const SIMDType x2( x.load(j1) );
4033 const SIMDType x3( x.load(j2) );
4034 const SIMDType x4( x.load(j3) );
4035 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
4036 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
4041 const SIMDType x1( x.load(j ) );
4042 const SIMDType x2( x.load(j1) );
4043 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
4044 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
4048 const SIMDType x1( x.load(j) );
4049 y[i ] +=
sum( A.load(i ,j) * x1 );
4050 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
4053 for( ; remainder && j<jend; ++j ) {
4054 y[i ] += A(i ,j) * x[j];
4055 y[i+1UL] += A(i+1UL,j) * x[j];
// --- single row ---
// NOTE(review): the statement guarding this section is not visible here;
// presumably it handles one leftover row - confirm.
4064 const size_t jbegin( ( IsUpper_v<MT1> )
4067 const size_t jend( ( IsLower_v<MT1> )
4068 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4081 const SIMDType x1( x.load(j ) );
4082 const SIMDType x2( x.load(j1) );
4083 const SIMDType x3( x.load(j2) );
4084 const SIMDType x4( x.load(j3) );
4085 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
4090 const SIMDType x1( x.load(j ) );
4091 const SIMDType x2( x.load(j1) );
4092 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
4096 const SIMDType x1( x.load(j) );
4097 y[i] +=
sum( A.load(i,j) * x1 );
4100 for( ; remainder && j<jend; ++j ) {
4101 y[i] += A(i,j) * x[j];
// Fallback for the BLAS-based scaled assignment kernel, selected when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is false; forwards to the "large"
// assignment kernel.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
4123 template<
typename VT1
4127 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4128 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4130 selectLargeAssignKernel( y, A, x, scalar );
4135#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based scaled assignment kernel, enabled only when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is true.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
4149 template<
typename VT1
4153 static inline auto selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4154 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4156 using ET = ElementType_t<VT1>;
// Triangular matrices: store the scaled vector in y first, then let trmv()
// transform y in place with A (lower/upper chosen at compile time).
4158 if( IsTriangular_v<MT1> ) {
4159 assign( y, scalar * x );
4160 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// Otherwise a single gemv() call with the factors ET(scalar) and ET(0).
// NOTE(review): presumably this evaluates y = scalar*A*x + 0*y - confirm against the gemv wrapper.
4163 gemv( y, A, x,
ET(scalar),
ET(0) );
// Assignment of the scaled matrix/vector product to a sparse (non-transposed) vector.
//   lhs : target sparse vector, rhs : the scaled multiplication expression.
// The final visible step assigns 'tmp' to the dereferenced target.
// NOTE(review): the construction of 'tmp' is not visible here; presumably it
// holds the evaluated right-hand side - confirm.
4181 template<
typename VT1 >
4182 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4193 assign( *lhs, tmp );
// Addition assignment of the scaled matrix/vector product to a dense
// (non-transposed) vector.
//   lhs : target dense vector, rhs : the scaled multiplication expression.
4209 template<
typename VT1 >
4210 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Extract the matrix and vector operands of the wrapped multiplication.
4216 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
4217 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: the matrix has no rows or no columns, or it is strictly
// triangular with exactly one row.
4219 if( left.rows() == 0UL || left.columns() == 0UL ||
4220 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
// General case: dispatch to the kernel selection with the scalar factor.
// NOTE(review): the definitions of 'A' and 'x' are not visible here;
// presumably they are the (possibly evaluated) operands - confirm.
4232 DVecScalarMultExpr::selectAddAssignKernel( *lhs, A, x, rhs.scalar_ );
// Kernel selection for the scaled addition assignment.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// The "small" kernel is chosen for diagonal matrices, for computation matrices
// that are not evaluated, or when rows*columns is below DMATDVECMULT_THRESHOLD;
// the BLAS kernel selection is called on the other visible path.
4247 template<
typename VT1
4251 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4253 if( ( IsDiagonal_v<MT1> ) ||
4254 ( IsComputation_v<MT> && !evaluateMatrix ) ||
4255 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4256 selectSmallAddAssignKernel( y, A, x, scalar );
4258 selectBlasAddAssignKernel( y, A, x, scalar );
// Default (non-vectorized) scaled addition assignment kernel.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// Adds the expression A * x * scalar to y via y.addAssign().
4276 template<
typename VT1
4280 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4282 y.addAssign( A * x * scalar );
// Fallback "small" scaled addition assignment kernel, selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false; forwards to the
// default addition assignment kernel.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
4300 template<
typename VT1
4304 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4305 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4307 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized "small" scaled addition assignment kernel, enabled only when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
// Rows are handled in blocks of 8, 4, 3 and 2 (plus a single-row section).
// In each block there are two visible paths: a SIMD path that accumulates
// A.load(...) * x.load(...) products in xmm registers and adds the reduced,
// scaled sums to y, and a scalar path that accumulates into value* variables
// and adds the scaled result to y.
// NOTE(review): the condition choosing between the two paths is not visible here.
4325 template<
typename VT1
4329 static inline auto selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4330 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar clean-up loop is only needed if the matrix or the vector is unpadded.
4332 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4334 const size_t M( A.rows() );
4335 const size_t N( A.columns() );
// --- blocks of eight rows ---
4339 for( ; (i+8UL) <= M; i+=8UL )
// Column range is narrowed for upper/lower matrices.
4341 const size_t jbegin( ( IsUpper_v<MT1> )
4344 const size_t jend( ( IsLower_v<MT1> )
4345 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// SIMD path: initialize the eight accumulators from the first chunk of x.
4356 SIMDType x1( x.load(j) );
4357 SIMDType xmm1( A.load(i ,j) * x1 );
4358 SIMDType xmm2( A.load(i+1UL,j) * x1 );
4359 SIMDType xmm3( A.load(i+2UL,j) * x1 );
4360 SIMDType xmm4( A.load(i+3UL,j) * x1 );
4361 SIMDType xmm5( A.load(i+4UL,j) * x1 );
4362 SIMDType xmm6( A.load(i+5UL,j) * x1 );
4363 SIMDType xmm7( A.load(i+6UL,j) * x1 );
4364 SIMDType xmm8( A.load(i+7UL,j) * x1 );
// Accumulate further chunks.
4368 xmm1 += A.load(i ,j) * x1;
4369 xmm2 += A.load(i+1UL,j) * x1;
4370 xmm3 += A.load(i+2UL,j) * x1;
4371 xmm4 += A.load(i+3UL,j) * x1;
4372 xmm5 += A.load(i+4UL,j) * x1;
4373 xmm6 += A.load(i+5UL,j) * x1;
4374 xmm7 += A.load(i+6UL,j) * x1;
4375 xmm8 += A.load(i+7UL,j) * x1;
// Reduce each accumulator, scale it and add it to y.
4378 y[i ] +=
sum( xmm1 ) * scalar;
4379 y[i+1UL] +=
sum( xmm2 ) * scalar;
4380 y[i+2UL] +=
sum( xmm3 ) * scalar;
4381 y[i+3UL] +=
sum( xmm4 ) * scalar;
4382 y[i+4UL] +=
sum( xmm5 ) * scalar;
4383 y[i+5UL] +=
sum( xmm6 ) * scalar;
4384 y[i+6UL] +=
sum( xmm7 ) * scalar;
4385 y[i+7UL] +=
sum( xmm8 ) * scalar;
// Element-wise clean-up of the remaining columns (only when 'remainder' is true).
4387 for( ; remainder && j<jend; ++j ) {
4388 y[i ] += A(i ,j) * x[j] * scalar;
4389 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4390 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4391 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4392 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4393 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4394 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4395 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// Scalar path: accumulate unscaled products, then scale once when adding to y.
4409 for( ++j; j<jend; ++j ) {
4410 value1 += A(i ,j) * x[j];
4411 value2 += A(i+1UL,j) * x[j];
4412 value3 += A(i+2UL,j) * x[j];
4413 value4 += A(i+3UL,j) * x[j];
4414 value5 += A(i+4UL,j) * x[j];
4415 value6 += A(i+5UL,j) * x[j];
4416 value7 += A(i+6UL,j) * x[j];
4417 value8 += A(i+7UL,j) * x[j];
4420 y[i ] += value1 * scalar;
4421 y[i+1UL] += value2 * scalar;
4422 y[i+2UL] += value3 * scalar;
4423 y[i+3UL] += value4 * scalar;
4424 y[i+4UL] += value5 * scalar;
4425 y[i+5UL] += value6 * scalar;
4426 y[i+6UL] += value7 * scalar;
4427 y[i+7UL] += value8 * scalar;
// --- blocks of four rows ---
4431 for( ; (i+4UL) <= M; i+=4UL )
4433 const size_t jbegin( ( IsUpper_v<MT1> )
4436 const size_t jend( ( IsLower_v<MT1> )
4437 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4448 SIMDType x1( x.load(j) );
4449 SIMDType xmm1( A.load(i ,j) * x1 );
4450 SIMDType xmm2( A.load(i+1UL,j) * x1 );
4451 SIMDType xmm3( A.load(i+2UL,j) * x1 );
4452 SIMDType xmm4( A.load(i+3UL,j) * x1 );
4456 xmm1 += A.load(i ,j) * x1;
4457 xmm2 += A.load(i+1UL,j) * x1;
4458 xmm3 += A.load(i+2UL,j) * x1;
4459 xmm4 += A.load(i+3UL,j) * x1;
4462 y[i ] +=
sum( xmm1 ) * scalar;
4463 y[i+1UL] +=
sum( xmm2 ) * scalar;
4464 y[i+2UL] +=
sum( xmm3 ) * scalar;
4465 y[i+3UL] +=
sum( xmm4 ) * scalar;
4467 for( ; remainder && j<jend; ++j ) {
4468 y[i ] += A(i ,j) * x[j] * scalar;
4469 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4470 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4471 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4481 for( ++j; j<jend; ++j ) {
4482 value1 += A(i ,j) * x[j];
4483 value2 += A(i+1UL,j) * x[j];
4484 value3 += A(i+2UL,j) * x[j];
4485 value4 += A(i+3UL,j) * x[j];
4488 y[i ] += value1 * scalar;
4489 y[i+1UL] += value2 * scalar;
4490 y[i+2UL] += value3 * scalar;
4491 y[i+3UL] += value4 * scalar;
// --- blocks of three rows ---
4495 for( ; (i+3UL) <= M; i+=3UL )
4497 const size_t jbegin( ( IsUpper_v<MT1> )
4500 const size_t jend( ( IsLower_v<MT1> )
4501 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
4512 SIMDType x1( x.load(j) );
4513 SIMDType xmm1( A.load(i ,j) * x1 );
4514 SIMDType xmm2( A.load(i+1UL,j) * x1 );
4515 SIMDType xmm3( A.load(i+2UL,j) * x1 );
4519 xmm1 += A.load(i ,j) * x1;
4520 xmm2 += A.load(i+1UL,j) * x1;
4521 xmm3 += A.load(i+2UL,j) * x1;
4524 y[i ] +=
sum( xmm1 ) * scalar;
4525 y[i+1UL] +=
sum( xmm2 ) * scalar;
4526 y[i+2UL] +=
sum( xmm3 ) * scalar;
4528 for( ; remainder && j<jend; ++j ) {
4529 y[i ] += A(i ,j) * x[j] * scalar;
4530 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4531 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4540 for( ++j; j<jend; ++j ) {
4541 value1 += A(i ,j) * x[j];
4542 value2 += A(i+1UL,j) * x[j];
4543 value3 += A(i+2UL,j) * x[j];
4546 y[i ] += value1 * scalar;
4547 y[i+1UL] += value2 * scalar;
4548 y[i+2UL] += value3 * scalar;
// --- blocks of two rows ---
4552 for( ; (i+2UL) <= M; i+=2UL )
4554 const size_t jbegin( ( IsUpper_v<MT1> )
4557 const size_t jend( ( IsLower_v<MT1> )
4558 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4569 SIMDType x1( x.load(j) );
4570 SIMDType xmm1( A.load(i ,j) * x1 );
4571 SIMDType xmm2( A.load(i+1UL,j) * x1 );
4575 xmm1 += A.load(i ,j) * x1;
4576 xmm2 += A.load(i+1UL,j) * x1;
4579 y[i ] +=
sum( xmm1 ) * scalar;
4580 y[i+1UL] +=
sum( xmm2 ) * scalar;
4582 for( ; remainder && j<jend; ++j ) {
4583 y[i ] += A(i ,j) * x[j] * scalar;
4584 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4592 for( ++j; j<jend; ++j ) {
4593 value1 += A(i ,j) * x[j];
4594 value2 += A(i+1UL,j) * x[j];
4597 y[i ] += value1 * scalar;
4598 y[i+1UL] += value2 * scalar;
// --- single row ---
// NOTE(review): the statement guarding this section is not visible here;
// presumably it handles one leftover row - confirm.
4604 const size_t jbegin( ( IsUpper_v<MT1> )
4607 const size_t jend( ( IsLower_v<MT1> )
4608 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4619 SIMDType xmm1( A.load(i,j) * x.load(j) );
4622 xmm1 += A.load(i,j) * x.load(j);
4625 y[i] +=
sum( xmm1 ) * scalar;
4627 for( ; remainder && j<jend; ++j ) {
4628 y[i] += A(i,j) * x[j] * scalar;
4635 for( ++j; j<jend; ++j ) {
4636 value += A(i,j) * x[j];
4639 y[i] += value * scalar;
// Fallback "large" scaled addition assignment kernel, selected when
// UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false; forwards to the
// default addition assignment kernel.
//   y : target vector, A : matrix operand, x : vector operand, scalar : scaling factor.
4659 template<
typename VT1
4663 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4664 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
4666 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized overload of the "large" add-assign kernel, enabled (EnableIf_t)
// when UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true.
// Every visible update has the form  y[row] += sum( A.load(row,j..) * x.load(j..) ) * scalar,
// i.e. the scaled matrix-vector product is accumulated into y.
// Rows are handled in blocks of 8, then 4, then 2, then one at a time. Inside
// each row block the columns are consumed with four SIMD loads of x
// (j, j1, j2, j3), then two (j, j1), then one (j), followed by a scalar loop.
// NOTE(review): the loop headers over j, the definitions of i, j, j1, j2, j3,
// the second halves of the jbegin/jend expressions and all closing braces are
// missing from this listing.
4684 template<
typename VT1
4688 static inline auto selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4689 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar clean-up loop is only needed if the matrix or the vector is not padded.
4691 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
4693 const size_t M( A.rows() );
4694 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
4698 for( ; (i+8UL) <= M; i+=8UL )
4700 const size_t jbegin( ( IsUpper_v<MT1> )
// For lower matrices the column bound follows the last row of the block
// (one column fewer if the matrix is strictly lower).
4703 const size_t jend( ( IsLower_v<MT1> )
4704 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Four SIMD loads of x per step.
4717 const SIMDType x1( x.load(j ) );
4718 const SIMDType x2( x.load(j1) );
4719 const SIMDType x3( x.load(j2) );
4720 const SIMDType x4( x.load(j3) );
4721 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4722 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4723 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4724 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4725 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4726 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4727 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4728 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD loads of x per step.
4733 const SIMDType x1( x.load(j ) );
4734 const SIMDType x2( x.load(j1) );
4735 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4736 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4737 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4738 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4739 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4740 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4741 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4742 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD load of x per step.
4746 const SIMDType x1( x.load(j) );
4747 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4748 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4749 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4750 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4751 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4752 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4753 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4754 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar clean-up for the columns left over when the operands are not padded.
4757 for( ; remainder && j<jend; ++j ) {
4758 y[i ] += A(i ,j) * x[j] * scalar;
4759 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4760 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4761 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4762 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4763 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4764 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4765 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Blocks of 4 rows (same column scheme as above) ---
4769 for( ; (i+4UL) <= M; i+=4UL )
4771 const size_t jbegin( ( IsUpper_v<MT1> )
4774 const size_t jend( ( IsLower_v<MT1> )
4775 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
4788 const SIMDType x1( x.load(j ) );
4789 const SIMDType x2( x.load(j1) );
4790 const SIMDType x3( x.load(j2) );
4791 const SIMDType x4( x.load(j3) );
4792 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4793 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4794 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4795 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4800 const SIMDType x1( x.load(j ) );
4801 const SIMDType x2( x.load(j1) );
4802 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4803 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4804 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4805 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4809 const SIMDType x1( x.load(j) );
4810 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4811 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4812 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4813 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4816 for( ; remainder && j<jend; ++j ) {
4817 y[i ] += A(i ,j) * x[j] * scalar;
4818 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4819 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4820 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Blocks of 2 rows ---
4824 for( ; (i+2UL) <= M; i+=2UL )
4826 const size_t jbegin( ( IsUpper_v<MT1> )
4829 const size_t jend( ( IsLower_v<MT1> )
4830 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
4843 const SIMDType x1( x.load(j ) );
4844 const SIMDType x2( x.load(j1) );
4845 const SIMDType x3( x.load(j2) );
4846 const SIMDType x4( x.load(j3) );
4847 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4848 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4853 const SIMDType x1( x.load(j ) );
4854 const SIMDType x2( x.load(j1) );
4855 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4856 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4860 const SIMDType x1( x.load(j) );
4861 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4862 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4865 for( ; remainder && j<jend; ++j ) {
4866 y[i ] += A(i ,j) * x[j] * scalar;
4867 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single-row tail: only y[i] is updated (the enclosing loop/condition is not visible here) ---
4873 const size_t jbegin( ( IsUpper_v<MT1> )
4876 const size_t jend( ( IsLower_v<MT1> )
4877 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
4890 const SIMDType x1( x.load(j ) );
4891 const SIMDType x2( x.load(j1) );
4892 const SIMDType x3( x.load(j2) );
4893 const SIMDType x4( x.load(j3) );
4894 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4899 const SIMDType x1( x.load(j ) );
4900 const SIMDType x2( x.load(j1) );
4901 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4905 const SIMDType x1( x.load(j) );
4906 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4909 for( ; remainder && j<jend; ++j ) {
4910 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of the BLAS add-assign kernel. It is selected
// (DisableIf_t) when UseBlasKernel_v<VT1,MT1,VT2,ST2> is false and forwards
// all arguments to selectLargeAddAssignKernel().
// NOTE(review): the remaining template parameters and the braces are missing
// from this listing.
4930 template<
typename VT1
4934 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4935 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
4937 selectLargeAddAssignKernel( y, A, x, scalar );
4942#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-backed add-assign kernel, enabled (EnableIf_t) when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is true.
// NOTE(review): the remaining template parameters, the `else` separating the
// two paths and the braces are missing from this listing.
4956 template<
typename VT1
4960 static inline auto selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4961 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
// Element type of the target vector; used to convert the scaling factors below.
4963 using ET = ElementType_t<VT1>;
// Triangular matrix: build a temporary from (scalar * x), evaluated via
// serial(), pass it to trmv() together with A and the lower/upper flag, then
// add the temporary to y.
4965 if( IsTriangular_v<MT1> ) {
4966 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4967 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4968 addAssign( y, tmp );
// Non-triangular path: gemv with factors ET(scalar) and ET(1).
// Presumably these are the BLAS alpha/beta, i.e. y = scalar*A*x + 1*y —
// TODO confirm against the gemv wrapper's parameter order.
4971 gemv( y, A, x,
ET(scalar),
ET(1) );
// Subtraction assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The matrix and vector operands are fetched from the expression stored in
// rhs.vector_, and the work is dispatched to selectSubAssignKernel() together
// with rhs.scalar_.
// NOTE(review): the definitions of A and x, the body of the early-out branch
// and the braces are missing from this listing.
4993 template<
typename VT1 >
4994 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5000 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5001 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
// Special case: matrix with zero rows or zero columns, or a strictly triangular
// matrix with exactly one row (branch body not visible — presumably an early return).
5003 if( left.rows() == 0UL || left.columns() == 0UL ||
5004 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
5016 DVecScalarMultExpr::selectSubAssignKernel( *lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for the subtraction assignment.
// The "small" kernel is chosen when
//   - the matrix type is diagonal, or
//   - MT is a computation and evaluateMatrix is false, or
//   - rows * columns is below DMATDVECMULT_THRESHOLD.
// The second call goes to the BLAS selector.
// NOTE(review): the `else` between the two calls is missing from this listing.
5031 template<
typename VT1
5035 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5037 if( ( IsDiagonal_v<MT1> ) ||
5038 ( IsComputation_v<MT> && !evaluateMatrix ) ||
5039 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
5040 selectSmallSubAssignKernel( y, A, x, scalar );
5042 selectBlasSubAssignKernel( y, A, x, scalar );
// Default (non-specialized) sub-assign kernel: hands the expression
// A * x * scalar to the target vector's own subAssign() member.
// NOTE(review): the remaining template parameters and the braces are missing
// from this listing.
5060 template<
typename VT1
5064 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5066 y.subAssign( A * x * scalar );
// Fallback overload of the "small" sub-assign kernel. It is selected
// (DisableIf_t) when UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false
// and forwards all arguments to selectDefaultSubAssignKernel().
// NOTE(review): the remaining template parameters and the braces are missing
// from this listing.
5084 template<
typename VT1
5088 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5089 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
5091 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of the "small" sub-assign kernel, enabled (EnableIf_t)
// when UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true.
// Rows are handled in blocks of 8, 4, 3, 2 and finally one at a time. Each row
// block shows two code paths:
//   1. SIMD path: one accumulator per row (xmm1..xmmN) collects
//      A.load(row,j) * x.load(j); sum(xmmK) * scalar is subtracted from y[row],
//      and a scalar loop handles leftover columns when operands are not padded.
//   2. Scalar path: value1..valueN accumulate A(row,j) * x[j] and
//      valueK * scalar is subtracted from y[row].
// NOTE(review): the condition choosing between the two paths, the SIMD loop
// headers, the initialisation of value1..valueN (the loop starts at ++j, so
// they are presumably seeded from column j), the second halves of the
// jbegin/jend expressions and all closing braces are missing from this listing.
5109 template<
typename VT1
5113 static inline auto selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5114 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar clean-up loop is only needed if the matrix or the vector is not padded.
5116 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
5118 const size_t M( A.rows() );
5119 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
5123 for( ; (i+8UL) <= M; i+=8UL )
5125 const size_t jbegin( ( IsUpper_v<MT1> )
// For lower matrices the column bound follows the last row of the block
// (one column fewer if the matrix is strictly lower).
5128 const size_t jend( ( IsLower_v<MT1> )
5129 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// SIMD path: initialise one accumulator per row from the first SIMD column chunk.
5140 SIMDType x1( x.load(j) );
5141 SIMDType xmm1( A.load(i ,j) * x1 );
5142 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5143 SIMDType xmm3( A.load(i+2UL,j) * x1 );
5144 SIMDType xmm4( A.load(i+3UL,j) * x1 );
5145 SIMDType xmm5( A.load(i+4UL,j) * x1 );
5146 SIMDType xmm6( A.load(i+5UL,j) * x1 );
5147 SIMDType xmm7( A.load(i+6UL,j) * x1 );
5148 SIMDType xmm8( A.load(i+7UL,j) * x1 );
// Accumulate further chunks (loop header and reload of x1 not visible).
5152 xmm1 += A.load(i ,j) * x1;
5153 xmm2 += A.load(i+1UL,j) * x1;
5154 xmm3 += A.load(i+2UL,j) * x1;
5155 xmm4 += A.load(i+3UL,j) * x1;
5156 xmm5 += A.load(i+4UL,j) * x1;
5157 xmm6 += A.load(i+5UL,j) * x1;
5158 xmm7 += A.load(i+6UL,j) * x1;
5159 xmm8 += A.load(i+7UL,j) * x1;
// Horizontal reduction, scaling and subtraction from the target.
5162 y[i ] -=
sum( xmm1 ) * scalar;
5163 y[i+1UL] -=
sum( xmm2 ) * scalar;
5164 y[i+2UL] -=
sum( xmm3 ) * scalar;
5165 y[i+3UL] -=
sum( xmm4 ) * scalar;
5166 y[i+4UL] -=
sum( xmm5 ) * scalar;
5167 y[i+5UL] -=
sum( xmm6 ) * scalar;
5168 y[i+6UL] -=
sum( xmm7 ) * scalar;
5169 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Scalar clean-up for the columns left over when the operands are not padded.
5171 for( ; remainder && j<jend; ++j ) {
5172 y[i ] -= A(i ,j) * x[j] * scalar;
5173 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5174 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5175 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5176 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
5177 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
5178 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
5179 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// Scalar path: accumulate per-row dot products, then scale and subtract.
5193 for( ++j; j<jend; ++j ) {
5194 value1 += A(i ,j) * x[j];
5195 value2 += A(i+1UL,j) * x[j];
5196 value3 += A(i+2UL,j) * x[j];
5197 value4 += A(i+3UL,j) * x[j];
5198 value5 += A(i+4UL,j) * x[j];
5199 value6 += A(i+5UL,j) * x[j];
5200 value7 += A(i+6UL,j) * x[j];
5201 value8 += A(i+7UL,j) * x[j];
5204 y[i ] -= value1 * scalar;
5205 y[i+1UL] -= value2 * scalar;
5206 y[i+2UL] -= value3 * scalar;
5207 y[i+3UL] -= value4 * scalar;
5208 y[i+4UL] -= value5 * scalar;
5209 y[i+5UL] -= value6 * scalar;
5210 y[i+6UL] -= value7 * scalar;
5211 y[i+7UL] -= value8 * scalar;
// --- Blocks of 4 rows (same two paths as above) ---
5215 for( ; (i+4UL) <= M; i+=4UL )
5217 const size_t jbegin( ( IsUpper_v<MT1> )
5220 const size_t jend( ( IsLower_v<MT1> )
5221 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
5232 SIMDType x1( x.load(j) );
5233 SIMDType xmm1( A.load(i ,j) * x1 );
5234 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5235 SIMDType xmm3( A.load(i+2UL,j) * x1 );
5236 SIMDType xmm4( A.load(i+3UL,j) * x1 );
5240 xmm1 += A.load(i ,j) * x1;
5241 xmm2 += A.load(i+1UL,j) * x1;
5242 xmm3 += A.load(i+2UL,j) * x1;
5243 xmm4 += A.load(i+3UL,j) * x1;
5246 y[i ] -=
sum( xmm1 ) * scalar;
5247 y[i+1UL] -=
sum( xmm2 ) * scalar;
5248 y[i+2UL] -=
sum( xmm3 ) * scalar;
5249 y[i+3UL] -=
sum( xmm4 ) * scalar;
5251 for( ; remainder && j<jend; ++j ) {
5252 y[i ] -= A(i ,j) * x[j] * scalar;
5253 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5254 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5255 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5265 for( ++j; j<jend; ++j ) {
5266 value1 += A(i ,j) * x[j];
5267 value2 += A(i+1UL,j) * x[j];
5268 value3 += A(i+2UL,j) * x[j];
5269 value4 += A(i+3UL,j) * x[j];
5272 y[i ] -= value1 * scalar;
5273 y[i+1UL] -= value2 * scalar;
5274 y[i+2UL] -= value3 * scalar;
5275 y[i+3UL] -= value4 * scalar;
// --- Blocks of 3 rows ---
5279 for( ; (i+3UL) <= M; i+=3UL )
5281 const size_t jbegin( ( IsUpper_v<MT1> )
5284 const size_t jend( ( IsLower_v<MT1> )
5285 ?( IsStrictlyLower_v<MT1> ? i+2UL : i+3UL )
5296 SIMDType x1( x.load(j) );
5297 SIMDType xmm1( A.load(i ,j) * x1 );
5298 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5299 SIMDType xmm3( A.load(i+2UL,j) * x1 );
5303 xmm1 += A.load(i ,j) * x1;
5304 xmm2 += A.load(i+1UL,j) * x1;
5305 xmm3 += A.load(i+2UL,j) * x1;
5308 y[i ] -=
sum( xmm1 ) * scalar;
5309 y[i+1UL] -=
sum( xmm2 ) * scalar;
5310 y[i+2UL] -=
sum( xmm3 ) * scalar;
5312 for( ; remainder && j<jend; ++j ) {
5313 y[i ] -= A(i ,j) * x[j] * scalar;
5314 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5315 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5324 for( ++j; j<jend; ++j ) {
5325 value1 += A(i ,j) * x[j];
5326 value2 += A(i+1UL,j) * x[j];
5327 value3 += A(i+2UL,j) * x[j];
5330 y[i ] -= value1 * scalar;
5331 y[i+1UL] -= value2 * scalar;
5332 y[i+2UL] -= value3 * scalar;
// --- Blocks of 2 rows ---
5336 for( ; (i+2UL) <= M; i+=2UL )
5338 const size_t jbegin( ( IsUpper_v<MT1> )
5341 const size_t jend( ( IsLower_v<MT1> )
5342 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
5353 SIMDType x1( x.load(j) );
5354 SIMDType xmm1( A.load(i ,j) * x1 );
5355 SIMDType xmm2( A.load(i+1UL,j) * x1 );
5359 xmm1 += A.load(i ,j) * x1;
5360 xmm2 += A.load(i+1UL,j) * x1;
5363 y[i ] -=
sum( xmm1 ) * scalar;
5364 y[i+1UL] -=
sum( xmm2 ) * scalar;
5366 for( ; remainder && j<jend; ++j ) {
5367 y[i ] -= A(i ,j) * x[j] * scalar;
5368 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5376 for( ++j; j<jend; ++j ) {
5377 value1 += A(i ,j) * x[j];
5378 value2 += A(i+1UL,j) * x[j];
5381 y[i ] -= value1 * scalar;
5382 y[i+1UL] -= value2 * scalar;
// --- Single-row tail: only y[i] is updated (the enclosing loop/condition is not visible here) ---
5388 const size_t jbegin( ( IsUpper_v<MT1> )
5391 const size_t jend( ( IsLower_v<MT1> )
5392 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
5403 SIMDType xmm1( A.load(i,j) * x.load(j) );
5406 xmm1 += A.load(i,j) * x.load(j);
5409 y[i] -=
sum( xmm1 ) * scalar;
5411 for( ; remainder && j<jend; ++j ) {
5412 y[i] -= A(i,j) * x[j] * scalar;
5419 for( ++j; j<jend; ++j ) {
5420 value += A(i,j) * x[j];
5423 y[i] -= value * scalar;
// Fallback overload of the "large" sub-assign kernel. It is selected
// (DisableIf_t) when UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is false
// and forwards all arguments to selectDefaultSubAssignKernel().
// NOTE(review): the remaining template parameters and the braces are missing
// from this listing.
5443 template<
typename VT1
5447 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5448 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
5450 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of the "large" sub-assign kernel, enabled (EnableIf_t)
// when UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> is true.
// Every visible update has the form  y[row] -= sum( A.load(row,j..) * x.load(j..) ) * scalar,
// i.e. the scaled matrix-vector product is subtracted from y.
// Rows are handled in blocks of 8, then 4, then 2, then one at a time. Inside
// each row block the columns are consumed with four SIMD loads of x
// (j, j1, j2, j3), then two (j, j1), then one (j), followed by a scalar loop.
// NOTE(review): the loop headers over j, the definitions of i, j, j1, j2, j3,
// the second halves of the jbegin/jend expressions and all closing braces are
// missing from this listing.
5468 template<
typename VT1
5472 static inline auto selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5473 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,MT1,VT2,ST2> >
// A scalar clean-up loop is only needed if the matrix or the vector is not padded.
5475 constexpr bool remainder( !IsPadded_v<MT1> || !IsPadded_v<VT2> );
5477 const size_t M( A.rows() );
5478 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
5482 for( ; (i+8UL) <= M; i+=8UL )
5484 const size_t jbegin( ( IsUpper_v<MT1> )
// For lower matrices the column bound follows the last row of the block
// (one column fewer if the matrix is strictly lower).
5487 const size_t jend( ( IsLower_v<MT1> )
5488 ?( IsStrictlyLower_v<MT1> ? i+7UL : i+8UL )
// Four SIMD loads of x per step.
5501 const SIMDType x1( x.load(j ) );
5502 const SIMDType x2( x.load(j1) );
5503 const SIMDType x3( x.load(j2) );
5504 const SIMDType x4( x.load(j3) );
5505 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
5506 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
5507 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
5508 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
5509 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
5510 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
5511 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
5512 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD loads of x per step.
5517 const SIMDType x1( x.load(j ) );
5518 const SIMDType x2( x.load(j1) );
5519 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
5520 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
5521 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
5522 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
5523 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
5524 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
5525 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
5526 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD load of x per step.
5530 const SIMDType x1( x.load(j) );
5531 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
5532 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
5533 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
5534 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
5535 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
5536 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
5537 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
5538 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar clean-up for the columns left over when the operands are not padded.
5541 for( ; remainder && j<jend; ++j ) {
5542 y[i ] -= A(i ,j) * x[j] * scalar;
5543 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5544 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5545 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
5546 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
5547 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
5548 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
5549 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// --- Blocks of 4 rows (same column scheme as above) ---
5553 for( ; (i+4UL) <= M; i+=4UL )
5555 const size_t jbegin( ( IsUpper_v<MT1> )
5558 const size_t jend( ( IsLower_v<MT1> )
5559 ?( IsStrictlyLower_v<MT1> ? i+3UL : i+4UL )
5572 const SIMDType x1( x.load(j ) );
5573 const SIMDType x2( x.load(j1) );
5574 const SIMDType x3( x.load(j2) );
5575 const SIMDType x4( x.load(j3) );
5576 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
5577 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
5578 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
5579 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
5584 const SIMDType x1( x.load(j ) );
5585 const SIMDType x2( x.load(j1) );
5586 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
5587 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
5588 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
5589 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
5593 const SIMDType x1( x.load(j) );
5594 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
5595 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
5596 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
5597 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
5600 for( ; remainder && j<jend; ++j ) {
5601 y[i ] -= A(i ,j) * x[j] * scalar;
5602 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
5603 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
5604 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// --- Blocks of 2 rows ---
5608 for( ; (i+2UL) <= M; i+=2UL )
5610 const size_t jbegin( ( IsUpper_v<MT1> )
5613 const size_t jend( ( IsLower_v<MT1> )
5614 ?( IsStrictlyLower_v<MT1> ? i+1UL : i+2UL )
5627 const SIMDType x1( x.load(j ) );
5628 const SIMDType x2( x.load(j1) );
5629 const SIMDType x3( x.load(j2) );
5630 const SIMDType x4( x.load(j3) );
5631 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
5632 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
5637 const SIMDType x1( x.load(j ) );
5638 const SIMDType x2( x.load(j1) );
5639 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
5640 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
5644 const SIMDType x1( x.load(j) );
5645 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
5646 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
5649 for( ; remainder && j<jend; ++j ) {
5650 y[i ] -= A(i ,j) * x[j] * scalar;
5651 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// --- Single-row tail: only y[i] is updated (the enclosing loop/condition is not visible here) ---
5657 const size_t jbegin( ( IsUpper_v<MT1> )
5660 const size_t jend( ( IsLower_v<MT1> )
5661 ?( IsStrictlyLower_v<MT1> ? i : i+1UL )
5674 const SIMDType x1( x.load(j ) );
5675 const SIMDType x2( x.load(j1) );
5676 const SIMDType x3( x.load(j2) );
5677 const SIMDType x4( x.load(j3) );
5678 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
5683 const SIMDType x1( x.load(j ) );
5684 const SIMDType x2( x.load(j1) );
5685 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
5689 const SIMDType x1( x.load(j) );
5690 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
5693 for( ; remainder && j<jend; ++j ) {
5694 y[i] -= A(i,j) * x[j] * scalar;
// Fallback overload of the BLAS sub-assign kernel. It is selected
// (DisableIf_t) when UseBlasKernel_v<VT1,MT1,VT2,ST2> is false and forwards
// all arguments to selectLargeSubAssignKernel().
// NOTE(review): the remaining template parameters and the braces are missing
// from this listing.
5714 template<
typename VT1
5718 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5719 -> DisableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
5721 selectLargeSubAssignKernel( y, A, x, scalar );
5726#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-backed sub-assign kernel, enabled (EnableIf_t) when
// UseBlasKernel_v<VT1,MT1,VT2,ST2> is true.
// NOTE(review): the remaining template parameters, the `else` separating the
// two paths and the braces are missing from this listing.
5740 template<
typename VT1
5744 static inline auto selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5745 -> EnableIf_t< UseBlasKernel_v<VT1,MT1,VT2,ST2> >
// Element type of the target vector; used to convert the scaling factors below.
5747 using ET = ElementType_t<VT1>;
// Triangular matrix: build a temporary from (scalar * x), evaluated via
// serial(), pass it to trmv() together with A and the lower/upper flag, then
// subtract the temporary from y.
5749 if( IsTriangular_v<MT1> ) {
5750 ResultType_t<VT1> tmp(
serial( scalar * x ) );
5751 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
5752 subAssign( y, tmp );
// Non-triangular path: gemv with the negated scalar and ET(1).
// Presumably these are the BLAS alpha/beta, i.e. y = -scalar*A*x + 1*y —
// TODO confirm against the gemv wrapper's parameter order.
5755 gemv( y, A, x,
ET(-scalar),
ET(1) );
// Multiplication assignment of a DVecScalarMultExpr (rhs) to a dense vector
// (lhs). The visible code forwards to multAssign() on the dereferenced target
// with a local `tmp`.
// NOTE(review): the definition of `tmp` is missing from this listing —
// presumably the evaluated right-hand side; confirm in the full source.
5777 template<
typename VT1 >
5778 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5789 multAssign( *lhs, tmp );
// Division assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The visible code forwards to divAssign() on the dereferenced target with a
// local `tmp`.
// NOTE(review): the definition of `tmp` is missing from this listing —
// presumably the evaluated right-hand side; confirm in the full source.
5809 template<
typename VT1 >
5810 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5821 divAssign( *lhs, tmp );
// Six function templates that are all gated by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the lines carrying the function names and parameter lists are
// missing from this listing; presumably these are the SMP counterparts of the
// assignment functions of this class — confirm in the full source.

// First SMP-gated overload. Fetches the matrix/vector operands of the nested
// product. A matrix with zero rows is handled first; zero columns, or a
// strictly triangular matrix with a single column, is handled by a second
// branch (branch bodies not visible).
5843 template<
typename VT1 >
5845 -> EnableIf_t< UseSMPAssign_v<VT1> >
5851 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5852 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5854 if( left.rows() == 0UL ) {
5857 else if( left.columns() == 0UL ||
5858 ( IsStrictlyTriangular_v<MT> && left.columns() == 1UL ) ) {
// Second SMP-gated overload (body not visible).
5889 template<
typename VT1 >
5891 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Third SMP-gated overload. Single special-case test: zero rows, zero columns,
// or a strictly triangular matrix with exactly one row (branch body not visible).
5920 template<
typename VT1 >
5922 -> EnableIf_t< UseSMPAssign_v<VT1> >
5928 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5929 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5931 if( left.rows() == 0UL || left.columns() == 0UL ||
5932 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
// Fourth SMP-gated overload; same special-case test as the third.
5966 template<
typename VT1 >
5968 -> EnableIf_t< UseSMPAssign_v<VT1> >
5974 LeftOperand_t<MVM> left ( rhs.vector_.leftOperand() );
5975 RightOperand_t<MVM> right( rhs.vector_.rightOperand() );
5977 if( left.rows() == 0UL || left.columns() == 0UL ||
5978 ( IsStrictlyTriangular_v<MT> && left.rows() == 1UL ) ) {
// Fifth SMP-gated overload (body not visible).
6012 template<
typename VT1 >
6014 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Sixth SMP-gated overload (body not visible).
6047 template<
typename VT1 >
6049 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Free function template returning decltype(auto) that builds the
// matrix/vector product expression.
// It compares the matrix's column count with the vector's size and finally
// returns ReturnType( *mat, *vec ).
// NOTE(review): the function name/parameter line and the body of the mismatch
// branch are missing from this listing — presumably this is the multiplication
// operator and the branch reports a size mismatch; confirm in the full source.
6121template<
typename MT
6123inline decltype(
auto)
6130 if( (*mat).columns() != (*vec).size() ) {
6135 return ReturnType( *mat, *vec );
// IsAligned specialization for DMatDVecMultExpr<MT,VT>: derives from
// BoolConstant and is true exactly when both the matrix type MT and the
// vector type VT are aligned.
6150template<
typename MT,
typename VT >
6151struct IsAligned< DMatDVecMultExpr<MT,VT> >
6152 :
public BoolConstant< IsAligned_v<MT> && IsAligned_v<VT> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Constraint on the transpose flag of vector types.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Constraints on the storage order of matrix types.
Header file for all SIMD functionality.
Constraint on the data type.
Expression object for dense matrix-dense vector multiplications.
Definition: DMatDVecMultExpr.h:127
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DMatDVecMultExpr.h:231
If_t< IsExpression_v< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:217
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DMatDVecMultExpr.h:244
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:267
If_t< IsExpression_v< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:220
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:391
CompositeType_t< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:135
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:140
DMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:253
If_t< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:223
CompositeType_t< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:134
ElementType_t< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:133
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:378
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:390
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:334
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:358
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:214
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:210
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:211
static constexpr bool evaluateVector
Compilation switch for the composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:147
ElementType_t< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:132
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:301
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DMatDVecMultExpr.h:238
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:213
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:346
ResultType_t< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:130
ResultType_t< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:131
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:368
If_t< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:226
MultTrait_t< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:209
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: DMatDVecMultExpr.h:212
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:324
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:314
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the MatMatMultExpr base class.
Header file for the MatVecMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv).
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2156
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: RowMajorMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.
Definition: MatVecMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.
Definition: ColumnVector.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
decltype(auto) row(Matrix< MT, SO > &, RRAs...)
Creating a view on a specific row of the given matrix.
Definition: Row.h:137
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all matrix/vector multiplication expression templates.
Definition: MatVecMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.