#ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
#define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_

template< typename VT, typename MT >
class TDVecTDMatMultExpr
   : public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
   , private Computation
{
   static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );

   static constexpr bool evaluateMatrix =
      ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
          IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
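   // The two switches above decide whether the vector and matrix operands are materialized into
   // temporaries before the kernels run: operands that are themselves computations or that
   // require evaluation are evaluated up front, everything else is passed through by reference.
   //
   // TDVecTDMatMultExpr itself represents the multiplication of a transpose (row) dense vector
   // with a column-major dense matrix and is evaluated lazily on assignment. A minimal usage
   // sketch (illustrative only; it assumes the usual Blaze dynamic vector/matrix types):
   //
   //    blaze::DynamicVector<double,blaze::rowVector>   x( 100UL );
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 100UL, 50UL );
   //    blaze::DynamicVector<double,blaze::rowVector>   y;
   //    // ... initialize x and A ...
   //    y = x * A;   // builds a TDVecTDMatMultExpr; the kernels below run on assignment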
   template< typename T1 >
   static constexpr bool UseSMPAssign_v =
      ( T1::smpAssignable && ( evaluateVector || evaluateMatrix ) );

   template< typename T1, typename T2, typename T3 >
   static constexpr bool UseBlasKernel_v =
      ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
        IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
        IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
        IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
        !IsDiagonal_v<T3> &&
        T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
        IsBLASCompatible_v< ElementType_t<T1> > &&
        IsBLASCompatible_v< ElementType_t<T2> > &&
        IsBLASCompatible_v< ElementType_t<T3> > &&
        IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
        IsSame_v< ElementType_t<T1>, ElementType_t<T3> > );
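   // UseBlasKernel_v gates the BLAS code path at compile time: all three operand types have to
   // provide contiguous storage with direct data access, be SIMD-enabled, and share a
   // BLAS-compatible element type; diagonal matrices are excluded since they are cheaper to
   // handle with the specialized default kernels.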
   template< typename T1, typename T2, typename T3 >
   static constexpr bool UseVectorizedDefaultKernel_v =
      ( useOptimizedKernels &&
        !IsDiagonal_v<T3> &&
        T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
        IsSIMDCombinable_v< ElementType_t<T1>
                          , ElementType_t<T2>
                          , ElementType_t<T3> > &&
        HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
        HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );

   static constexpr bool simdEnabled =
      ( !IsDiagonal_v<MT> &&
        VT::simdEnabled && MT::simdEnabled &&
        HasSIMDAdd_v<VET,MET> &&
        HasSIMDMult_v<VET,MET> );
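   // UseVectorizedDefaultKernel_v selects the hand-vectorized fallback kernels whenever the
   // optimized kernels are enabled and the element types can be combined into a common SIMD
   // type; simdEnabled reports the same capability for the expression as a whole.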
   inline ReturnType operator[]( size_t index ) const {
      if( IsDiagonal_v<MT> ) {
         return vec_[index] * mat_(index,index);
      }
      else if( IsLower_v<MT> && ( index > 8UL ) ) {
         const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
         // ...
      }
      else if( IsUpper_v<MT> && ( index + 8UL < mat_.rows() ) ) {
         const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
         // ...
      }
      // ...
   }

   inline ReturnType at( size_t index ) const {
      if( index >= mat_.columns() ) {
         // ...
      }
      return (*this)[index];
   }

   inline size_t size() const noexcept {
      return mat_.columns();
   }
   template< typename T >
   inline bool canAlias( const T* alias ) const noexcept {
      return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
   }

   template< typename T >
   inline bool isAliased( const T* alias ) const noexcept {
      return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
   }

   inline bool isAligned() const noexcept {
      return vec_.isAligned() && mat_.isAligned();
   }

   inline bool canSMPAssign() const noexcept {
      return ( /* ... */
               ( mat_.rows() * mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
             ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
   }
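   // canSMPAssign() is the heuristic for the parallel (SMP) backends: parallelization only pays
   // off for result vectors above SMP_TDVECTDMATMULT_THRESHOLD, and it is suppressed whenever
   // the work on the matrix side is small or expected to be handed to a BLAS kernel anyway.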
   template< typename VT1 >
   friend inline void assign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
   {
      if( rhs.mat_.rows() == 0UL ) {
         // ...
      }
      else if( rhs.mat_.columns() == 0UL ) {
         // ...
      }
      // ...
      TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
   }

   template< typename VT1, typename VT2, typename MT1 >
   static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      if( ( IsDiagonal_v<MT1> ) ||
          ( IsComputation_v<MT> && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
         selectSmallAssignKernel( y, x, A );
      else
         selectBlasAssignKernel( y, x, A );
   }
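   // Kernel dispatch: diagonal matrices, unevaluated computation operands, and problems below
   // TDVECTDMATMULT_THRESHOLD go to the "small" kernels; everything else is forwarded to the
   // BLAS kernel selector, which in turn falls back to the "large" default kernel when no
   // suitable BLAS routine is available.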
   template< typename VT1, typename VT2, typename MT1 >
   static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      y.assign( x * A );
   }

   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         xmm5 += x1 * A.load(i,j+4UL);
         xmm6 += x1 * A.load(i,j+5UL);
         xmm7 += x1 * A.load(i,j+6UL);
         xmm8 += x1 * A.load(i,j+7UL);
         // ...

         y[j    ] = sum( xmm1 );
         y[j+1UL] = sum( xmm2 );
         y[j+2UL] = sum( xmm3 );
         y[j+3UL] = sum( xmm4 );
         y[j+4UL] = sum( xmm5 );
         y[j+5UL] = sum( xmm6 );
         y[j+6UL] = sum( xmm7 );
         y[j+7UL] = sum( xmm8 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
            y[j+4UL] += x[i] * A(i,j+4UL);
            y[j+5UL] += x[i] * A(i,j+5UL);
            y[j+6UL] += x[i] * A(i,j+6UL);
            y[j+7UL] += x[i] * A(i,j+7UL);
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         // ...

         y[j    ] = sum( xmm1 );
         y[j+1UL] = sum( xmm2 );
         y[j+2UL] = sum( xmm3 );
         y[j+3UL] = sum( xmm4 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
         }
      }

      for( ; (j+3UL) <= N; j+=3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         // ...

         y[j    ] = sum( xmm1 );
         y[j+1UL] = sum( xmm2 );
         y[j+2UL] = sum( xmm3 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         // ...

         y[j    ] = sum( xmm1 );
         y[j+1UL] = sum( xmm2 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x.load(i) * A.load(i,j);
         // ...

         y[j] = sum( xmm1 );

         for( ; remainder && i<iend; ++i ) {
            y[j] += x[i] * A(i,j);
         }
      }
   }
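   // The small kernel reduces along the rows of A: it processes 8, 4, 3, 2 and finally 1 column
   // per pass, keeps one SIMD accumulator per column, collapses each accumulator with sum() and
   // handles the unpadded tail with a scalar remainder loop. For lower/upper triangular matrices
   // ibegin/iend restrict the reduction to the structurally non-zero part of each column.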
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
            y[j+4UL] += x[i] * A(i,j+4UL);
            y[j+5UL] += x[i] * A(i,j+5UL);
            y[j+6UL] += x[i] * A(i,j+6UL);
            y[j+7UL] += x[i] * A(i,j+7UL);
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
         // ...
         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
         // ...
         y[j] += sum( x1 * A.load(i,j) );

         for( ; remainder && i<iend; ++i ) {
            y[j] += x[i] * A(i,j);
         }
      }
   }
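   // The large kernel targets long columns: instead of keeping one accumulator per column it
   // loads 4, 2 or finally 1 SIMD chunks of x at a time and adds the horizontal sum of the
   // partial products directly to y[j], trading the per-column accumulators of the small kernel
   // for fewer, shorter-lived registers on long reductions.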
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      selectLargeAssignKernel( y, x, A );
   }

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         assign( y, x );
         trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      }
      else {
         gemv( y, x, A, ET(1), ET(0) );
      }
   }
#endif
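   // BLAS path for plain assignment: triangular matrices are multiplied in place via trmv(),
   // while the general case maps to a single gemv() call with alpha = 1 and beta = 0.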
   template< typename VT1 >
   friend inline void assign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
   {
      // ...
      assign( ~lhs, tmp );
   }
   template< typename VT1 >
   friend inline void addAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
   {
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
         // ...
      }
      // ...
      TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
   }

   template< typename VT1, typename VT2, typename MT1 >
   static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      if( ( IsDiagonal_v<MT1> ) ||
          ( IsComputation_v<MT> && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
         selectSmallAddAssignKernel( y, x, A );
      else
         selectBlasAddAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      y.addAssign( x * A );
   }

   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAddAssignKernel( y, x, A );
   }
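   // The addAssign kernels below mirror the assign kernels one-to-one; the only difference is
   // that the reduced column sums are added to y (and gemv() is called with beta = 1) instead of
   // overwriting it. The subAssign kernels further down subtract the sums accordingly.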
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         xmm5 += x1 * A.load(i,j+4UL);
         xmm6 += x1 * A.load(i,j+5UL);
         xmm7 += x1 * A.load(i,j+6UL);
         xmm8 += x1 * A.load(i,j+7UL);
         // ...

         y[j    ] += sum( xmm1 );
         y[j+1UL] += sum( xmm2 );
         y[j+2UL] += sum( xmm3 );
         y[j+3UL] += sum( xmm4 );
         y[j+4UL] += sum( xmm5 );
         y[j+5UL] += sum( xmm6 );
         y[j+6UL] += sum( xmm7 );
         y[j+7UL] += sum( xmm8 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
            y[j+4UL] += x[i] * A(i,j+4UL);
            y[j+5UL] += x[i] * A(i,j+5UL);
            y[j+6UL] += x[i] * A(i,j+6UL);
            y[j+7UL] += x[i] * A(i,j+7UL);
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         // ...

         y[j    ] += sum( xmm1 );
         y[j+1UL] += sum( xmm2 );
         y[j+2UL] += sum( xmm3 );
         y[j+3UL] += sum( xmm4 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
         }
      }

      for( ; (j+3UL) <= N; j+=3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         // ...

         y[j    ] += sum( xmm1 );
         y[j+1UL] += sum( xmm2 );
         y[j+2UL] += sum( xmm3 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         // ...

         y[j    ] += sum( xmm1 );
         y[j+1UL] += sum( xmm2 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += A.load(i,j) * x.load(i);
         // ...

         y[j] += sum( xmm1 );

         for( ; remainder && i<iend; ++i ) {
            y[j] += x[i] * A(i,j);
         }
      }
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultAddAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
            y[j+4UL] += x[i] * A(i,j+4UL);
            y[j+5UL] += x[i] * A(i,j+5UL);
            y[j+6UL] += x[i] * A(i,j+6UL);
            y[j+7UL] += x[i] * A(i,j+7UL);
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         // ...
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
         // ...
         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
         // ...
         y[j] += sum( x1 * A.load(i,j) );

         for( ; remainder && i<iend; ++i ) {
            y[j] += x[i] * A(i,j);
         }
      }
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      selectLargeAddAssignKernel( y, x, A );
   }

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         ResultType_t<VT1> tmp( serial( x ) );
         trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
         addAssign( y, tmp );
      }
      else {
         gemv( y, x, A, ET(1), ET(1) );
      }
   }
#endif
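   // BLAS path for addition assignment: since trmv() works in place, the triangular case first
   // computes the product into a temporary and then adds it to y; the general case is a single
   // gemv() with alpha = 1, beta = 1.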
   template< typename VT1 >
   friend inline void subAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
   {
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
         // ...
      }
      // ...
      TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
   }

   template< typename VT1, typename VT2, typename MT1 >
   static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      if( ( IsDiagonal_v<MT1> ) ||
          ( IsComputation_v<MT> && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
         selectSmallSubAssignKernel( y, x, A );
      else
         selectBlasSubAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
   {
      y.subAssign( x * A );
   }

   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultSubAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         xmm5 += x1 * A.load(i,j+4UL);
         xmm6 += x1 * A.load(i,j+5UL);
         xmm7 += x1 * A.load(i,j+6UL);
         xmm8 += x1 * A.load(i,j+7UL);
         // ...

         y[j    ] -= sum( xmm1 );
         y[j+1UL] -= sum( xmm2 );
         y[j+2UL] -= sum( xmm3 );
         y[j+3UL] -= sum( xmm4 );
         y[j+4UL] -= sum( xmm5 );
         y[j+5UL] -= sum( xmm6 );
         y[j+6UL] -= sum( xmm7 );
         y[j+7UL] -= sum( xmm8 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
            y[j+2UL] -= x[i] * A(i,j+2UL);
            y[j+3UL] -= x[i] * A(i,j+3UL);
            y[j+4UL] -= x[i] * A(i,j+4UL);
            y[j+5UL] -= x[i] * A(i,j+5UL);
            y[j+6UL] -= x[i] * A(i,j+6UL);
            y[j+7UL] -= x[i] * A(i,j+7UL);
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         // ...

         y[j    ] -= sum( xmm1 );
         y[j+1UL] -= sum( xmm2 );
         y[j+2UL] -= sum( xmm3 );
         y[j+3UL] -= sum( xmm4 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
            y[j+2UL] -= x[i] * A(i,j+2UL);
            y[j+3UL] -= x[i] * A(i,j+3UL);
         }
      }

      for( ; (j+3UL) <= N; j+=3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         // ...

         y[j    ] -= sum( xmm1 );
         y[j+1UL] -= sum( xmm2 );
         y[j+2UL] -= sum( xmm3 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
            y[j+2UL] -= x[i] * A(i,j+2UL);
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         // ...

         y[j    ] -= sum( xmm1 );
         y[j+1UL] -= sum( xmm2 );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += A.load(i,j) * x.load(i);
         // ...

         y[j] -= sum( xmm1 );

         for( ; remainder && i<iend; ++i ) {
            y[j] -= x[i] * A(i,j);
         }
      }
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      selectDefaultSubAssignKernel( y, x, A );
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
         y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
         y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
         y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
         y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
         y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
         y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );
         y[j+4UL] -= sum( x1 * A.load(i,j+4UL) );
         y[j+5UL] -= sum( x1 * A.load(i,j+5UL) );
         y[j+6UL] -= sum( x1 * A.load(i,j+6UL) );
         y[j+7UL] -= sum( x1 * A.load(i,j+7UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
            y[j+2UL] -= x[i] * A(i,j+2UL);
            y[j+3UL] -= x[i] * A(i,j+3UL);
            y[j+4UL] -= x[i] * A(i,j+4UL);
            y[j+5UL] -= x[i] * A(i,j+5UL);
            y[j+6UL] -= x[i] * A(i,j+6UL);
            y[j+7UL] -= x[i] * A(i,j+7UL);
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
            y[j+2UL] -= x[i] * A(i,j+2UL);
            y[j+3UL] -= x[i] * A(i,j+3UL);
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         // ...
         y[j    ] -= sum( x1 * A.load(i,j    ) );
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] -= x[i] * A(i,j    );
            y[j+1UL] -= x[i] * A(i,j+1UL);
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
         // ...
         y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
         // ...
         y[j] -= sum( x1 * A.load(i,j) );

         for( ; remainder && i<iend; ++i ) {
            y[j] -= x[i] * A(i,j);
         }
      }
   }
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      selectLargeSubAssignKernel( y, x, A );
   }

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1, typename VT2, typename MT1 >
   static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         ResultType_t<VT1> tmp( serial( x ) );
         trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
         subAssign( y, tmp );
      }
      else {
         gemv( y, x, A, ET(-1), ET(1) );
      }
   }
#endif
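   // BLAS path for subtraction assignment: the triangular case again goes through a temporary,
   // while the general case uses gemv() with alpha = -1 and beta = 1 to subtract the product in
   // a single call.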
   template< typename VT1 >
   friend inline void multAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
   {
      // ...
      multAssign( ~lhs, tmp );
   }

   template< typename VT1 >
   friend inline void divAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
   {
      // ...
      divAssign( ~lhs, tmp );
   }
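   // Componentwise multiplication and division cannot be expressed as an update of the BLAS/SIMD
   // kernels above, so both operators first evaluate the product into a temporary and then
   // forward to multAssign()/divAssign() of the target vector.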
   template< typename VT1 >
   friend inline auto smpAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      if( rhs.mat_.rows() == 0UL ) {
         // ...
      }
      else if( rhs.mat_.columns() == 0UL ) {
         // ...
      }
      // ...
   }

   template< typename VT1 >
   friend inline auto smpAssign( SparseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   { /* ... */ }

   template< typename VT1 >
   friend inline auto smpAddAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
         // ...
      }
      // ...
   }

   template< typename VT1 >
   friend inline auto smpSubAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   {
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
         // ...
      }
      // ...
   }

   template< typename VT1 >
   friend inline auto smpMultAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   { /* ... */ }

   template< typename VT1 >
   friend inline auto smpDivAssign( DenseVector<VT1,true>& lhs, const TDVecTDMatMultExpr& rhs )
      -> EnableIf_t< UseSMPAssign_v<VT1> >
   { /* ... */ }

   // ...
};
template< typename VT, typename MT, typename ST >
class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
   : public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
   , private Computation
{
   using VMM = TDVecTDMatMultExpr<VT,MT>;
   using RES = ResultType_t<VMM>;
   using VRT = ResultType_t<VT>;
   using MRT = ResultType_t<MT>;
   using VET = ElementType_t<VRT>;
   using MET = ElementType_t<MRT>;
   using VCT = CompositeType_t<VT>;
   using MCT = CompositeType_t<MT>;

   static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );

   static constexpr bool evaluateMatrix =
      ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
          IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );

   template< typename T1 >
   static constexpr bool UseSMPAssign_v =
      ( T1::smpAssignable && ( evaluateVector || evaluateMatrix ) );
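   // This partial specialization of DVecScalarMultExpr restructures scaled products of the form
   // (x * A) * s so that the scalar is folded directly into the vector/matrix multiplication
   // kernels instead of triggering a second pass over the result.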
   template< typename T1, typename T2, typename T3, typename T4 >
   static constexpr bool UseBlasKernel_v =
      ( BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION &&
        IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
        IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
        IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
        !IsDiagonal_v<T3> &&
        T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
        IsBLASCompatible_v< ElementType_t<T1> > &&
        IsBLASCompatible_v< ElementType_t<T2> > &&
        IsBLASCompatible_v< ElementType_t<T3> > &&
        IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
        IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
        !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );

   template< typename T1, typename T2, typename T3, typename T4 >
   static constexpr bool UseVectorizedDefaultKernel_v =
      ( useOptimizedKernels &&
        !IsDiagonal_v<T3> &&
        T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
        IsSIMDCombinable_v< ElementType_t<T1>
                          , ElementType_t<T2>
                          , ElementType_t<T3>
                          , T4 > &&
        HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
        HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
   using This     = DVecScalarMultExpr<VMM,ST,true>;
   using BaseType = DenseVector<This,true>;
   using SIMDType = SIMDTrait_t<ElementType>;

   using LeftOperand = const TDVecTDMatMultExpr<VT,MT>;

   using LT = If_t< evaluateVector, const VRT, VCT >;
   using RT = If_t< evaluateMatrix, const MRT, MCT >;

   static constexpr bool simdEnabled =
      ( !IsDiagonal_v<MT> &&
        VT::simdEnabled && MT::simdEnabled &&
        IsSIMDCombinable_v<VET,MET,ST> &&
        HasSIMDAdd_v<VET,MET> &&
        HasSIMDMult_v<VET,MET> );

   static constexpr bool smpAssignable =
      ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
   inline ReturnType at( size_t index ) const {
      if( index >= vector_.size() ) {
         // ...
      }
      return (*this)[index];
   }

   inline size_t size() const {
      return vector_.size();
   }

   template< typename T >
   inline bool canAlias( const T* alias ) const {
      return vector_.canAlias( alias );
   }

   template< typename T >
   inline bool isAliased( const T* alias ) const {
      return vector_.isAliased( alias );
   }

   inline bool canSMPAssign() const noexcept {
      RightOperand_t<VMM> A( vector_.rightOperand() );
      return ( /* ... */
               ( IsComputation_v<MT> && !evaluateMatrix ) ||
               ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
             ( size() > SMP_TDVECTDMATMULT_THRESHOLD );
   }
   template< typename VT1, bool TF >
   friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
   {
      LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

      if( right.rows() == 0UL ) {
         // ...
      }
      else if( right.columns() == 0UL ) {
         // ...
      }
      // ...
      DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
   }

   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline void selectAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   {
      if( ( IsDiagonal_v<MT1> ) ||
          ( IsComputation_v<MT> && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
         selectSmallAssignKernel( y, x, A, scalar );
      else
         selectBlasAssignKernel( y, x, A, scalar );
   }
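   // The scalar-aware dispatch is identical to the one of TDVecTDMatMultExpr; the only addition
   // is that rhs.scalar_ is passed through to every kernel as an extra parameter.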
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline void selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   {
      y.assign( x * A * scalar );
   }

   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
   {
      selectDefaultAssignKernel( y, x, A, scalar );
   }
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectSmallAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;

         // ...
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         xmm5 += x1 * A.load(i,j+4UL);
         xmm6 += x1 * A.load(i,j+5UL);
         xmm7 += x1 * A.load(i,j+6UL);
         xmm8 += x1 * A.load(i,j+7UL);
         // ...

         y[j    ] = sum( xmm1 ) * scalar;
         y[j+1UL] = sum( xmm2 ) * scalar;
         y[j+2UL] = sum( xmm3 ) * scalar;
         y[j+3UL] = sum( xmm4 ) * scalar;
         y[j+4UL] = sum( xmm5 ) * scalar;
         y[j+5UL] = sum( xmm6 ) * scalar;
         y[j+6UL] = sum( xmm7 ) * scalar;
         y[j+7UL] = sum( xmm8 ) * scalar;

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    ) * scalar;
            y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
            y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
            y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
            y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
            y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
            y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
            y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
         }
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2, xmm3, xmm4;

         // ...
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         // ...

         y[j    ] = sum( xmm1 ) * scalar;
         y[j+1UL] = sum( xmm2 ) * scalar;
         y[j+2UL] = sum( xmm3 ) * scalar;
         y[j+3UL] = sum( xmm4 ) * scalar;

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    ) * scalar;
            y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
            y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
            y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
         }
      }

      for( ; (j+3UL) <= N; j+=3UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2, xmm3;

         // ...
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         // ...

         y[j    ] = sum( xmm1 ) * scalar;
         y[j+1UL] = sum( xmm2 ) * scalar;
         y[j+2UL] = sum( xmm3 ) * scalar;

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    ) * scalar;
            y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
            y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
         }
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         SIMDType xmm1, xmm2;

         // ...
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         // ...

         y[j    ] = sum( xmm1 ) * scalar;
         y[j+1UL] = sum( xmm2 ) * scalar;

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    ) * scalar;
            y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
         }
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         xmm1 += A.load(i,j) * x.load(i);
         // ...

         y[j] = sum( xmm1 ) * scalar;

         for( ; remainder && i<iend; ++i ) {
            y[j] += x[i] * A(i,j) * scalar;
         }
      }
   }
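   // In the vectorized small kernel the scalar is applied once per column, after the SIMD
   // reduction (sum( xmm ) * scalar), while the scalar remainder loop multiplies each product
   // individually; both orderings are equivalent up to floating-point rounding.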
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
   {
      selectDefaultAssignKernel( y, x, A, scalar );
   }
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectLargeAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
   {
      constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t j( 0UL );

      for( ; (j+8UL) <= N; j+=8UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
         // ...
         const SIMDType x1( x.load(i) );
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) );
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) );
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) );
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) );
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
            y[j+4UL] += x[i] * A(i,j+4UL);
            y[j+5UL] += x[i] * A(i,j+5UL);
            y[j+6UL] += x[i] * A(i,j+6UL);
            y[j+7UL] += x[i] * A(i,j+7UL);
         }
         // ...
      }

      for( ; (j+4UL) <= N; j+=4UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
         // ...
         const SIMDType x1( x.load(i) );
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) );
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
            y[j+2UL] += x[i] * A(i,j+2UL);
            y[j+3UL] += x[i] * A(i,j+3UL);
         }
         // ...
      }

      for( ; (j+2UL) <= N; j+=2UL )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
         // ...
         const SIMDType x1( x.load(i) );
         y[j    ] += sum( x1 * A.load(i,j    ) );
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) );

         for( ; remainder && i<iend; ++i ) {
            y[j    ] += x[i] * A(i,j    );
            y[j+1UL] += x[i] * A(i,j+1UL);
         }
         // ...
      }

      if( j < N )
      {
         const size_t ibegin( ( IsLower_v<MT1> ) ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) ) :( 0UL ) );
         const size_t iend( ( IsUpper_v<MT1> ) ?( IsStrictlyUpper_v<MT1> ? j : j+1UL ) :( M ) );

         const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );
         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
         // ...
         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
         // ...
         const SIMDType x1( x.load(i) );
         y[j] += sum( x1 * A.load(i,j) );

         for( ; remainder && i<iend; ++i ) {
            y[j] += x[i] * A(i,j);
         }
         // ...
      }
   }
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
   {
      selectLargeAssignKernel( y, x, A, scalar );
   }

#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
   {
      using ET = ElementType_t<VT1>;

      if( IsTriangular_v<MT1> ) {
         assign( y, scalar * x );
         trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      }
      else {
         gemv( y, x, A, ET(scalar), ET(0) );
      }
   }
#endif
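   // BLAS path with scalar: for triangular matrices the scaling is folded into the initial copy
   // (y = scalar * x) before the in-place trmv(); otherwise the scalar simply becomes the alpha
   // argument of gemv().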
   template< typename VT1, bool TF >
   friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
   {
      // ...
      assign( ~lhs, tmp );
   }

   template< typename VT1, bool TF >
   friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
   {
      LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
      RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

      if( right.rows() == 0UL || right.columns() == 0UL ) {
         // ...
      }
      // ...
      DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
   }
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline void selectAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   {
      if( ( IsDiagonal_v<MT1> ) ||
          ( IsComputation_v<MT> && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
         selectSmallAddAssignKernel( y, x, A, scalar );
      else
         selectBlasAddAssignKernel( y, x, A, scalar );
   }
   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline void selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   {
      y.addAssign( x * A * scalar );
   }

   template< typename VT1, typename VT2, typename MT1, typename ST2 >
   static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
      -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
   {
      selectDefaultAddAssignKernel( y, x, A, scalar );
   }
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectSmallAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         xmm5 += x1 * A.load(i,j+4UL);
         xmm6 += x1 * A.load(i,j+5UL);
         xmm7 += x1 * A.load(i,j+6UL);
         xmm8 += x1 * A.load(i,j+7UL);
      }

      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
      y[j+2UL] += sum( xmm3 ) * scalar;
      y[j+3UL] += sum( xmm4 ) * scalar;
      y[j+4UL] += sum( xmm5 ) * scalar;
      y[j+5UL] += sum( xmm6 ) * scalar;
      y[j+6UL] += sum( xmm7 ) * scalar;
      y[j+7UL] += sum( xmm8 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
         y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
         y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
         y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
         y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
      }
   }

   for( ; (j+4UL) <= N; j+=4UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2, xmm3, xmm4;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
      }

      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
      y[j+2UL] += sum( xmm3 ) * scalar;
      y[j+3UL] += sum( xmm4 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
      }
   }

   for( ; (j+3UL) <= N; j+=3UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2, xmm3;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
      }

      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
      y[j+2UL] += sum( xmm3 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
      }
   }

   for( ; (j+2UL) <= N; j+=2UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
      }

      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
      }
   }

   if( j < N )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         xmm1 += A.load(i,j) * x.load(i);
      }

      y[j] += sum( xmm1 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j] += x[i] * A(i,j) * scalar;
      }
   }
}
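// The vectorized small kernel above walks over blocks of eight (then four,
// three, two, one) matrix columns, keeps one SIMD accumulator per column
// while streaming once through the shared row range, reduces each
// accumulator with sum() and applies the scalar once, and lets a scalar
// remainder loop handle any unpadded tail. The following plain C++ sketch
// reproduces the same column-blocked accumulation without SIMD; the
// column-major indexing, the block width of four, and the function name are
// illustrative assumptions only.
#include <cstddef>
#include <vector>

inline void addAssignBlockedSketch( std::vector<double>& y, const std::vector<double>& x,
                                    const std::vector<double>& A,  // column-major, leading dimension M
                                    std::size_t M, std::size_t N, double scalar )
{
   std::size_t j( 0UL );

   for( ; j+4UL <= N; j+=4UL ) {
      double s0( 0.0 ), s1( 0.0 ), s2( 0.0 ), s3( 0.0 );  // one accumulator per column
      for( std::size_t i=0UL; i<M; ++i ) {
         const double xi( x[i] );
         s0 += xi * A[i + (j    )*M];
         s1 += xi * A[i + (j+1UL)*M];
         s2 += xi * A[i + (j+2UL)*M];
         s3 += xi * A[i + (j+3UL)*M];
      }
      y[j    ] += s0 * scalar;
      y[j+1UL] += s1 * scalar;
      y[j+2UL] += s2 * scalar;
      y[j+3UL] += s3 * scalar;
   }

   for( ; j<N; ++j ) {  // remaining columns
      double s( 0.0 );
      for( std::size_t i=0UL; i<M; ++i )
         s += x[i] * A[i + j*M];
      y[j] += s * scalar;
   }
}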
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   selectDefaultAddAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectLargeAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j    ] += sum( x1 * A.load(i,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
         y[j+4UL] += sum( x1 * A.load(i,j+4UL) ) * scalar;
         y[j+5UL] += sum( x1 * A.load(i,j+5UL) ) * scalar;
         y[j+6UL] += sum( x1 * A.load(i,j+6UL) ) * scalar;
         y[j+7UL] += sum( x1 * A.load(i,j+7UL) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
         y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
         y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
         y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
         y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
      }
   }

   for( ; (j+4UL) <= N; j+=4UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j    ] += sum( x1 * A.load(i,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
         y[j+2UL] += sum( x1 * A.load(i,j+2UL) ) * scalar;
         y[j+3UL] += sum( x1 * A.load(i,j+3UL) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
      }
   }

   for( ; (j+2UL) <= N; j+=2UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j    ] += sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j    ] += sum( x1 * A.load(i,j    ) ) * scalar;
         y[j+1UL] += sum( x1 * A.load(i,j+1UL) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j    ] += x[i] * A(i,j    ) * scalar;
         y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
      }
   }

   if( j < N )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j] += sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j] += sum( x1 * A.load(i,j) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j] += x[i] * A(i,j) * scalar;
      }
   }
}
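// The large kernel differs from the small one mainly in the row direction:
// it consumes up to four SIMD chunks of x per iteration and folds each
// chunk's partial dot products into y immediately via sum(), so every loaded
// chunk of x is reused across all columns of the block. A scalar sketch of
// that row unrolling (factor two instead of four) is given below; names and
// indexing are illustrative assumptions only.
#include <cstddef>
#include <vector>

inline void addAssignRowUnrolledSketch( std::vector<double>& y, const std::vector<double>& x,
                                        const std::vector<double>& A,  // column-major, leading dimension M
                                        std::size_t M, std::size_t N, double scalar )
{
   for( std::size_t j=0UL; j<N; ++j ) {
      double s( 0.0 );
      std::size_t i( 0UL );
      for( ; i+1UL<M; i+=2UL ) {
         s += x[i    ] * A[i     + j*M]   // two partial products per iteration
            + x[i+1UL] * A[i+1UL + j*M];
      }
      for( ; i<M; ++i ) {                 // odd-row remainder
         s += x[i] * A[i + j*M];
      }
      y[j] += s * scalar;
   }
}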
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
{
   selectLargeAddAssignKernel( y, x, A, scalar );
}
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
{
   using ET = ElementType_t<VT1>;

   if( IsTriangular_v<MT1> ) {
      ResultType_t<VT1> tmp( serial( scalar * x ) );
      trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      addAssign( y, tmp );
   }
   else {
      gemv( y, x, A, ET(scalar), ET(1) );
   }
}
#endif
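// For BLAS-compatible element types the kernel above delegates to gemv() with
// alpha = scalar and beta = 1, i.e. it computes y += scalar * ( x * A );
// triangular matrices are routed through trmv() on a scaled copy of x
// instead, since trmv() works in place and offers no alpha/beta parameters.
// The following sketch expresses the same update directly in CBLAS terms for
// double precision; it assumes a CBLAS implementation is available and is
// not the wrapper actually used by Blaze.
#include <vector>
#include <cblas.h>

inline void addAssignGemvSketch( std::vector<double>& y, const std::vector<double>& x,
                                 const std::vector<double>& A,  // column-major M x N
                                 int M, int N, double scalar )
{
   cblas_dgemv( CblasColMajor, CblasTrans, M, N,
                scalar,       // alpha: the scalar factor of the expression
                A.data(), M,  // leading dimension = number of rows
                x.data(), 1,
                1.0,          // beta = 1 keeps the current contents of y
                y.data(), 1 );
}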
template< typename VT1 >  // Type of the target dense vector
friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
{
   LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
   RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

   if( right.rows() == 0UL || right.columns() == 0UL ) {
      return;
   }

   LT x( serial( left  ) );  // Evaluation of the left-hand side dense vector operand
   RT A( serial( right ) );  // Evaluation of the right-hand side dense matrix operand

   DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline void selectSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   if( ( IsDiagonal_v<MT1> ) ||
       ( IsComputation_v<MT> && !evaluateMatrix ) ||
       ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
      selectSmallSubAssignKernel( y, x, A, scalar );
   else
      selectBlasSubAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline void selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.subAssign( x * A * scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   selectDefaultSubAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectSmallSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
         xmm5 += x1 * A.load(i,j+4UL);
         xmm6 += x1 * A.load(i,j+5UL);
         xmm7 += x1 * A.load(i,j+6UL);
         xmm8 += x1 * A.load(i,j+7UL);
      }

      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
      y[j+2UL] -= sum( xmm3 ) * scalar;
      y[j+3UL] -= sum( xmm4 ) * scalar;
      y[j+4UL] -= sum( xmm5 ) * scalar;
      y[j+5UL] -= sum( xmm6 ) * scalar;
      y[j+6UL] -= sum( xmm7 ) * scalar;
      y[j+7UL] -= sum( xmm8 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
         y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
         y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
         y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
         y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
      }
   }

   for( ; (j+4UL) <= N; j+=4UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2, xmm3, xmm4;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
         xmm4 += x1 * A.load(i,j+3UL);
      }

      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
      y[j+2UL] -= sum( xmm3 ) * scalar;
      y[j+3UL] -= sum( xmm4 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
      }
   }

   for( ; (j+3UL) <= N; j+=3UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2, xmm3;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
         xmm3 += x1 * A.load(i,j+2UL);
      }

      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
      y[j+2UL] -= sum( xmm3 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
      }
   }

   for( ; (j+2UL) <= N; j+=2UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1, xmm2;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         const SIMDType x1( x.load(i) );
         xmm1 += x1 * A.load(i,j    );
         xmm2 += x1 * A.load(i,j+1UL);
      }

      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
      }
   }

   if( j < N )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      SIMDType xmm1;
      size_t i( ibegin );

      for( ; i<ipos; i+=SIMDSIZE ) {
         xmm1 += A.load(i,j) * x.load(i);
      }

      y[j] -= sum( xmm1 ) * scalar;

      for( ; remainder && i<iend; ++i ) {
         y[j] -= x[i] * A(i,j) * scalar;
      }
   }
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   selectDefaultSubAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectLargeSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
{
   constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
         y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
         y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
         y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
         y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
         y[j+4UL] -= sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
         y[j+5UL] -= sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
         y[j+6UL] -= sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
         y[j+7UL] -= sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j    ] -= sum( x1 * A.load(i,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
         y[j+4UL] -= sum( x1 * A.load(i,j+4UL) ) * scalar;
         y[j+5UL] -= sum( x1 * A.load(i,j+5UL) ) * scalar;
         y[j+6UL] -= sum( x1 * A.load(i,j+6UL) ) * scalar;
         y[j+7UL] -= sum( x1 * A.load(i,j+7UL) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
         y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
         y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
         y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
         y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
      }
   }

   for( ; (j+4UL) <= N; j+=4UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j    ] -= sum( x1 * A.load(i,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
         y[j+2UL] -= sum( x1 * A.load(i,j+2UL) ) * scalar;
         y[j+3UL] -= sum( x1 * A.load(i,j+3UL) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
         y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
         y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
      }
   }

   for( ; (j+2UL) <= N; j+=2UL )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) + x3 * A.load(i2,j    ) + x4 * A.load(i3,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j    ] -= sum( x1 * A.load(i,j    ) + x2 * A.load(i1,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j    ] -= sum( x1 * A.load(i,j    ) ) * scalar;
         y[j+1UL] -= sum( x1 * A.load(i,j+1UL) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j    ] -= x[i] * A(i,j    ) * scalar;
         y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
      }
   }

   if( j < N )
   {
      const size_t ibegin( ( IsLower_v<MT1> )
                           ?( ( IsStrictlyLower_v<MT1> ? j+1UL : j ) & size_t(-SIMDSIZE) )
                           :( 0UL ) );
      const size_t iend( ( IsUpper_v<MT1> )
                         ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
                         :( M ) );

      const size_t ipos( remainder ? ( iend & size_t(-SIMDSIZE) ) : iend );

      size_t i( ibegin );

      for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
      {
         const size_t i1( i+SIMDSIZE     );
         const size_t i2( i+SIMDSIZE*2UL );
         const size_t i3( i+SIMDSIZE*3UL );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );
         const SIMDType x3( x.load(i2) );
         const SIMDType x4( x.load(i3) );

         y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
      }

      for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
      {
         const size_t i1( i+SIMDSIZE );

         const SIMDType x1( x.load(i ) );
         const SIMDType x2( x.load(i1) );

         y[j] -= sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
      }

      for( ; i<ipos; i+=SIMDSIZE )
      {
         const SIMDType x1( x.load(i) );

         y[j] -= sum( x1 * A.load(i,j) ) * scalar;
      }

      for( ; remainder && i<iend; ++i ) {
         y[j] -= x[i] * A(i,j) * scalar;
      }
   }
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
{
   selectLargeSubAssignKernel( y, x, A, scalar );
}
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline auto selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
   -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
{
   using ET = ElementType_t<VT1>;

   if( IsTriangular_v<MT1> ) {
      ResultType_t<VT1> tmp( serial( scalar * x ) );
      trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
      subAssign( y, tmp );
   }
   else {
      gemv( y, x, A, ET(-scalar), ET(1) );
   }
}
#endif
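// The subtraction variant reuses the same BLAS path: the sign is folded into
// the scaling factor ( ET(-scalar) ), and for triangular matrices the trmv()
// result is passed to subAssign() instead of addAssign(). In the CBLAS sketch
// from above this amounts to nothing more than negating alpha; as before,
// the function below is an illustrative sketch, not the wrapper used by Blaze.
#include <vector>
#include <cblas.h>

inline void subAssignGemvSketch( std::vector<double>& y, const std::vector<double>& x,
                                 const std::vector<double>& A, int M, int N, double scalar )
{
   // y -= scalar * ( x * A )  is the same as  y += (-scalar) * ( x * A )
   cblas_dgemv( CblasColMajor, CblasTrans, M, N, -scalar,
                A.data(), M, x.data(), 1, 1.0, y.data(), 1 );
}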
template< typename VT1 >  // Type of the target dense vector
friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
{
   const ResultType tmp( serial( rhs ) );  // Evaluation of the complete expression
   multAssign( ~lhs, tmp );
}
template< typename VT1 >  // Type of the target dense vector
friend inline void divAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
{
   const ResultType tmp( serial( rhs ) );  // Evaluation of the complete expression
   divAssign( ~lhs, tmp );
}
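// Unlike addition and subtraction, element-wise multiplication and division
// assignment cannot be fused with the matrix/vector product, so both paths
// above first evaluate the scaled product into a temporary and then perform
// the element-wise update. The usage sketch below shows the user-level
// operations that end up in these paths; the concrete sizes and values are
// illustrative only.
#include <blaze/Math.h>

inline void compoundAssignExample()
{
   blaze::DynamicVector<double,blaze::rowVector> x{ 1.0, 2.0, 3.0 };
   blaze::DynamicVector<double,blaze::rowVector> y( 3UL, 1.0 );
   blaze::DynamicMatrix<double,blaze::columnMajor> A( 3UL, 3UL, 2.0 );

   y *= x * A * 2.0;  // element-wise multiplication by the scaled product
   y /= x * A * 2.0;  // element-wise division by the scaled product
}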
// SMP (shared-memory parallel) assignment variants for dense vector targets
template< typename VT1 >
   -> EnableIf_t< UseSMPAssign_v<VT1> >

   LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
   RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

   if( right.rows() == 0UL ) {
   else if( right.columns() == 0UL ) {

template< typename VT1 >
   -> EnableIf_t< UseSMPAssign_v<VT1> >

template< typename VT1 >
   -> EnableIf_t< UseSMPAssign_v<VT1> >

   LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
   RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

   if( right.rows() == 0UL || right.columns() == 0UL ) {

template< typename VT1 >
   -> EnableIf_t< UseSMPAssign_v<VT1> >

   LeftOperand_t<VMM>  left ( rhs.vector_.leftOperand()  );
   RightOperand_t<VMM> right( rhs.vector_.rightOperand() );

   if( right.rows() == 0UL || right.columns() == 0UL ) {

template< typename VT1 >
   -> EnableIf_t< UseSMPAssign_v<VT1> >

template< typename VT1 >
   -> EnableIf_t< UseSMPAssign_v<VT1> >
template< typename VT, typename MT >
inline decltype(auto)
   operator*( const DenseVector<VT,true>& vec, const DenseMatrix<MT,true>& mat )
{
   if( (~vec).size() != (~mat).rows() ) {
      BLAZE_THROW_INVALID_ARGUMENT( "Vector and matrix sizes do not match" );
   }

   using ReturnType = const TDVecTDMatMultExpr<VT,MT>;
   return ReturnType( ~vec, ~mat );
}
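// Typical usage of the multiplication operator defined above: the product of
// a dense row vector and a column-major dense matrix yields the lazy
// TDVecTDMatMultExpr, and the kernels of this file run once the expression is
// assigned (or add/subtract-assigned) to a dense vector. The concrete types
// and values below are illustrative only.
#include <blaze/Math.h>
#include <iostream>

inline void tdvecTdmatUsageExample()
{
   blaze::DynamicVector<double,blaze::rowVector> x{ 1.0, 2.0, 3.0 };

   blaze::DynamicMatrix<double,blaze::columnMajor> A{ { 1.0, 4.0 },
                                                      { 2.0, 5.0 },
                                                      { 3.0, 6.0 } };

   blaze::DynamicVector<double,blaze::rowVector> y;
   y  = x * A;         // y = ( 14, 32 )
   y += x * A * 2.0;   // scaled expression, addition-assignment path

   std::cout << y << "\n";
}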
template< typename VT, typename MT >
struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
   : public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
{};