35#ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36#define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
124 :
public TVecMatMultExpr< DenseVector< TDVecTDMatMultExpr<VT,MT>, true > >
139 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
145 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
146 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
155 template<
typename T1 >
165 template<
typename T1,
typename T2,
typename T3 >
166 static constexpr bool UseBlasKernel_v =
168 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
169 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
170 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
172 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
173 IsBLASCompatible_v< ElementType_t<T1> > &&
174 IsBLASCompatible_v< ElementType_t<T2> > &&
175 IsBLASCompatible_v< ElementType_t<T3> > &&
187 template<
typename T1,
typename T2,
typename T3 >
188 static constexpr bool UseVectorizedDefaultKernel_v =
189 ( useOptimizedKernels &&
191 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
192 IsSIMDCombinable_v< ElementType_t<T1>
231 ( !IsDiagonal_v<MT> &&
232 VT::simdEnabled && MT::simdEnabled &&
233 HasSIMDAdd_v<VET,MET> &&
234 HasSIMDMult_v<VET,MET> );
269 if( IsDiagonal_v<MT> )
271 return vec_[index] *
mat_(index,index);
273 else if( IsLower_v<MT> && ( index > 8UL ) )
275 const size_t begin( IsStrictlyLower_v<MT> ? index+1UL : index );
280 else if( IsUpper_v<MT> && ( index + 8UL <
mat_.rows() ) )
282 const size_t n( IsStrictlyUpper_v<MT> ? index : index+1UL );
301 if( index >=
mat_.columns() ) {
304 return (*
this)[index];
// Returns the size of the vector produced by this expression, which is the
// number of columns of the matrix operand (mat_.columns()). Never throws.
313 inline size_t size() const noexcept {
314 return mat_.columns();
// Reports whether this expression can alias with the object at the given address.
//   alias: address of the object to test against both operands.
//   returns: true if either the vector operand (vec_) or the matrix operand
//            (mat_) reports being aliased with `alias`, false otherwise.
344 template<
typename T >
345 inline bool canAlias(
const T* alias )
const noexcept {
346 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Reports whether this expression is aliased with the object at the given address.
//   alias: address of the object to test against both operands.
//   returns: true if vec_.isAliased(alias) or mat_.isAliased(alias) is true.
356 template<
typename T >
357 inline bool isAliased(
const T* alias )
const noexcept {
358 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
368 return vec_.isAligned() &&
mat_.isAligned();
382 (
mat_.rows() *
mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
383 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
406 template<
typename VT1 >
413 if( rhs.
mat_.rows() == 0UL ||
414 ( IsStrictlyTriangular_v<MT> && rhs.
mat_.rows() == 1UL ) ) {
418 else if( rhs.
mat_.columns() == 0UL ) {
430 TDVecTDMatMultExpr::selectAssignKernel( *lhs, x, A );
// Kernel dispatcher for the assignment of a vector/matrix product.
//   y: target vector, x: vector operand, A: matrix operand.
// The small kernel is chosen when MT1 is a diagonal matrix type or when the
// element count A.rows()*A.columns() is below TDVECTDMATMULT_THRESHOLD;
// selectBlasAssignKernel is the other dispatch target.
// NOTE(review): any further condition between the two shown, and the else that
// presumably guards the BLAS call, are not visible in this view -- confirm.
446 template<
typename VT1
449 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
451 if( ( IsDiagonal_v<MT1> ) ||
453 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
454 selectSmallAssignKernel( y, x, A );
456 selectBlasAssignKernel( y, x, A );
475 template<
typename VT1
478 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Small-matrix assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> does not hold; it does no work of
// its own and forwards all three arguments to selectDefaultAssignKernel.
499 template<
typename VT1
502 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
503 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
505 selectDefaultAssignKernel( y, x, A );
524 template<
typename VT1
527 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
528 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
530 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
532 const size_t M( A.rows() );
533 const size_t N( A.columns() );
537 for( ; (j+8UL) <= N; j+=8UL )
539 const size_t ibegin( ( IsLower_v<MT1> )
542 const size_t iend( ( IsUpper_v<MT1> )
543 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
556 SIMDType xmm2( x1 * A.load(i,j+1UL) );
557 SIMDType xmm3( x1 * A.load(i,j+2UL) );
558 SIMDType xmm4( x1 * A.load(i,j+3UL) );
559 SIMDType xmm5( x1 * A.load(i,j+4UL) );
560 SIMDType xmm6( x1 * A.load(i,j+5UL) );
561 SIMDType xmm7( x1 * A.load(i,j+6UL) );
562 SIMDType xmm8( x1 * A.load(i,j+7UL) );
566 xmm1 += x1 * A.load(i,j );
567 xmm2 += x1 * A.load(i,j+1UL);
568 xmm3 += x1 * A.load(i,j+2UL);
569 xmm4 += x1 * A.load(i,j+3UL);
570 xmm5 += x1 * A.load(i,j+4UL);
571 xmm6 += x1 * A.load(i,j+5UL);
572 xmm7 += x1 * A.load(i,j+6UL);
573 xmm8 += x1 * A.load(i,j+7UL);
577 y[j+1UL] =
sum( xmm2 );
578 y[j+2UL] =
sum( xmm3 );
579 y[j+3UL] =
sum( xmm4 );
580 y[j+4UL] =
sum( xmm5 );
581 y[j+5UL] =
sum( xmm6 );
582 y[j+6UL] =
sum( xmm7 );
583 y[j+7UL] =
sum( xmm8 );
585 for( ; remainder && i<iend; ++i ) {
586 y[j ] += x[i] * A(i,j );
587 y[j+1UL] += x[i] * A(i,j+1UL);
588 y[j+2UL] += x[i] * A(i,j+2UL);
589 y[j+3UL] += x[i] * A(i,j+3UL);
590 y[j+4UL] += x[i] * A(i,j+4UL);
591 y[j+5UL] += x[i] * A(i,j+5UL);
592 y[j+6UL] += x[i] * A(i,j+6UL);
593 y[j+7UL] += x[i] * A(i,j+7UL);
607 for( ++i; i<iend; ++i ) {
608 value1 += x[i] * A(i,j );
609 value2 += x[i] * A(i,j+1UL);
610 value3 += x[i] * A(i,j+2UL);
611 value4 += x[i] * A(i,j+3UL);
612 value5 += x[i] * A(i,j+4UL);
613 value6 += x[i] * A(i,j+5UL);
614 value7 += x[i] * A(i,j+6UL);
615 value8 += x[i] * A(i,j+7UL);
629 for( ; (j+4UL) <= N; j+=4UL )
631 const size_t ibegin( ( IsLower_v<MT1> )
634 const size_t iend( ( IsUpper_v<MT1> )
635 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
648 SIMDType xmm2( x1 * A.load(i,j+1UL) );
649 SIMDType xmm3( x1 * A.load(i,j+2UL) );
650 SIMDType xmm4( x1 * A.load(i,j+3UL) );
654 xmm1 += x1 * A.load(i,j );
655 xmm2 += x1 * A.load(i,j+1UL);
656 xmm3 += x1 * A.load(i,j+2UL);
657 xmm4 += x1 * A.load(i,j+3UL);
661 y[j+1UL] =
sum( xmm2 );
662 y[j+2UL] =
sum( xmm3 );
663 y[j+3UL] =
sum( xmm4 );
665 for( ; remainder && i<iend; ++i ) {
666 y[j ] += x[i] * A(i,j );
667 y[j+1UL] += x[i] * A(i,j+1UL);
668 y[j+2UL] += x[i] * A(i,j+2UL);
669 y[j+3UL] += x[i] * A(i,j+3UL);
679 for( ++i; i<iend; ++i ) {
680 value1 += x[i] * A(i,j );
681 value2 += x[i] * A(i,j+1UL);
682 value3 += x[i] * A(i,j+2UL);
683 value4 += x[i] * A(i,j+3UL);
693 for( ; (j+3UL) <= N; j+=3UL )
695 const size_t ibegin( ( IsLower_v<MT1> )
698 const size_t iend( ( IsUpper_v<MT1> )
699 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
712 SIMDType xmm2( x1 * A.load(i,j+1UL) );
713 SIMDType xmm3( x1 * A.load(i,j+2UL) );
717 xmm1 += x1 * A.load(i,j );
718 xmm2 += x1 * A.load(i,j+1UL);
719 xmm3 += x1 * A.load(i,j+2UL);
723 y[j+1UL] =
sum( xmm2 );
724 y[j+2UL] =
sum( xmm3 );
726 for( ; remainder && i<iend; ++i ) {
727 y[j ] += x[i] * A(i,j );
728 y[j+1UL] += x[i] * A(i,j+1UL);
729 y[j+2UL] += x[i] * A(i,j+2UL);
738 for( ++i; i<iend; ++i ) {
739 value1 += x[i] * A(i,j );
740 value2 += x[i] * A(i,j+1UL);
741 value3 += x[i] * A(i,j+2UL);
750 for( ; (j+2UL) <= N; j+=2UL )
752 const size_t ibegin( ( IsLower_v<MT1> )
755 const size_t iend( ( IsUpper_v<MT1> )
756 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
769 SIMDType xmm2( x1 * A.load(i,j+1UL) );
773 xmm1 += x1 * A.load(i,j );
774 xmm2 += x1 * A.load(i,j+1UL);
778 y[j+1UL] =
sum( xmm2 );
780 for( ; remainder && i<iend; ++i ) {
781 y[j ] += x[i] * A(i,j );
782 y[j+1UL] += x[i] * A(i,j+1UL);
790 for( ++i; i<iend; ++i ) {
791 value1 += x[i] * A(i,j );
792 value2 += x[i] * A(i,j+1UL);
802 const size_t ibegin( ( IsLower_v<MT1> )
805 const size_t iend( ( IsUpper_v<MT1> )
806 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
817 SIMDType xmm1( x.load(i) * A.load(i,j) );
820 xmm1 += x.load(i) * A.load(i,j);
825 for( ; remainder && i<iend; ++i ) {
826 y[j] += x[i] * A(i,j);
833 for( ++i; i<iend; ++i ) {
834 value += x[i] * A(i,j);
// Large-matrix assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> does not hold; it forwards
// unchanged to selectDefaultAssignKernel.
858 template<
typename VT1
861 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
862 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
864 selectDefaultAssignKernel( y, x, A );
883 template<
typename VT1
886 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
887 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
889 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
891 const size_t M( A.rows() );
892 const size_t N( A.columns() );
898 for( ; (j+8UL) <= N; j+=8UL )
900 const size_t ibegin( ( IsLower_v<MT1> )
903 const size_t iend( ( IsUpper_v<MT1> )
904 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
921 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
922 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
923 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
924 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
925 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
926 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
927 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
928 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
935 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
936 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
937 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
938 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
939 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
940 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
941 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
942 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
947 y[j ] +=
sum( x1 * A.load(i,j ) );
948 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
949 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
950 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
951 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
952 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
953 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
954 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
957 for( ; remainder && i<iend; ++i ) {
958 y[j ] += x[i] * A(i,j );
959 y[j+1UL] += x[i] * A(i,j+1UL);
960 y[j+2UL] += x[i] * A(i,j+2UL);
961 y[j+3UL] += x[i] * A(i,j+3UL);
962 y[j+4UL] += x[i] * A(i,j+4UL);
963 y[j+5UL] += x[i] * A(i,j+5UL);
964 y[j+6UL] += x[i] * A(i,j+6UL);
965 y[j+7UL] += x[i] * A(i,j+7UL);
969 for( ; (j+4UL) <= N; j+=4UL )
971 const size_t ibegin( ( IsLower_v<MT1> )
974 const size_t iend( ( IsUpper_v<MT1> )
975 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
992 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
993 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
994 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
995 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1002 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1003 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1004 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1005 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1010 y[j ] +=
sum( x1 * A.load(i,j ) );
1011 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1012 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1013 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1016 for( ; remainder && i<iend; ++i ) {
1017 y[j ] += x[i] * A(i,j );
1018 y[j+1UL] += x[i] * A(i,j+1UL);
1019 y[j+2UL] += x[i] * A(i,j+2UL);
1020 y[j+3UL] += x[i] * A(i,j+3UL);
1024 for( ; (j+2UL) <= N; j+=2UL )
1026 const size_t ibegin( ( IsLower_v<MT1> )
1029 const size_t iend( ( IsUpper_v<MT1> )
1030 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1047 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1048 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1055 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1056 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1061 y[j ] +=
sum( x1 * A.load(i,j ) );
1062 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1065 for( ; remainder && i<iend; ++i ) {
1066 y[j ] += x[i] * A(i,j );
1067 y[j+1UL] += x[i] * A(i,j+1UL);
1073 const size_t ibegin( ( IsLower_v<MT1> )
1076 const size_t iend( ( IsUpper_v<MT1> )
1077 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1094 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1101 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1106 y[j] +=
sum( x1 * A.load(i,j) );
1109 for( ; remainder && i<iend; ++i ) {
1110 y[j] += x[i] * A(i,j);
// BLAS assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseBlasKernel_v<VT1,VT2,MT1> does not hold; the call is then delegated to
// selectLargeAssignKernel instead of a BLAS routine.
1131 template<
typename VT1
1134 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1135 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1137 selectLargeAssignKernel( y, x, A );
1143#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS assignment kernel, BLAS-backed overload (EnableIf_t on
// UseBlasKernel_v<VT1,VT2,MT1>).
//   y: target vector, x: vector operand, A: matrix operand.
1157 template<
typename VT1
1160 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1161 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
// ET is the element type of the target vector; it is used for the gemv scalars.
1163 using ET = ElementType_t<VT1>;
// Triangular matrix types go through trmv, with the lower/upper flag chosen
// from IsLower_v<MT1>.
1165 if( IsTriangular_v<MT1> ) {
// NOTE(review): trmv receives only y and A, so y presumably already holds a
// copy of x at this point -- the statement that would do this is not visible.
1167 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// General case: gemv with scalar arguments ET(1) and ET(0).
// NOTE(review): presumably alpha=1 and beta=0 (y is overwritten) -- confirm
// against the gemv wrapper's parameter order.
1170 gemv( y, x, A, ET(1), ET(0) );
// Assignment of the product expression to a transpose sparse vector.
//   lhs: target sparse vector, rhs: the product expression.
// The visible step assigns a local object `tmp` to the dereferenced target.
// NOTE(review): tmp is presumably rhs evaluated into a temporary result
// vector; its declaration is not visible in this view -- confirm.
1190 template<
typename VT1 >
1191 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
1202 assign( *lhs, tmp );
// Addition assignment of the product expression to a transpose dense vector.
//   lhs: target dense vector, rhs: the product expression.
1220 template<
typename VT1 >
1221 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// Degenerate cases: the matrix has no rows or no columns, or MT is strictly
// triangular and the matrix has a single row.
// NOTE(review): presumably an early return, since the product would add
// nothing in these cases -- the body of this branch is not visible.
1227 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
1228 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Dispatch to the kernel selector with the dereferenced target.
// NOTE(review): x and A are presumably the (possibly evaluated) operands of
// rhs; their declarations are not visible in this view.
1240 TDVecTDMatMultExpr::selectAddAssignKernel( *lhs, x, A );
// Kernel dispatcher for the addition assignment of a vector/matrix product.
//   y: target vector, x: vector operand, A: matrix operand.
// The small kernel is chosen when MT1 is a diagonal matrix type or when
// A.rows()*A.columns() is below TDVECTDMATMULT_THRESHOLD;
// selectBlasAddAssignKernel is the other dispatch target.
// NOTE(review): any further condition between the two shown, and the else that
// presumably guards the BLAS call, are not visible in this view -- confirm.
1256 template<
typename VT1
1259 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1261 if( ( IsDiagonal_v<MT1> ) ||
1263 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1264 selectSmallAddAssignKernel( y, x, A );
1266 selectBlasAddAssignKernel( y, x, A );
// Default addition assignment kernel.
// Builds the product expression x * A and hands it to the target's own
// addAssign member, so the actual evaluation is delegated to y.
1285 template<
typename VT1
1288 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1290 y.addAssign( x * A );
// Small-matrix addition assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> does not hold; it forwards
// unchanged to selectDefaultAddAssignKernel.
1309 template<
typename VT1
1312 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1313 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1315 selectDefaultAddAssignKernel( y, x, A );
1335 template<
typename VT1
1338 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1339 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1341 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1343 const size_t M( A.rows() );
1344 const size_t N( A.columns() );
1348 for( ; (j+8UL) <= N; j+=8UL )
1350 const size_t ibegin( ( IsLower_v<MT1> )
1353 const size_t iend( ( IsUpper_v<MT1> )
1354 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1366 SIMDType xmm1( x1 * A.load(i,j ) );
1367 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1368 SIMDType xmm3( x1 * A.load(i,j+2UL) );
1369 SIMDType xmm4( x1 * A.load(i,j+3UL) );
1370 SIMDType xmm5( x1 * A.load(i,j+4UL) );
1371 SIMDType xmm6( x1 * A.load(i,j+5UL) );
1372 SIMDType xmm7( x1 * A.load(i,j+6UL) );
1373 SIMDType xmm8( x1 * A.load(i,j+7UL) );
1377 xmm1 += x1 * A.load(i,j );
1378 xmm2 += x1 * A.load(i,j+1UL);
1379 xmm3 += x1 * A.load(i,j+2UL);
1380 xmm4 += x1 * A.load(i,j+3UL);
1381 xmm5 += x1 * A.load(i,j+4UL);
1382 xmm6 += x1 * A.load(i,j+5UL);
1383 xmm7 += x1 * A.load(i,j+6UL);
1384 xmm8 += x1 * A.load(i,j+7UL);
1387 y[j ] +=
sum( xmm1 );
1388 y[j+1UL] +=
sum( xmm2 );
1389 y[j+2UL] +=
sum( xmm3 );
1390 y[j+3UL] +=
sum( xmm4 );
1391 y[j+4UL] +=
sum( xmm5 );
1392 y[j+5UL] +=
sum( xmm6 );
1393 y[j+6UL] +=
sum( xmm7 );
1394 y[j+7UL] +=
sum( xmm8 );
1396 for( ; remainder && i<iend; ++i ) {
1397 y[j ] += x[i] * A(i,j );
1398 y[j+1UL] += x[i] * A(i,j+1UL);
1399 y[j+2UL] += x[i] * A(i,j+2UL);
1400 y[j+3UL] += x[i] * A(i,j+3UL);
1401 y[j+4UL] += x[i] * A(i,j+4UL);
1402 y[j+5UL] += x[i] * A(i,j+5UL);
1403 y[j+6UL] += x[i] * A(i,j+6UL);
1404 y[j+7UL] += x[i] * A(i,j+7UL);
1418 for( ++i; i<iend; ++i ) {
1419 value1 += x[i] * A(i,j );
1420 value2 += x[i] * A(i,j+1UL);
1421 value3 += x[i] * A(i,j+2UL);
1422 value4 += x[i] * A(i,j+3UL);
1423 value5 += x[i] * A(i,j+4UL);
1424 value6 += x[i] * A(i,j+5UL);
1425 value7 += x[i] * A(i,j+6UL);
1426 value8 += x[i] * A(i,j+7UL);
1440 for( ; (j+4UL) <= N; j+=4UL )
1442 const size_t ibegin( ( IsLower_v<MT1> )
1445 const size_t iend( ( IsUpper_v<MT1> )
1446 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1458 SIMDType xmm1( x1 * A.load(i,j ) );
1459 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1460 SIMDType xmm3( x1 * A.load(i,j+2UL) );
1461 SIMDType xmm4( x1 * A.load(i,j+3UL) );
1465 xmm1 += x1 * A.load(i,j );
1466 xmm2 += x1 * A.load(i,j+1UL);
1467 xmm3 += x1 * A.load(i,j+2UL);
1468 xmm4 += x1 * A.load(i,j+3UL);
1471 y[j ] +=
sum( xmm1 );
1472 y[j+1UL] +=
sum( xmm2 );
1473 y[j+2UL] +=
sum( xmm3 );
1474 y[j+3UL] +=
sum( xmm4 );
1476 for( ; remainder && i<iend; ++i ) {
1477 y[j ] += x[i] * A(i,j );
1478 y[j+1UL] += x[i] * A(i,j+1UL);
1479 y[j+2UL] += x[i] * A(i,j+2UL);
1480 y[j+3UL] += x[i] * A(i,j+3UL);
1490 for( ++i; i<iend; ++i ) {
1491 value1 += x[i] * A(i,j );
1492 value2 += x[i] * A(i,j+1UL);
1493 value3 += x[i] * A(i,j+2UL);
1494 value4 += x[i] * A(i,j+3UL);
1504 for( ; (j+3UL) <= N; j+=3UL )
1506 const size_t ibegin( ( IsLower_v<MT1> )
1509 const size_t iend( ( IsUpper_v<MT1> )
1510 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
1522 SIMDType xmm1( x1 * A.load(i,j ) );
1523 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1524 SIMDType xmm3( x1 * A.load(i,j+2UL) );
1528 xmm1 += x1 * A.load(i,j );
1529 xmm2 += x1 * A.load(i,j+1UL);
1530 xmm3 += x1 * A.load(i,j+2UL);
1533 y[j ] +=
sum( xmm1 );
1534 y[j+1UL] +=
sum( xmm2 );
1535 y[j+2UL] +=
sum( xmm3 );
1537 for( ; remainder && i<iend; ++i ) {
1538 y[j ] += x[i] * A(i,j );
1539 y[j+1UL] += x[i] * A(i,j+1UL);
1540 y[j+2UL] += x[i] * A(i,j+2UL);
1549 for( ++i; i<iend; ++i ) {
1550 value1 += x[i] * A(i,j );
1551 value2 += x[i] * A(i,j+1UL);
1552 value3 += x[i] * A(i,j+2UL);
1561 for( ; (j+2UL) <= N; j+=2UL )
1563 const size_t ibegin( ( IsLower_v<MT1> )
1566 const size_t iend( ( IsUpper_v<MT1> )
1567 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1579 SIMDType xmm1( x1 * A.load(i,j ) );
1580 SIMDType xmm2( x1 * A.load(i,j+1UL) );
1584 xmm1 += x1 * A.load(i,j );
1585 xmm2 += x1 * A.load(i,j+1UL);
1588 y[j ] +=
sum( xmm1 );
1589 y[j+1UL] +=
sum( xmm2 );
1591 for( ; remainder && i<iend; ++i ) {
1592 y[j ] += x[i] * A(i,j );
1593 y[j+1UL] += x[i] * A(i,j+1UL);
1601 for( ++i; i<iend; ++i ) {
1602 value1 += x[i] * A(i,j );
1603 value2 += x[i] * A(i,j+1UL);
1613 const size_t ibegin( ( IsLower_v<MT1> )
1616 const size_t iend( ( IsUpper_v<MT1> )
1617 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1628 SIMDType xmm1( x.load(i) * A.load(i,j) );
1631 xmm1 += A.load(i,j) * x.load(i);
1634 y[j] +=
sum( xmm1 );
1636 for( ; remainder && i<iend; ++i ) {
1637 y[j] += x[i] * A(i,j);
1644 for( ++i; i<iend; ++i ) {
1645 value += x[i] * A(i,j);
// Large-matrix addition assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> does not hold; it forwards
// unchanged to selectDefaultAddAssignKernel.
1669 template<
typename VT1
1672 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1673 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1675 selectDefaultAddAssignKernel( y, x, A );
1695 template<
typename VT1
1698 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1699 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
1701 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
1703 const size_t M( A.rows() );
1704 const size_t N( A.columns() );
1708 for( ; (j+8UL) <= N; j+=8UL )
1710 const size_t ibegin( ( IsLower_v<MT1> )
1713 const size_t iend( ( IsUpper_v<MT1> )
1714 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
1731 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1732 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1733 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1734 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1735 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1736 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1737 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1738 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1745 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1746 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1747 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1748 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1749 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1750 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1751 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1752 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1757 y[j ] +=
sum( x1 * A.load(i,j ) );
1758 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1759 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1760 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1761 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
1762 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
1763 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
1764 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
1767 for( ; remainder && i<iend; ++i ) {
1768 y[j ] += x[i] * A(i,j );
1769 y[j+1UL] += x[i] * A(i,j+1UL);
1770 y[j+2UL] += x[i] * A(i,j+2UL);
1771 y[j+3UL] += x[i] * A(i,j+3UL);
1772 y[j+4UL] += x[i] * A(i,j+4UL);
1773 y[j+5UL] += x[i] * A(i,j+5UL);
1774 y[j+6UL] += x[i] * A(i,j+6UL);
1775 y[j+7UL] += x[i] * A(i,j+7UL);
1779 for( ; (j+4UL) <= N; j+=4UL )
1781 const size_t ibegin( ( IsLower_v<MT1> )
1784 const size_t iend( ( IsUpper_v<MT1> )
1785 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
1802 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1803 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1804 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1805 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1812 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1813 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1814 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1815 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1820 y[j ] +=
sum( x1 * A.load(i,j ) );
1821 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1822 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1823 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1826 for( ; remainder && i<iend; ++i ) {
1827 y[j ] += x[i] * A(i,j );
1828 y[j+1UL] += x[i] * A(i,j+1UL);
1829 y[j+2UL] += x[i] * A(i,j+2UL);
1830 y[j+3UL] += x[i] * A(i,j+3UL);
1834 for( ; (j+2UL) <= N; j+=2UL )
1836 const size_t ibegin( ( IsLower_v<MT1> )
1839 const size_t iend( ( IsUpper_v<MT1> )
1840 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
1857 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1858 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1865 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1866 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1871 y[j ] +=
sum( x1 * A.load(i,j ) );
1872 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1875 for( ; remainder && i<iend; ++i ) {
1876 y[j ] += x[i] * A(i,j );
1877 y[j+1UL] += x[i] * A(i,j+1UL);
1883 const size_t ibegin( ( IsLower_v<MT1> )
1886 const size_t iend( ( IsUpper_v<MT1> )
1887 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
1904 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1911 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1916 y[j] +=
sum( x1 * A.load(i,j) );
1919 for( ; remainder && i<iend; ++i ) {
1920 y[j] += x[i] * A(i,j);
// BLAS addition assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseBlasKernel_v<VT1,VT2,MT1> does not hold; the call is then delegated to
// selectLargeAddAssignKernel instead of a BLAS routine.
1941 template<
typename VT1
1944 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1945 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
1947 selectLargeAddAssignKernel( y, x, A );
1953#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS addition assignment kernel, BLAS-backed overload (EnableIf_t on
// UseBlasKernel_v<VT1,VT2,MT1>).
//   y: target vector, x: vector operand, A: matrix operand.
1967 template<
typename VT1
1970 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1971 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
// ET is the element type of the target vector; it is used for the gemv scalars.
1973 using ET = ElementType_t<VT1>;
1975 if( IsTriangular_v<MT1> ) {
// Triangular case: copy x into a temporary of y's result type, apply trmv to
// that temporary (lower/upper flag from IsLower_v<MT1>), then add the
// temporary to y. The temporary keeps y's existing contents intact until the
// final addAssign.
1976 ResultType_t<VT1> tmp(
serial( x ) );
1977 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
1978 addAssign( y, tmp );
// General case: gemv with scalar arguments ET(1) and ET(1).
// NOTE(review): presumably alpha=1 and beta=1 (result is accumulated into y)
// -- confirm against the gemv wrapper's parameter order.
1981 gemv( y, x, A, ET(1), ET(1) );
// Subtraction assignment of the product expression to a transpose dense vector.
//   lhs: target dense vector, rhs: the product expression.
2005 template<
typename VT1 >
2006 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// Degenerate cases: the matrix has no rows or no columns, or MT is strictly
// triangular and the matrix has a single row.
// NOTE(review): presumably an early return, since the product would subtract
// nothing in these cases -- the body of this branch is not visible.
2012 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2013 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Dispatch to the kernel selector with the dereferenced target.
// NOTE(review): x and A are presumably the (possibly evaluated) operands of
// rhs; their declarations are not visible in this view.
2025 TDVecTDMatMultExpr::selectSubAssignKernel( *lhs, x, A );
// Kernel dispatcher for the subtraction assignment of a vector/matrix product.
//   y: target vector, x: vector operand, A: matrix operand.
// The small kernel is chosen when MT1 is a diagonal matrix type or when
// A.rows()*A.columns() is below TDVECTDMATMULT_THRESHOLD;
// selectBlasSubAssignKernel is the other dispatch target.
// NOTE(review): any further condition between the two shown, and the else that
// presumably guards the BLAS call, are not visible in this view -- confirm.
2041 template<
typename VT1
2044 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2046 if( ( IsDiagonal_v<MT1> ) ||
2048 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
2049 selectSmallSubAssignKernel( y, x, A );
2051 selectBlasSubAssignKernel( y, x, A );
// Default subtraction assignment kernel.
// Builds the product expression x * A and hands it to the target's own
// subAssign member, so the actual evaluation is delegated to y.
2070 template<
typename VT1
2073 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2075 y.subAssign( x * A );
// Small-matrix subtraction assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> does not hold; it forwards
// unchanged to selectDefaultSubAssignKernel.
2094 template<
typename VT1
2097 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2098 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2100 selectDefaultSubAssignKernel( y, x, A );
2120 template<
typename VT1
2123 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2124 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2126 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
2128 const size_t M( A.rows() );
2129 const size_t N( A.columns() );
2133 for( ; (j+8UL) <= N; j+=8UL )
2135 const size_t ibegin( ( IsLower_v<MT1> )
2138 const size_t iend( ( IsUpper_v<MT1> )
2139 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
2151 SIMDType xmm1( x1 * A.load(i,j ) );
2152 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2153 SIMDType xmm3( x1 * A.load(i,j+2UL) );
2154 SIMDType xmm4( x1 * A.load(i,j+3UL) );
2155 SIMDType xmm5( x1 * A.load(i,j+4UL) );
2156 SIMDType xmm6( x1 * A.load(i,j+5UL) );
2157 SIMDType xmm7( x1 * A.load(i,j+6UL) );
2158 SIMDType xmm8( x1 * A.load(i,j+7UL) );
2162 xmm1 += x1 * A.load(i,j );
2163 xmm2 += x1 * A.load(i,j+1UL);
2164 xmm3 += x1 * A.load(i,j+2UL);
2165 xmm4 += x1 * A.load(i,j+3UL);
2166 xmm5 += x1 * A.load(i,j+4UL);
2167 xmm6 += x1 * A.load(i,j+5UL);
2168 xmm7 += x1 * A.load(i,j+6UL);
2169 xmm8 += x1 * A.load(i,j+7UL);
2172 y[j ] -=
sum( xmm1 );
2173 y[j+1UL] -=
sum( xmm2 );
2174 y[j+2UL] -=
sum( xmm3 );
2175 y[j+3UL] -=
sum( xmm4 );
2176 y[j+4UL] -=
sum( xmm5 );
2177 y[j+5UL] -=
sum( xmm6 );
2178 y[j+6UL] -=
sum( xmm7 );
2179 y[j+7UL] -=
sum( xmm8 );
2181 for( ; remainder && i<iend; ++i ) {
2182 y[j ] -= x[i] * A(i,j );
2183 y[j+1UL] -= x[i] * A(i,j+1UL);
2184 y[j+2UL] -= x[i] * A(i,j+2UL);
2185 y[j+3UL] -= x[i] * A(i,j+3UL);
2186 y[j+4UL] -= x[i] * A(i,j+4UL);
2187 y[j+5UL] -= x[i] * A(i,j+5UL);
2188 y[j+6UL] -= x[i] * A(i,j+6UL);
2189 y[j+7UL] -= x[i] * A(i,j+7UL);
2203 for( ++i; i<iend; ++i ) {
2204 value1 += x[i] * A(i,j );
2205 value2 += x[i] * A(i,j+1UL);
2206 value3 += x[i] * A(i,j+2UL);
2207 value4 += x[i] * A(i,j+3UL);
2208 value5 += x[i] * A(i,j+4UL);
2209 value6 += x[i] * A(i,j+5UL);
2210 value7 += x[i] * A(i,j+6UL);
2211 value8 += x[i] * A(i,j+7UL);
2225 for( ; (j+4UL) <= N; j+=4UL )
2227 const size_t ibegin( ( IsLower_v<MT1> )
2230 const size_t iend( ( IsUpper_v<MT1> )
2231 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
2243 SIMDType xmm1( x1 * A.load(i,j ) );
2244 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2245 SIMDType xmm3( x1 * A.load(i,j+2UL) );
2246 SIMDType xmm4( x1 * A.load(i,j+3UL) );
2250 xmm1 += x1 * A.load(i,j );
2251 xmm2 += x1 * A.load(i,j+1UL);
2252 xmm3 += x1 * A.load(i,j+2UL);
2253 xmm4 += x1 * A.load(i,j+3UL);
2256 y[j ] -=
sum( xmm1 );
2257 y[j+1UL] -=
sum( xmm2 );
2258 y[j+2UL] -=
sum( xmm3 );
2259 y[j+3UL] -=
sum( xmm4 );
2261 for( ; remainder && i<iend; ++i ) {
2262 y[j ] -= x[i] * A(i,j );
2263 y[j+1UL] -= x[i] * A(i,j+1UL);
2264 y[j+2UL] -= x[i] * A(i,j+2UL);
2265 y[j+3UL] -= x[i] * A(i,j+3UL);
2275 for( ++i; i<iend; ++i ) {
2276 value1 += x[i] * A(i,j );
2277 value2 += x[i] * A(i,j+1UL);
2278 value3 += x[i] * A(i,j+2UL);
2279 value4 += x[i] * A(i,j+3UL);
2289 for( ; (j+3UL) <= N; j+=3UL )
2291 const size_t ibegin( ( IsLower_v<MT1> )
2294 const size_t iend( ( IsUpper_v<MT1> )
2295 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
2307 SIMDType xmm1( x1 * A.load(i,j ) );
2308 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2309 SIMDType xmm3( x1 * A.load(i,j+2UL) );
2313 xmm1 += x1 * A.load(i,j );
2314 xmm2 += x1 * A.load(i,j+1UL);
2315 xmm3 += x1 * A.load(i,j+2UL);
2318 y[j ] -=
sum( xmm1 );
2319 y[j+1UL] -=
sum( xmm2 );
2320 y[j+2UL] -=
sum( xmm3 );
2322 for( ; remainder && i<iend; ++i ) {
2323 y[j ] -= x[i] * A(i,j );
2324 y[j+1UL] -= x[i] * A(i,j+1UL);
2325 y[j+2UL] -= x[i] * A(i,j+2UL);
2334 for( ++i; i<iend; ++i ) {
2335 value1 += x[i] * A(i,j );
2336 value2 += x[i] * A(i,j+1UL);
2337 value3 += x[i] * A(i,j+2UL);
2346 for( ; (j+2UL) <= N; j+=2UL )
2348 const size_t ibegin( ( IsLower_v<MT1> )
2351 const size_t iend( ( IsUpper_v<MT1> )
2352 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
2364 SIMDType xmm1( x1 * A.load(i,j ) );
2365 SIMDType xmm2( x1 * A.load(i,j+1UL) );
2369 xmm1 += x1 * A.load(i,j );
2370 xmm2 += x1 * A.load(i,j+1UL);
2373 y[j ] -=
sum( xmm1 );
2374 y[j+1UL] -=
sum( xmm2 );
2376 for( ; remainder && i<iend; ++i ) {
2377 y[j ] -= x[i] * A(i,j );
2378 y[j+1UL] -= x[i] * A(i,j+1UL);
2386 for( ++i; i<iend; ++i ) {
2387 value1 += x[i] * A(i,j );
2388 value2 += x[i] * A(i,j+1UL);
2398 const size_t ibegin( ( IsLower_v<MT1> )
2401 const size_t iend( ( IsUpper_v<MT1> )
2402 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
2413 SIMDType xmm1( x.load(i) * A.load(i,j) );
2416 xmm1 += A.load(i,j) * x.load(i);
2419 y[j] -=
sum( xmm1 );
2421 for( ; remainder && i<iend; ++i ) {
2422 y[j] -= x[i] * A(i,j);
2429 for( ++i; i<iend; ++i ) {
2430 value += x[i] * A(i,j);
// Large-matrix subtraction assignment kernel, fallback overload.
// The DisableIf_t return type ties this overload to the case where
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> does not hold; it forwards
// unchanged to selectDefaultSubAssignKernel.
2454 template<
typename VT1
2457 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2458 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
2460 selectDefaultSubAssignKernel( y, x, A );
// Vectorized overload of the large subtraction-assignment kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1> is true. The visible loops walk the
// columns of A in blocks of 8, 4 and 2, followed by single-column statements; for each
// column the SIMD products of x and A are reduced with sum() and subtracted from y.
2480 template<
typename VT1
2483 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2484 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1> >
// The scalar clean-up loops below only run if x or A is not padded.
2486 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
2488 const size_t M( A.rows() );
2489 const size_t N( A.columns() );
// Blocks of eight columns.
2493 for( ; (j+8UL) <= N; j+=8UL )
// Row range of the block; for upper matrices it ends at j+8 (j+7 if strictly upper).
2495 const size_t ibegin( ( IsLower_v<MT1> )
2498 const size_t iend( ( IsUpper_v<MT1> )
2499 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
// Step combining four SIMD row chunks (rows i, i1, i2, i3) per column.
2516 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2517 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2518 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2519 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2520 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2521 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2522 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2523 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Step combining two SIMD row chunks (rows i, i1) per column.
2530 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2531 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2532 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2533 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2534 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2535 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2536 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2537 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Step with a single SIMD row chunk per column.
2542 y[j ] -=
sum( x1 * A.load(i,j ) );
2543 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2544 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2545 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2546 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) );
2547 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) );
2548 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) );
2549 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) );
// Scalar clean-up for the rows up to iend that the SIMD steps did not cover.
2552 for( ; remainder && i<iend; ++i ) {
2553 y[j ] -= x[i] * A(i,j );
2554 y[j+1UL] -= x[i] * A(i,j+1UL);
2555 y[j+2UL] -= x[i] * A(i,j+2UL);
2556 y[j+3UL] -= x[i] * A(i,j+3UL);
2557 y[j+4UL] -= x[i] * A(i,j+4UL);
2558 y[j+5UL] -= x[i] * A(i,j+5UL);
2559 y[j+6UL] -= x[i] * A(i,j+6UL);
2560 y[j+7UL] -= x[i] * A(i,j+7UL);
// Blocks of four columns.
2564 for( ; (j+4UL) <= N; j+=4UL )
2566 const size_t ibegin( ( IsLower_v<MT1> )
2569 const size_t iend( ( IsUpper_v<MT1> )
2570 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
2587 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2588 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2589 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2590 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2597 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2598 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2599 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2600 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2605 y[j ] -=
sum( x1 * A.load(i,j ) );
2606 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2607 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2608 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2611 for( ; remainder && i<iend; ++i ) {
2612 y[j ] -= x[i] * A(i,j );
2613 y[j+1UL] -= x[i] * A(i,j+1UL);
2614 y[j+2UL] -= x[i] * A(i,j+2UL);
2615 y[j+3UL] -= x[i] * A(i,j+3UL);
// Blocks of two columns.
2619 for( ; (j+2UL) <= N; j+=2UL )
2621 const size_t ibegin( ( IsLower_v<MT1> )
2624 const size_t iend( ( IsUpper_v<MT1> )
2625 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
2642 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2643 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2650 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2651 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2656 y[j ] -=
sum( x1 * A.load(i,j ) );
2657 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2660 for( ; remainder && i<iend; ++i ) {
2661 y[j ] -= x[i] * A(i,j );
2662 y[j+1UL] -= x[i] * A(i,j+1UL);
// Single-column handling (the enclosing loop/condition line is not visible in this view).
2668 const size_t ibegin( ( IsLower_v<MT1> )
2671 const size_t iend( ( IsUpper_v<MT1> )
2672 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
2689 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
2696 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
2701 y[j] -=
sum( x1 * A.load(i,j) );
2704 for( ; remainder && i<iend; ++i ) {
2705 y[j] -= x[i] * A(i,j);
// Fallback overload of the BLAS subtraction-assignment kernel. It is selected when
// UseBlasKernel_v<VT1,VT2,MT1> is false (DisableIf_t) and delegates to
// selectLargeSubAssignKernel().
2726 template<
typename VT1
2729 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2730 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2732 selectLargeSubAssignKernel( y, x, A );
2738#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based overload of the subtraction-assignment kernel, enabled when
// UseBlasKernel_v<VT1,VT2,MT1> is true.
2752 template<
typename VT1
2755 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2756 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1> >
2758 using ET = ElementType_t<VT1>;
// Triangular matrices: copy x into a temporary (serial evaluation), apply trmv() to the
// temporary with the lower/upper flag derived from MT1, then subtract the result from y.
2760 if( IsTriangular_v<MT1> ) {
2761 ResultType_t<VT1> tmp(
serial( x ) );
2762 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
2763 subAssign( y, tmp );
// General case: gemv() is called with the scaling factors ET(-1) and ET(1).
// NOTE(review): presumably this computes y = -1*(x*A) + 1*y; confirm against the gemv
// wrapper. The `else` that separates this call from the branch above is not visible here.
2766 gemv( y, x, A, ET(-1), ET(1) );
// Multiplication assignment of a TDVecTDMatMultExpr to a dense vector with transpose
// flag true. The visible statement forwards to multAssign( *lhs, tmp ); the definition
// of `tmp` is on lines not visible here (presumably the evaluated result of rhs; confirm).
2790 template<
typename VT1 >
2791 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
2802 multAssign( *lhs, tmp );
// Division assignment of a TDVecTDMatMultExpr to a dense vector with transpose flag
// true. The visible statement forwards to divAssign( *lhs, tmp ); the definition of
// `tmp` is on lines not visible here (presumably the evaluated result of rhs; confirm).
2824 template<
typename VT1 >
2825 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
2836 divAssign( *lhs, tmp );
// Fragments of six overloads that are constrained by EnableIf_t< UseSMPAssign_v<VT1> >.
// NOTE(review): the function declarators are on lines not visible in this view; these
// are presumably the SMP assignment family of the expression. Confirm against the file.
2860 template<
typename VT1 >
2862 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Degenerate sizes: a matrix operand without rows (or with a single row if it is
// strictly triangular), and separately a matrix operand without columns. The statements
// executed for these cases are not visible here.
2868 if( rhs.mat_.rows() == 0UL ||
2869 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
2873 else if( rhs.mat_.columns() == 0UL ) {
// Second constrained overload (body not visible).
2905 template<
typename VT1 >
2907 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Third constrained overload; here the empty-rows, empty-columns and strictly-triangular
// single-row cases share one condition.
2938 template<
typename VT1 >
2940 -> EnableIf_t< UseSMPAssign_v<VT1> >
2946 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2947 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Fourth constrained overload with the same combined degenerate-size condition.
2983 template<
typename VT1 >
2985 -> EnableIf_t< UseSMPAssign_v<VT1> >
2991 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ||
2992 ( IsStrictlyTriangular_v<MT> && rhs.mat_.rows() == 1UL ) ) {
// Fifth constrained overload (body not visible).
3028 template<
typename VT1 >
3030 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Sixth constrained overload (body not visible).
3065 template<
typename VT1 >
3067 -> EnableIf_t< UseSMPAssign_v<VT1> >
// Partial specialization of DVecScalarMultExpr for the case where the vector operand is
// a TDVecTDMatMultExpr<VT,MT>, with scalar type ST and transpose flag true.
// It derives publicly from VecScalarMultExpr< DenseVector<...> > and privately from
// Computation.
3116template<
typename VT
3119class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
3120 :
public VecScalarMultExpr< DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true > >
3121 ,
private Computation
// Type of the wrapped vector/matrix multiplication expression.
3125 using VMM = TDVecTDMatMultExpr<VT,MT>;
// Result type of the wrapped multiplication expression.
3126 using RES = ResultType_t<VMM>;
// Result types of the vector operand (VT) and the matrix operand (MT).
3127 using VRT = ResultType_t<VT>;
3128 using MRT = ResultType_t<MT>;
// Element types of the two operand result types.
3129 using VET = ElementType_t<VRT>;
3130 using MET = ElementType_t<MRT>;
// Composite types of the two operands.
3131 using VCT = CompositeType_t<VT>;
3132 using MCT = CompositeType_t<MT>;
// True if the vector operand is a computation or requires evaluation.
3137 static constexpr bool evaluateVector = ( IsComputation_v<VT> || RequiresEvaluation_v<VT> );
// True if the matrix operand requires evaluation, or if it is a computation whose
// element type equals the vector element type and is BLAS compatible.
3142 static constexpr bool evaluateMatrix =
3143 ( ( IsComputation_v<MT> && IsSame_v<MET,VET> &&
3144 IsBLASCompatible_v<MET> ) || RequiresEvaluation_v<MT> );
// True if the target type T1 is SMP-assignable and at least one of the two operands
// has to be evaluated (evaluateVector or evaluateMatrix).
3151 template<
typename T1 >
3152 static constexpr bool UseSMPAssign_v =
3153 ( T1::smpAssignable && ( evaluateVector || evaluateMatrix ) );
// Compile-time switch for the BLAS kernels, evaluated for the types (T1, T2, T3, T4)
// that the kernels instantiate it with (target vector, vector operand, matrix operand,
// scalar). The first line of the condition is not visible in this view; the visible
// part requires that
//  - T1 is contiguous with mutable data access, T2 and T3 are contiguous with const
//    data access,
//  - T3 is not diagonal,
//  - all three types are SIMD enabled,
//  - all three element types are BLAS compatible and identical,
//  - the combination "builtin element type of T1 with complex T4" is excluded.
3160 template<
typename T1,
typename T2,
typename T3,
typename T4 >
3161 static constexpr bool UseBlasKernel_v =
3163 IsContiguous_v<T1> && HasMutableDataAccess_v<T1> &&
3164 IsContiguous_v<T2> && HasConstDataAccess_v<T2> &&
3165 IsContiguous_v<T3> && HasConstDataAccess_v<T3> &&
3166 !IsDiagonal_v<T3> &&
3167 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3168 IsBLASCompatible_v< ElementType_t<T1> > &&
3169 IsBLASCompatible_v< ElementType_t<T2> > &&
3170 IsBLASCompatible_v< ElementType_t<T3> > &&
3171 IsSame_v< ElementType_t<T1>, ElementType_t<T2> > &&
3172 IsSame_v< ElementType_t<T1>, ElementType_t<T3> > &&
3173 !( IsBuiltin_v< ElementType_t<T1> > && IsComplex_v<T4> ) );
// Compile-time switch for the hand-vectorized default kernels. The visible part of the
// condition requires optimized kernels to be enabled, T3 not to be diagonal, all three
// types T1, T2, T3 to be SIMD enabled, and SIMD addition and multiplication to be
// available for the element types of T2 and T3. The arguments of the
// IsSIMDCombinable_v check continue on lines not visible in this view.
3181 template<
typename T1,
typename T2,
typename T3,
typename T4 >
3182 static constexpr bool UseVectorizedDefaultKernel_v =
3183 ( useOptimizedKernels &&
3184 !IsDiagonal_v<T3> &&
3185 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
3186 IsSIMDCombinable_v< ElementType_t<T1>
3190 HasSIMDAdd_v< ElementType_t<T2>, ElementType_t<T3> > &&
3191 HasSIMDMult_v< ElementType_t<T2>, ElementType_t<T3> > );
// Type of this DVecScalarMultExpr specialization.
3197 using This = DVecScalarMultExpr<VMM,ST,true>;
// Base class type of this expression.
3200 using BaseType = VecScalarMultExpr< DenseVector<This,true> >;
// SIMD type associated with ElementType (ElementType is declared on a line not visible here).
3205 using SIMDType = SIMDTrait_t<ElementType>;
// Type of the left-hand side operand: the wrapped vector/matrix multiplication.
3210 using LeftOperand =
const TDVecTDMatMultExpr<VT,MT>;
// Type used for the vector operand inside the kernels: its result type if it has to be
// evaluated, its composite type otherwise.
3216 using LT = If_t< evaluateVector, const VRT, VCT >;
// Same selection for the matrix operand.
3219 using RT = If_t< evaluateMatrix, const MRT, MCT >;
// NOTE(review): the declarations these two initializers belong to are on lines not
// visible here; presumably they are the simdEnabled and smpAssignable flags. Confirm.
3225 ( !IsDiagonal_v<MT> &&
3226 VT::simdEnabled && MT::simdEnabled &&
3227 IsSIMDCombinable_v<VET,MET,ST> &&
3228 HasSIMDAdd_v<VET,MET> &&
3229 HasSIMDMult_v<VET,MET> );
3233 ( !evaluateVector && VT::smpAssignable && !evaluateMatrix && MT::smpAssignable );
// Fragment of a checked element access (declarator not visible in this view): the
// index is compared against vector_.size() and, if the check passes, the element is
// returned through operator[]. The statement executed for an out-of-range index is
// not visible here.
3273 if( index >=
vector_.size() ) {
3276 return (*
this)[index];
// Const member function returning a size_t; the body is not visible in this view.
3285 inline size_t size()
const {
// Reports whether the expression can alias with the given address.
// The query is delegated to the wrapped vector/matrix multiplication (vector_).
3316 template<
typename T >
3317 inline bool canAlias(
const T* alias )
const {
3318 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the given address.
// The query is delegated to the wrapped vector/matrix multiplication (vector_).
3328 template<
typename T >
3329 inline bool isAliased(
const T* alias )
const {
3330 return vector_.isAliased( alias );
// Fragment of a predicate whose declarator and first condition line are not visible in
// this view. The visible part fetches the matrix operand of the wrapped expression and
// combines "matrix is an unevaluated computation" or "matrix has fewer elements than
// TDVECTDMATMULT_THRESHOLD" with "size() exceeds SMP_TDVECTDMATMULT_THRESHOLD".
3350 RightOperand_t<VMM> A(
vector_.rightOperand() );
3354 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3355 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
3356 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
// Fragment of the assignment of the scaled multiplication to a target vector (the
// declarator is not visible in this view).
3378 template<
typename VT1
// Fetch both operands of the wrapped vector/matrix multiplication.
3386 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
3387 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
// Degenerate sizes: no rows (or one row for a strictly triangular matrix), and
// separately no columns. The statements executed for these cases are not visible here.
3389 if( right.rows() == 0UL ||
3390 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
3394 else if( right.columns() == 0UL ) {
// Regular case: dispatch to the kernel selection with the scalar of the expression.
// x and A are defined on lines not visible here.
3406 DVecScalarMultExpr::selectAssignKernel( *lhs, x, A, rhs.scalar_ );
// Chooses the kernel for the scaled assignment. The small kernel is called if the
// matrix type is diagonal, if the matrix operand is a computation that is not
// pre-evaluated, or if the matrix has fewer elements than TDVECTDMATMULT_THRESHOLD.
// The second call goes to the BLAS kernel selection; the `else` separating the two
// calls is on a line not visible in this view.
3421 template<
typename VT1
3425 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3427 if( ( IsDiagonal_v<MT1> ) ||
3428 ( IsComputation_v<MT> && !evaluateMatrix ) ||
3429 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3430 selectSmallAssignKernel( y, x, A, scalar );
3432 selectBlasAssignKernel( y, x, A, scalar );
// Default kernel for the scaled assignment: assigns the expression x * A * scalar to y
// through y.assign().
3450 template<
typename VT1
3454 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3456 y.assign( x * A * scalar );
// Fallback overload of the small assignment kernel. It is selected when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t) and delegates to
// selectDefaultAssignKernel().
3474 template<
typename VT1
3478 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3479 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3481 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized overload of the small assignment kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is true. The visible loops walk the
// columns of A in blocks of 8, 4, 3 and 2, followed by single-column statements. Per
// column a SIMD accumulator (xmm*) collects x * A, its sum() times scalar is stored in
// y, and a scalar clean-up loop adds the remaining rows. Each block also contains a
// purely scalar accumulation (value*); the condition selecting between the two paths is
// on lines not visible in this view.
3500 template<
typename VT1
3504 static inline auto selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3505 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
// The scalar clean-up loops below only run if x or A is not padded.
3507 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3509 const size_t M( A.rows() );
3510 const size_t N( A.columns() );
// Blocks of eight columns.
3514 for( ; (j+8UL) <= N; j+=8UL )
// Row range of the block; for upper matrices it ends at j+8 (j+7 if strictly upper).
3516 const size_t ibegin( ( IsLower_v<MT1> )
3519 const size_t iend( ( IsUpper_v<MT1> )
3520 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
// The accumulators are initialized from the first SIMD chunk of rows.
3531 SIMDType x1( x.load(i) );
3532 SIMDType xmm1( x1 * A.load(i,j ) );
3533 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3534 SIMDType xmm3( x1 * A.load(i,j+2UL) );
3535 SIMDType xmm4( x1 * A.load(i,j+3UL) );
3536 SIMDType xmm5( x1 * A.load(i,j+4UL) );
3537 SIMDType xmm6( x1 * A.load(i,j+5UL) );
3538 SIMDType xmm7( x1 * A.load(i,j+6UL) );
3539 SIMDType xmm8( x1 * A.load(i,j+7UL) );
// Accumulation of further chunks (the loop header is not visible in this view).
3543 xmm1 += x1 * A.load(i,j );
3544 xmm2 += x1 * A.load(i,j+1UL);
3545 xmm3 += x1 * A.load(i,j+2UL);
3546 xmm4 += x1 * A.load(i,j+3UL);
3547 xmm5 += x1 * A.load(i,j+4UL);
3548 xmm6 += x1 * A.load(i,j+5UL);
3549 xmm7 += x1 * A.load(i,j+6UL);
3550 xmm8 += x1 * A.load(i,j+7UL);
// Horizontal reduction and scaling; this overwrites y.
3553 y[j ] =
sum( xmm1 ) * scalar;
3554 y[j+1UL] =
sum( xmm2 ) * scalar;
3555 y[j+2UL] =
sum( xmm3 ) * scalar;
3556 y[j+3UL] =
sum( xmm4 ) * scalar;
3557 y[j+4UL] =
sum( xmm5 ) * scalar;
3558 y[j+5UL] =
sum( xmm6 ) * scalar;
3559 y[j+6UL] =
sum( xmm7 ) * scalar;
3560 y[j+7UL] =
sum( xmm8 ) * scalar;
// Scalar clean-up: the remaining rows are added on top of the stored values.
3562 for( ; remainder && i<iend; ++i ) {
3563 y[j ] += x[i] * A(i,j ) * scalar;
3564 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3565 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3566 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3567 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3568 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3569 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3570 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// Purely scalar path: accumulate in value1..value8, scale once when storing.
3584 for( ++i; i<iend; ++i ) {
3585 value1 += x[i] * A(i,j );
3586 value2 += x[i] * A(i,j+1UL);
3587 value3 += x[i] * A(i,j+2UL);
3588 value4 += x[i] * A(i,j+3UL);
3589 value5 += x[i] * A(i,j+4UL);
3590 value6 += x[i] * A(i,j+5UL);
3591 value7 += x[i] * A(i,j+6UL);
3592 value8 += x[i] * A(i,j+7UL);
3595 y[j ] = value1 * scalar;
3596 y[j+1UL] = value2 * scalar;
3597 y[j+2UL] = value3 * scalar;
3598 y[j+3UL] = value4 * scalar;
3599 y[j+4UL] = value5 * scalar;
3600 y[j+5UL] = value6 * scalar;
3601 y[j+6UL] = value7 * scalar;
3602 y[j+7UL] = value8 * scalar;
// Blocks of four columns.
3606 for( ; (j+4UL) <= N; j+=4UL )
3608 const size_t ibegin( ( IsLower_v<MT1> )
3611 const size_t iend( ( IsUpper_v<MT1> )
3612 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3623 SIMDType x1( x.load(i) );
3624 SIMDType xmm1( x1 * A.load(i,j ) );
3625 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3626 SIMDType xmm3( x1 * A.load(i,j+2UL) );
3627 SIMDType xmm4( x1 * A.load(i,j+3UL) );
3631 xmm1 += x1 * A.load(i,j );
3632 xmm2 += x1 * A.load(i,j+1UL);
3633 xmm3 += x1 * A.load(i,j+2UL);
3634 xmm4 += x1 * A.load(i,j+3UL);
3637 y[j ] =
sum( xmm1 ) * scalar;
3638 y[j+1UL] =
sum( xmm2 ) * scalar;
3639 y[j+2UL] =
sum( xmm3 ) * scalar;
3640 y[j+3UL] =
sum( xmm4 ) * scalar;
3642 for( ; remainder && i<iend; ++i ) {
3643 y[j ] += x[i] * A(i,j ) * scalar;
3644 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3645 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3646 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3656 for( ++i; i<iend; ++i ) {
3657 value1 += x[i] * A(i,j );
3658 value2 += x[i] * A(i,j+1UL);
3659 value3 += x[i] * A(i,j+2UL);
3660 value4 += x[i] * A(i,j+3UL);
3663 y[j ] = value1 * scalar;
3664 y[j+1UL] = value2 * scalar;
3665 y[j+2UL] = value3 * scalar;
3666 y[j+3UL] = value4 * scalar;
// Blocks of three columns.
3670 for( ; (j+3UL) <= N; j+=3UL )
3672 const size_t ibegin( ( IsLower_v<MT1> )
3675 const size_t iend( ( IsUpper_v<MT1> )
3676 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
3687 SIMDType x1( x.load(i) );
3688 SIMDType xmm1( x1 * A.load(i,j ) );
3689 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3690 SIMDType xmm3( x1 * A.load(i,j+2UL) );
3694 xmm1 += x1 * A.load(i,j );
3695 xmm2 += x1 * A.load(i,j+1UL);
3696 xmm3 += x1 * A.load(i,j+2UL);
3699 y[j ] =
sum( xmm1 ) * scalar;
3700 y[j+1UL] =
sum( xmm2 ) * scalar;
3701 y[j+2UL] =
sum( xmm3 ) * scalar;
3703 for( ; remainder && i<iend; ++i ) {
3704 y[j ] += x[i] * A(i,j ) * scalar;
3705 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3706 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3715 for( ++i; i<iend; ++i ) {
3716 value1 += x[i] * A(i,j );
3717 value2 += x[i] * A(i,j+1UL);
3718 value3 += x[i] * A(i,j+2UL);
3721 y[j ] = value1 * scalar;
3722 y[j+1UL] = value2 * scalar;
3723 y[j+2UL] = value3 * scalar;
// Blocks of two columns.
3727 for( ; (j+2UL) <= N; j+=2UL )
3729 const size_t ibegin( ( IsLower_v<MT1> )
3732 const size_t iend( ( IsUpper_v<MT1> )
3733 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
3744 SIMDType x1( x.load(i) );
3745 SIMDType xmm1( x1 * A.load(i,j ) );
3746 SIMDType xmm2( x1 * A.load(i,j+1UL) );
3750 xmm1 += x1 * A.load(i,j );
3751 xmm2 += x1 * A.load(i,j+1UL);
3754 y[j ] =
sum( xmm1 ) * scalar;
3755 y[j+1UL] =
sum( xmm2 ) * scalar;
3757 for( ; remainder && i<iend; ++i ) {
3758 y[j ] += x[i] * A(i,j ) * scalar;
3759 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3767 for( ++i; i<iend; ++i ) {
3768 value1 += x[i] * A(i,j );
3769 value2 += x[i] * A(i,j+1UL);
3772 y[j ] = value1 * scalar;
3773 y[j+1UL] = value2 * scalar;
// Single-column handling (the enclosing loop/condition line is not visible in this view).
3779 const size_t ibegin( ( IsLower_v<MT1> )
3782 const size_t iend( ( IsUpper_v<MT1> )
3783 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
3794 SIMDType xmm1( x.load(i) * A.load(i,j ) );
3797 xmm1 += A.load(i,j) * x.load(i);
3800 y[j] =
sum( xmm1 ) * scalar;
3802 for( ; remainder && i<iend; ++i ) {
3803 y[j] += x[i] * A(i,j) * scalar;
3810 for( ++i; i<iend; ++i ) {
3811 value += x[i] * A(i,j);
3814 y[j] = value * scalar;
// Fallback overload of the large assignment kernel. It is selected when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t) and delegates to
// selectDefaultAssignKernel().
3834 template<
typename VT1
3838 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3839 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
3841 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized overload of the large assignment kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is true. The visible loops walk the
// columns of A in blocks of 8, 4 and 2, followed by single-column statements; per step
// four, two or one SIMD row chunks of x * A are reduced with sum() and added to y.
// NOTE(review): the visible statements accumulate unscaled products into y with `+=`;
// the initialization of y and the place where `scalar` is applied are on lines not
// visible in this view.
3860 template<
typename VT1
3864 static inline auto selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3865 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
// The scalar clean-up loops below only run if x or A is not padded.
3867 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
3869 const size_t M( A.rows() );
3870 const size_t N( A.columns() );
// Blocks of eight columns.
3876 for( ; (j+8UL) <= N; j+=8UL )
// Row range of the block; for upper matrices it ends at j+8 (j+7 if strictly upper).
3878 const size_t ibegin( ( IsLower_v<MT1> )
3881 const size_t iend( ( IsUpper_v<MT1> )
3882 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
// Step combining four SIMD row chunks (rows i, i1, i2, i3) per column.
3895 const SIMDType x1( x.load(i ) );
3896 const SIMDType x2( x.load(i1) );
3897 const SIMDType x3( x.load(i2) );
3898 const SIMDType x4( x.load(i3) );
3899 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3900 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3901 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3902 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3903 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3904 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3905 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3906 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Step combining two SIMD row chunks (rows i, i1) per column.
3911 const SIMDType x1( x.load(i ) );
3912 const SIMDType x2( x.load(i1) );
3913 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3914 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3915 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3916 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3917 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3918 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3919 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3920 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Step with a single SIMD row chunk per column.
3924 const SIMDType x1( x.load(i) );
3925 y[j ] +=
sum( x1 * A.load(i,j ) );
3926 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3927 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3928 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3929 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
3930 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
3931 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
3932 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar clean-up for the rows up to iend that the SIMD steps did not cover.
3935 for( ; remainder && i<iend; ++i ) {
3936 y[j ] += x[i] * A(i,j );
3937 y[j+1UL] += x[i] * A(i,j+1UL);
3938 y[j+2UL] += x[i] * A(i,j+2UL);
3939 y[j+3UL] += x[i] * A(i,j+3UL);
3940 y[j+4UL] += x[i] * A(i,j+4UL);
3941 y[j+5UL] += x[i] * A(i,j+5UL);
3942 y[j+6UL] += x[i] * A(i,j+6UL);
3943 y[j+7UL] += x[i] * A(i,j+7UL);
// Blocks of four columns.
3956 for( ; (j+4UL) <= N; j+=4UL )
3958 const size_t ibegin( ( IsLower_v<MT1> )
3961 const size_t iend( ( IsUpper_v<MT1> )
3962 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
3975 const SIMDType x1( x.load(i ) );
3976 const SIMDType x2( x.load(i1) );
3977 const SIMDType x3( x.load(i2) );
3978 const SIMDType x4( x.load(i3) );
3979 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3980 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3981 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3982 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3987 const SIMDType x1( x.load(i ) );
3988 const SIMDType x2( x.load(i1) );
3989 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3990 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3991 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3992 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3996 const SIMDType x1( x.load(i) );
3997 y[j ] +=
sum( x1 * A.load(i,j ) );
3998 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3999 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
4000 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
4003 for( ; remainder && i<iend; ++i ) {
4004 y[j ] += x[i] * A(i,j );
4005 y[j+1UL] += x[i] * A(i,j+1UL);
4006 y[j+2UL] += x[i] * A(i,j+2UL);
4007 y[j+3UL] += x[i] * A(i,j+3UL);
// Blocks of two columns.
4016 for( ; (j+2UL) <= N; j+=2UL )
4018 const size_t ibegin( ( IsLower_v<MT1> )
4021 const size_t iend( ( IsUpper_v<MT1> )
4022 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4035 const SIMDType x1( x.load(i ) );
4036 const SIMDType x2( x.load(i1) );
4037 const SIMDType x3( x.load(i2) );
4038 const SIMDType x4( x.load(i3) );
4039 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
4040 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
4045 const SIMDType x1( x.load(i ) );
4046 const SIMDType x2( x.load(i1) );
4047 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
4048 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
4052 const SIMDType x1( x.load(i) );
4053 y[j ] +=
sum( x1 * A.load(i,j ) );
4054 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
4057 for( ; remainder && i<iend; ++i ) {
4058 y[j ] += x[i] * A(i,j );
4059 y[j+1UL] += x[i] * A(i,j+1UL);
// Single-column handling (the enclosing loop/condition line is not visible in this view).
4068 const size_t ibegin( ( IsLower_v<MT1> )
4071 const size_t iend( ( IsUpper_v<MT1> )
4072 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4085 const SIMDType x1( x.load(i ) );
4086 const SIMDType x2( x.load(i1) );
4087 const SIMDType x3( x.load(i2) );
4088 const SIMDType x4( x.load(i3) );
4089 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
4094 const SIMDType x1( x.load(i ) );
4095 const SIMDType x2( x.load(i1) );
4096 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
4100 const SIMDType x1( x.load(i) );
4101 y[j] +=
sum( x1 * A.load(i,j) );
4104 for( ; remainder && i<iend; ++i ) {
4105 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS assignment kernel. It is selected when
// UseBlasKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t) and delegates to
// selectLargeAssignKernel().
4126 template<
typename VT1
4130 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4131 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4133 selectLargeAssignKernel( y, x, A, scalar );
4138#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based overload of the scaled assignment kernel, enabled when
// UseBlasKernel_v<VT1,VT2,MT1,ST2> is true.
4152 template<
typename VT1
4156 static inline auto selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4157 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
4159 using ET = ElementType_t<VT1>;
// Triangular matrices: y is first set to scalar * x and then passed to trmv() together
// with A and the lower/upper flag derived from MT1.
4161 if( IsTriangular_v<MT1> ) {
4162 assign( y, scalar * x );
4163 trmv( y, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
// General case: gemv() is called with the factors ET(scalar) and ET(0).
// NOTE(review): presumably this computes y = scalar*(x*A) + 0*y; confirm against the
// gemv wrapper. The `else` that separates this call from the branch above is not visible here.
4166 gemv( y, x, A,
ET(scalar),
ET(0) );
// Assignment of the scaled multiplication to a sparse vector. The visible statement
// forwards to assign( *lhs, tmp ); the definition of `tmp` is on lines not visible
// here (presumably the evaluated result of rhs; confirm).
4184 template<
typename VT1
4186 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
4197 assign( *lhs, tmp );
// Addition assignment of the scaled multiplication to a dense vector.
4213 template<
typename VT1
4215 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch both operands of the wrapped vector/matrix multiplication.
4221 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
4222 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
// Degenerate sizes: no rows, no columns, or a single row for a strictly triangular
// matrix. The statement executed for these cases is not visible in this view.
4224 if( right.rows() == 0UL || right.columns() == 0UL ||
4225 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
// Regular case: dispatch to the kernel selection with the scalar of the expression.
// x and A are defined on lines not visible here.
4237 DVecScalarMultExpr::selectAddAssignKernel( *lhs, x, A, rhs.scalar_ );
// Chooses the kernel for the scaled addition assignment. The small kernel is called if
// the matrix type is diagonal, if the matrix operand is a computation that is not
// pre-evaluated, or if the matrix has fewer elements than the size threshold. The
// second call goes to the BLAS kernel selection; the `else` separating the two calls is
// on a line not visible in this view.
4252 template<
typename VT1
4256 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4258 if( ( IsDiagonal_v<MT1> ) ||
4259 ( IsComputation_v<MT> && !evaluateMatrix ) ||
// Use TDVECTDMATMULT_THRESHOLD, the threshold that selectAssignKernel() of this class
// compares against, instead of the TDVECDMATMULT_THRESHOLD constant used previously.
4260 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
4261 selectSmallAddAssignKernel( y, x, A, scalar );
4263 selectBlasAddAssignKernel( y, x, A, scalar );
// Default kernel for the scaled addition assignment: adds the expression
// x * A * scalar to y through y.addAssign().
4281 template<
typename VT1
4285 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4287 y.addAssign( x * A * scalar );
// Fallback overload of the small addition-assignment kernel. It is selected when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t) and delegates to
// selectDefaultAddAssignKernel().
4305 template<
typename VT1
4309 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4310 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4312 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized overload of the small addition-assignment kernel, enabled when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is true. The visible loops walk the
// columns of A in blocks of 8, 4, 3 and 2, followed by single-column statements. Per
// column a SIMD accumulator (xmm*) collects x * A, its sum() times scalar is added to
// y, and a scalar clean-up loop adds the remaining rows. Each block also contains a
// purely scalar accumulation (value*); the condition selecting between the two paths is
// on lines not visible in this view.
4331 template<
typename VT1
4335 static inline auto selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4336 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
// The scalar clean-up loops below only run if x or A is not padded.
4338 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4340 const size_t M( A.rows() );
4341 const size_t N( A.columns() );
// Blocks of eight columns.
4345 for( ; (j+8UL) <= N; j+=8UL )
// Row range of the block; for upper matrices it ends at j+8 (j+7 if strictly upper).
4347 const size_t ibegin( ( IsLower_v<MT1> )
4350 const size_t iend( ( IsUpper_v<MT1> )
4351 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
// The accumulators are initialized from the first SIMD chunk of rows.
4362 SIMDType x1( x.load(i) );
4363 SIMDType xmm1( x1 * A.load(i,j ) );
4364 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4365 SIMDType xmm3( x1 * A.load(i,j+2UL) );
4366 SIMDType xmm4( x1 * A.load(i,j+3UL) );
4367 SIMDType xmm5( x1 * A.load(i,j+4UL) );
4368 SIMDType xmm6( x1 * A.load(i,j+5UL) );
4369 SIMDType xmm7( x1 * A.load(i,j+6UL) );
4370 SIMDType xmm8( x1 * A.load(i,j+7UL) );
// Accumulation of further chunks (the loop header is not visible in this view).
4374 xmm1 += x1 * A.load(i,j );
4375 xmm2 += x1 * A.load(i,j+1UL);
4376 xmm3 += x1 * A.load(i,j+2UL);
4377 xmm4 += x1 * A.load(i,j+3UL);
4378 xmm5 += x1 * A.load(i,j+4UL);
4379 xmm6 += x1 * A.load(i,j+5UL);
4380 xmm7 += x1 * A.load(i,j+6UL);
4381 xmm8 += x1 * A.load(i,j+7UL);
// Horizontal reduction and scaling; the result is added to the existing y.
4384 y[j ] +=
sum( xmm1 ) * scalar;
4385 y[j+1UL] +=
sum( xmm2 ) * scalar;
4386 y[j+2UL] +=
sum( xmm3 ) * scalar;
4387 y[j+3UL] +=
sum( xmm4 ) * scalar;
4388 y[j+4UL] +=
sum( xmm5 ) * scalar;
4389 y[j+5UL] +=
sum( xmm6 ) * scalar;
4390 y[j+6UL] +=
sum( xmm7 ) * scalar;
4391 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Scalar clean-up for the rows up to iend that the SIMD steps did not cover.
4393 for( ; remainder && i<iend; ++i ) {
4394 y[j ] += x[i] * A(i,j ) * scalar;
4395 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4396 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4397 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4398 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4399 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4400 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4401 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// Purely scalar path: accumulate in value1..value8, scale once when adding to y.
4415 for( ++i; i<iend; ++i ) {
4416 value1 += x[i] * A(i,j );
4417 value2 += x[i] * A(i,j+1UL);
4418 value3 += x[i] * A(i,j+2UL);
4419 value4 += x[i] * A(i,j+3UL);
4420 value5 += x[i] * A(i,j+4UL);
4421 value6 += x[i] * A(i,j+5UL);
4422 value7 += x[i] * A(i,j+6UL);
4423 value8 += x[i] * A(i,j+7UL);
4426 y[j ] += value1 * scalar;
4427 y[j+1UL] += value2 * scalar;
4428 y[j+2UL] += value3 * scalar;
4429 y[j+3UL] += value4 * scalar;
4430 y[j+4UL] += value5 * scalar;
4431 y[j+5UL] += value6 * scalar;
4432 y[j+6UL] += value7 * scalar;
4433 y[j+7UL] += value8 * scalar;
// Blocks of four columns.
4437 for( ; (j+4UL) <= N; j+=4UL )
4439 const size_t ibegin( ( IsLower_v<MT1> )
4442 const size_t iend( ( IsUpper_v<MT1> )
4443 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4454 SIMDType x1( x.load(i) );
4455 SIMDType xmm1( x1 * A.load(i,j ) );
4456 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4457 SIMDType xmm3( x1 * A.load(i,j+2UL) );
4458 SIMDType xmm4( x1 * A.load(i,j+3UL) );
4462 xmm1 += x1 * A.load(i,j );
4463 xmm2 += x1 * A.load(i,j+1UL);
4464 xmm3 += x1 * A.load(i,j+2UL);
4465 xmm4 += x1 * A.load(i,j+3UL);
4468 y[j ] +=
sum( xmm1 ) * scalar;
4469 y[j+1UL] +=
sum( xmm2 ) * scalar;
4470 y[j+2UL] +=
sum( xmm3 ) * scalar;
4471 y[j+3UL] +=
sum( xmm4 ) * scalar;
4473 for( ; remainder && i<iend; ++i ) {
4474 y[j ] += x[i] * A(i,j ) * scalar;
4475 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4476 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4477 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4487 for( ++i; i<iend; ++i ) {
4488 value1 += x[i] * A(i,j );
4489 value2 += x[i] * A(i,j+1UL);
4490 value3 += x[i] * A(i,j+2UL);
4491 value4 += x[i] * A(i,j+3UL);
4494 y[j ] += value1 * scalar;
4495 y[j+1UL] += value2 * scalar;
4496 y[j+2UL] += value3 * scalar;
4497 y[j+3UL] += value4 * scalar;
// Blocks of three columns.
4501 for( ; (j+3UL) <= N; j+=3UL )
4503 const size_t ibegin( ( IsLower_v<MT1> )
4506 const size_t iend( ( IsUpper_v<MT1> )
4507 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
4518 SIMDType x1( x.load(i) );
4519 SIMDType xmm1( x1 * A.load(i,j ) );
4520 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4521 SIMDType xmm3( x1 * A.load(i,j+2UL) );
4525 xmm1 += x1 * A.load(i,j );
4526 xmm2 += x1 * A.load(i,j+1UL);
4527 xmm3 += x1 * A.load(i,j+2UL);
4530 y[j ] +=
sum( xmm1 ) * scalar;
4531 y[j+1UL] +=
sum( xmm2 ) * scalar;
4532 y[j+2UL] +=
sum( xmm3 ) * scalar;
4534 for( ; remainder && i<iend; ++i ) {
4535 y[j ] += x[i] * A(i,j ) * scalar;
4536 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4537 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4546 for( ++i; i<iend; ++i ) {
4547 value1 += x[i] * A(i,j );
4548 value2 += x[i] * A(i,j+1UL);
4549 value3 += x[i] * A(i,j+2UL);
4552 y[j ] += value1 * scalar;
4553 y[j+1UL] += value2 * scalar;
4554 y[j+2UL] += value3 * scalar;
// Blocks of two columns.
4558 for( ; (j+2UL) <= N; j+=2UL )
4560 const size_t ibegin( ( IsLower_v<MT1> )
4563 const size_t iend( ( IsUpper_v<MT1> )
4564 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4575 SIMDType x1( x.load(i) );
4576 SIMDType xmm1( x1 * A.load(i,j ) );
4577 SIMDType xmm2( x1 * A.load(i,j+1UL) );
4581 xmm1 += x1 * A.load(i,j );
4582 xmm2 += x1 * A.load(i,j+1UL);
4585 y[j ] +=
sum( xmm1 ) * scalar;
4586 y[j+1UL] +=
sum( xmm2 ) * scalar;
4588 for( ; remainder && i<iend; ++i ) {
4589 y[j ] += x[i] * A(i,j ) * scalar;
4590 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4598 for( ++i; i<iend; ++i ) {
4599 value1 += x[i] * A(i,j );
4600 value2 += x[i] * A(i,j+1UL);
4603 y[j ] += value1 * scalar;
4604 y[j+1UL] += value2 * scalar;
// Single-column handling (the enclosing loop/condition line is not visible in this view).
4610 const size_t ibegin( ( IsLower_v<MT1> )
4613 const size_t iend( ( IsUpper_v<MT1> )
4614 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4625 SIMDType xmm1( x.load(i) * A.load(i,j) );
4628 xmm1 += A.load(i,j) * x.load(i);
4631 y[j] +=
sum( xmm1 ) * scalar;
4633 for( ; remainder && i<iend; ++i ) {
4634 y[j] += x[i] * A(i,j) * scalar;
4641 for( ++i; i<iend; ++i ) {
4642 value += x[i] * A(i,j);
4645 y[j] += value * scalar;
// Fallback overload of the large addition-assignment kernel. It is selected when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t) and delegates to
// selectDefaultAddAssignKernel().
4665 template<
typename VT1
4669 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4670 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4672 selectDefaultAddAssignKernel( y, x, A, scalar );
4691 template<
typename VT1
4695 static inline auto selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4696 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
4698 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
4700 const size_t M( A.rows() );
4701 const size_t N( A.columns() );
4705 for( ; (j+8UL) <= N; j+=8UL )
4707 const size_t ibegin( ( IsLower_v<MT1> )
4710 const size_t iend( ( IsUpper_v<MT1> )
4711 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
4724 const SIMDType x1( x.load(i ) );
4725 const SIMDType x2( x.load(i1) );
4726 const SIMDType x3( x.load(i2) );
4727 const SIMDType x4( x.load(i3) );
4728 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4729 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4730 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4731 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4732 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4733 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4734 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4735 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4740 const SIMDType x1( x.load(i ) );
4741 const SIMDType x2( x.load(i1) );
4742 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4743 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4744 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4745 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4746 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4747 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4748 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4749 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4753 const SIMDType x1( x.load(i) );
4754 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4755 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4756 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4757 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4758 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4759 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4760 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4761 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4764 for( ; remainder && i<iend; ++i ) {
4765 y[j ] += x[i] * A(i,j ) * scalar;
4766 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4767 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4768 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4769 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4770 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4771 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4772 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4776 for( ; (j+4UL) <= N; j+=4UL )
4778 const size_t ibegin( ( IsLower_v<MT1> )
4781 const size_t iend( ( IsUpper_v<MT1> )
4782 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
4795 const SIMDType x1( x.load(i ) );
4796 const SIMDType x2( x.load(i1) );
4797 const SIMDType x3( x.load(i2) );
4798 const SIMDType x4( x.load(i3) );
4799 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4800 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4801 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4802 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4807 const SIMDType x1( x.load(i ) );
4808 const SIMDType x2( x.load(i1) );
4809 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4810 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4811 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4812 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4816 const SIMDType x1( x.load(i) );
4817 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4818 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4819 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4820 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4823 for( ; remainder && i<iend; ++i ) {
4824 y[j ] += x[i] * A(i,j ) * scalar;
4825 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4826 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4827 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4831 for( ; (j+2UL) <= N; j+=2UL )
4833 const size_t ibegin( ( IsLower_v<MT1> )
4836 const size_t iend( ( IsUpper_v<MT1> )
4837 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
4850 const SIMDType x1( x.load(i ) );
4851 const SIMDType x2( x.load(i1) );
4852 const SIMDType x3( x.load(i2) );
4853 const SIMDType x4( x.load(i3) );
4854 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4855 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4860 const SIMDType x1( x.load(i ) );
4861 const SIMDType x2( x.load(i1) );
4862 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4863 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4867 const SIMDType x1( x.load(i) );
4868 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4869 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4872 for( ; remainder && i<iend; ++i ) {
4873 y[j ] += x[i] * A(i,j ) * scalar;
4874 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4880 const size_t ibegin( ( IsLower_v<MT1> )
4883 const size_t iend( ( IsUpper_v<MT1> )
4884 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
4897 const SIMDType x1( x.load(i ) );
4898 const SIMDType x2( x.load(i1) );
4899 const SIMDType x3( x.load(i2) );
4900 const SIMDType x4( x.load(i3) );
4901 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4906 const SIMDType x1( x.load(i ) );
4907 const SIMDType x2( x.load(i1) );
4908 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4912 const SIMDType x1( x.load(i) );
4913 y[j] +=
sum( x1 * A.load(i,j) ) * scalar;
4916 for( ; remainder && i<iend; ++i ) {
4917 y[j] += x[i] * A(i,j) * scalar;
// BLAS-selector overload for add-assign that is active only when
// UseBlasKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t return type).
//   y      - target vector that is accumulated into
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
4938 template<
typename VT1
4942 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4943 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
// BLAS is not usable for these types: continue with the large-kernel selector.
4945 selectLargeAddAssignKernel( y, x, A, scalar );
4950#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-backed add-assign overload, enabled when
// UseBlasKernel_v<VT1,VT2,MT1,ST2> is true (EnableIf_t return type).
//   y      - target vector that is accumulated into
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
4964 template<
typename VT1
4968 static inline auto selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4969 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
// Element type of the target; the scalar arguments of gemv are converted to it.
4971 using ET = ElementType_t<VT1>;
// Triangular A: build a temporary from the serially evaluated scalar * x,
// pass it to trmv with the matching lower/upper flag, then add it to y.
// NOTE(review): presumably trmv overwrites tmp with the triangular product -
// confirm against the Blaze trmv wrapper.
4973 if( IsTriangular_v<MT1> ) {
4974 ResultType_t<VT1> tmp(
serial( scalar * x ) );
4975 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
4976 addAssign( y, tmp );
// Non-triangular case (the branch keyword itself is not part of this excerpt):
// a single gemv call with ET(scalar) and ET(1) as the two scalar arguments.
// NOTE(review): these look like the BLAS alpha and beta factors, i.e. the
// product scaled by scalar is added to the existing y - confirm.
4979 gemv( y, x, A,
ET(scalar),
ET(1) );
5001 template<
typename VT1
5003 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5009 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5010 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5012 if( right.rows() == 0UL || right.columns() == 0UL ||
5013 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5025 DVecScalarMultExpr::selectSubAssignKernel( *lhs, x, A, rhs.scalar_ );
// Entry point that picks the subtraction-assignment kernel for the scaled
// vector/matrix product.
//   y      - target vector that is subtracted from
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
5040 template<
typename VT1
5044 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// The small kernel is taken when any of the following holds:
//  - MT1 is a diagonal matrix type,
//  - MT is a computation expression and evaluateMatrix is false,
//  - the element count rows*columns is below TDVECDMATMULT_THRESHOLD.
5046 if( ( IsDiagonal_v<MT1> ) ||
5047 ( IsComputation_v<MT> && !evaluateMatrix ) ||
5048 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
5049 selectSmallSubAssignKernel( y, x, A, scalar );
// Every other case goes through the BLAS selector (the branch keyword is not
// part of this excerpt).
5051 selectBlasSubAssignKernel( y, x, A, scalar );
// Default (non-specialized) subtraction-assignment kernel for the scaled
// vector/matrix product.
//   y      - target vector that is subtracted from
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
5069 template<
typename VT1
5073 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Builds the expression x * A * scalar and lets y's own subAssign() consume it.
5075 y.subAssign( x * A * scalar );
// Small-kernel subtraction-assignment overload that is active only when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t return type).
//   y      - target vector that is subtracted from
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
5093 template<
typename VT1
5097 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5098 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
// No vectorized implementation applies: forward to the default kernel.
5100 selectDefaultSubAssignKernel( y, x, A, scalar );
5119 template<
typename VT1
5123 static inline auto selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5124 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
5126 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
5128 const size_t M( A.rows() );
5129 const size_t N( A.columns() );
5133 for( ; (j+8UL) <= N; j+=8UL )
5135 const size_t ibegin( ( IsLower_v<MT1> )
5138 const size_t iend( ( IsUpper_v<MT1> )
5139 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
5150 SIMDType x1( x.load(i) );
5151 SIMDType xmm1( x1 * A.load(i,j ) );
5152 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5153 SIMDType xmm3( x1 * A.load(i,j+2UL) );
5154 SIMDType xmm4( x1 * A.load(i,j+3UL) );
5155 SIMDType xmm5( x1 * A.load(i,j+4UL) );
5156 SIMDType xmm6( x1 * A.load(i,j+5UL) );
5157 SIMDType xmm7( x1 * A.load(i,j+6UL) );
5158 SIMDType xmm8( x1 * A.load(i,j+7UL) );
5162 xmm1 += x1 * A.load(i,j );
5163 xmm2 += x1 * A.load(i,j+1UL);
5164 xmm3 += x1 * A.load(i,j+2UL);
5165 xmm4 += x1 * A.load(i,j+3UL);
5166 xmm5 += x1 * A.load(i,j+4UL);
5167 xmm6 += x1 * A.load(i,j+5UL);
5168 xmm7 += x1 * A.load(i,j+6UL);
5169 xmm8 += x1 * A.load(i,j+7UL);
5172 y[j ] -=
sum( xmm1 ) * scalar;
5173 y[j+1UL] -=
sum( xmm2 ) * scalar;
5174 y[j+2UL] -=
sum( xmm3 ) * scalar;
5175 y[j+3UL] -=
sum( xmm4 ) * scalar;
5176 y[j+4UL] -=
sum( xmm5 ) * scalar;
5177 y[j+5UL] -=
sum( xmm6 ) * scalar;
5178 y[j+6UL] -=
sum( xmm7 ) * scalar;
5179 y[j+7UL] -=
sum( xmm8 ) * scalar;
5181 for( ; remainder && i<iend; ++i ) {
5182 y[j ] -= x[i] * A(i,j ) * scalar;
5183 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5184 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5185 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5186 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
5187 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
5188 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
5189 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
5203 for( ++i; i<iend; ++i ) {
5204 value1 += x[i] * A(i,j );
5205 value2 += x[i] * A(i,j+1UL);
5206 value3 += x[i] * A(i,j+2UL);
5207 value4 += x[i] * A(i,j+3UL);
5208 value5 += x[i] * A(i,j+4UL);
5209 value6 += x[i] * A(i,j+5UL);
5210 value7 += x[i] * A(i,j+6UL);
5211 value8 += x[i] * A(i,j+7UL);
5214 y[j ] -= value1 * scalar;
5215 y[j+1UL] -= value2 * scalar;
5216 y[j+2UL] -= value3 * scalar;
5217 y[j+3UL] -= value4 * scalar;
5218 y[j+4UL] -= value5 * scalar;
5219 y[j+5UL] -= value6 * scalar;
5220 y[j+6UL] -= value7 * scalar;
5221 y[j+7UL] -= value8 * scalar;
5225 for( ; (j+4UL) <= N; j+=4UL )
5227 const size_t ibegin( ( IsLower_v<MT1> )
5230 const size_t iend( ( IsUpper_v<MT1> )
5231 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
5242 SIMDType x1( x.load(i) );
5243 SIMDType xmm1( x1 * A.load(i,j ) );
5244 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5245 SIMDType xmm3( x1 * A.load(i,j+2UL) );
5246 SIMDType xmm4( x1 * A.load(i,j+3UL) );
5250 xmm1 += x1 * A.load(i,j );
5251 xmm2 += x1 * A.load(i,j+1UL);
5252 xmm3 += x1 * A.load(i,j+2UL);
5253 xmm4 += x1 * A.load(i,j+3UL);
5256 y[j ] -=
sum( xmm1 ) * scalar;
5257 y[j+1UL] -=
sum( xmm2 ) * scalar;
5258 y[j+2UL] -=
sum( xmm3 ) * scalar;
5259 y[j+3UL] -=
sum( xmm4 ) * scalar;
5261 for( ; remainder && i<iend; ++i ) {
5262 y[j ] -= x[i] * A(i,j ) * scalar;
5263 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5264 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5265 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5275 for( ++i; i<iend; ++i ) {
5276 value1 += x[i] * A(i,j );
5277 value2 += x[i] * A(i,j+1UL);
5278 value3 += x[i] * A(i,j+2UL);
5279 value4 += x[i] * A(i,j+3UL);
5282 y[j ] -= value1 * scalar;
5283 y[j+1UL] -= value2 * scalar;
5284 y[j+2UL] -= value3 * scalar;
5285 y[j+3UL] -= value4 * scalar;
5289 for( ; (j+3UL) <= N; j+=3UL )
5291 const size_t ibegin( ( IsLower_v<MT1> )
5294 const size_t iend( ( IsUpper_v<MT1> )
5295 ?( IsStrictlyUpper_v<MT1> ? j+2UL : j+3UL )
5306 SIMDType x1( x.load(i) );
5307 SIMDType xmm1( x1 * A.load(i,j ) );
5308 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5309 SIMDType xmm3( x1 * A.load(i,j+2UL) );
5313 xmm1 += x1 * A.load(i,j );
5314 xmm2 += x1 * A.load(i,j+1UL);
5315 xmm3 += x1 * A.load(i,j+2UL);
5318 y[j ] -=
sum( xmm1 ) * scalar;
5319 y[j+1UL] -=
sum( xmm2 ) * scalar;
5320 y[j+2UL] -=
sum( xmm3 ) * scalar;
5322 for( ; remainder && i<iend; ++i ) {
5323 y[j ] -= x[i] * A(i,j ) * scalar;
5324 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5325 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5334 for( ++i; i<iend; ++i ) {
5335 value1 += x[i] * A(i,j );
5336 value2 += x[i] * A(i,j+1UL);
5337 value3 += x[i] * A(i,j+2UL);
5340 y[j ] -= value1 * scalar;
5341 y[j+1UL] -= value2 * scalar;
5342 y[j+2UL] -= value3 * scalar;
5346 for( ; (j+2UL) <= N; j+=2UL )
5348 const size_t ibegin( ( IsLower_v<MT1> )
5351 const size_t iend( ( IsUpper_v<MT1> )
5352 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
5363 SIMDType x1( x.load(i) );
5364 SIMDType xmm1( x1 * A.load(i,j ) );
5365 SIMDType xmm2( x1 * A.load(i,j+1UL) );
5369 xmm1 += x1 * A.load(i,j );
5370 xmm2 += x1 * A.load(i,j+1UL);
5373 y[j ] -=
sum( xmm1 ) * scalar;
5374 y[j+1UL] -=
sum( xmm2 ) * scalar;
5376 for( ; remainder && i<iend; ++i ) {
5377 y[j ] -= x[i] * A(i,j ) * scalar;
5378 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5386 for( ++i; i<iend; ++i ) {
5387 value1 += x[i] * A(i,j );
5388 value2 += x[i] * A(i,j+1UL);
5391 y[j ] -= value1 * scalar;
5392 y[j+1UL] -= value2 * scalar;
5398 const size_t ibegin( ( IsLower_v<MT1> )
5401 const size_t iend( ( IsUpper_v<MT1> )
5402 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
5413 SIMDType xmm1( x.load(i) * A.load(i,j ) );
5416 xmm1 += A.load(i,j) * x.load(i);
5419 y[j] -=
sum( xmm1 ) * scalar;
5421 for( ; remainder && i<iend; ++i ) {
5422 y[j] -= x[i] * A(i,j) * scalar;
5429 for( ++i; i<iend; ++i ) {
5430 value += x[i] * A(i,j);
5433 y[j] -= value * scalar;
// Large-kernel subtraction-assignment overload that is active only when
// UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t return type).
//   y      - target vector that is subtracted from
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
5453 template<
typename VT1
5457 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5458 -> DisableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
// No vectorized implementation applies: forward to the default kernel.
5460 selectDefaultSubAssignKernel( y, x, A, scalar );
5479 template<
typename VT1
5483 static inline auto selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5484 -> EnableIf_t< UseVectorizedDefaultKernel_v<VT1,VT2,MT1,ST2> >
5486 constexpr bool remainder( !IsPadded_v<VT2> || !IsPadded_v<MT1> );
5488 const size_t M( A.rows() );
5489 const size_t N( A.columns() );
5493 for( ; (j+8UL) <= N; j+=8UL )
5495 const size_t ibegin( ( IsLower_v<MT1> )
5498 const size_t iend( ( IsUpper_v<MT1> )
5499 ?( IsStrictlyUpper_v<MT1> ? j+7UL : j+8UL )
5512 const SIMDType x1( x.load(i ) );
5513 const SIMDType x2( x.load(i1) );
5514 const SIMDType x3( x.load(i2) );
5515 const SIMDType x4( x.load(i3) );
5516 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
5517 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
5518 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
5519 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
5520 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
5521 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
5522 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
5523 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
5528 const SIMDType x1( x.load(i ) );
5529 const SIMDType x2( x.load(i1) );
5530 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
5531 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
5532 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
5533 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
5534 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
5535 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
5536 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
5537 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
5541 const SIMDType x1( x.load(i) );
5542 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
5543 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
5544 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
5545 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
5546 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) ) * scalar;
5547 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) ) * scalar;
5548 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) ) * scalar;
5549 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) ) * scalar;
5552 for( ; remainder && i<iend; ++i ) {
5553 y[j ] -= x[i] * A(i,j ) * scalar;
5554 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5555 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5556 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5557 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
5558 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
5559 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
5560 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
5564 for( ; (j+4UL) <= N; j+=4UL )
5566 const size_t ibegin( ( IsLower_v<MT1> )
5569 const size_t iend( ( IsUpper_v<MT1> )
5570 ?( IsStrictlyUpper_v<MT1> ? j+3UL : j+4UL )
5583 const SIMDType x1( x.load(i ) );
5584 const SIMDType x2( x.load(i1) );
5585 const SIMDType x3( x.load(i2) );
5586 const SIMDType x4( x.load(i3) );
5587 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
5588 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
5589 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
5590 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
5595 const SIMDType x1( x.load(i ) );
5596 const SIMDType x2( x.load(i1) );
5597 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
5598 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
5599 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
5600 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
5604 const SIMDType x1( x.load(i) );
5605 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
5606 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
5607 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
5608 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
5611 for( ; remainder && i<iend; ++i ) {
5612 y[j ] -= x[i] * A(i,j ) * scalar;
5613 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5614 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
5615 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
5619 for( ; (j+2UL) <= N; j+=2UL )
5621 const size_t ibegin( ( IsLower_v<MT1> )
5624 const size_t iend( ( IsUpper_v<MT1> )
5625 ?( IsStrictlyUpper_v<MT1> ? j+1UL : j+2UL )
5638 const SIMDType x1( x.load(i ) );
5639 const SIMDType x2( x.load(i1) );
5640 const SIMDType x3( x.load(i2) );
5641 const SIMDType x4( x.load(i3) );
5642 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
5643 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
5648 const SIMDType x1( x.load(i ) );
5649 const SIMDType x2( x.load(i1) );
5650 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
5651 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
5655 const SIMDType x1( x.load(i) );
5656 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
5657 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
5660 for( ; remainder && i<iend; ++i ) {
5661 y[j ] -= x[i] * A(i,j ) * scalar;
5662 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
5668 const size_t ibegin( ( IsLower_v<MT1> )
5671 const size_t iend( ( IsUpper_v<MT1> )
5672 ?( IsStrictlyUpper_v<MT1> ? j : j+1UL )
5685 const SIMDType x1( x.load(i ) );
5686 const SIMDType x2( x.load(i1) );
5687 const SIMDType x3( x.load(i2) );
5688 const SIMDType x4( x.load(i3) );
5689 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
5694 const SIMDType x1( x.load(i ) );
5695 const SIMDType x2( x.load(i1) );
5696 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
5700 const SIMDType x1( x.load(i) );
5701 y[j] -=
sum( x1 * A.load(i,j) ) * scalar;
5704 for( ; remainder && i<iend; ++i ) {
5705 y[j] -= x[i] * A(i,j) * scalar;
// BLAS-selector overload for subtraction assignment that is active only when
// UseBlasKernel_v<VT1,VT2,MT1,ST2> is false (DisableIf_t return type).
//   y      - target vector that is subtracted from
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
5726 template<
typename VT1
5730 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5731 -> DisableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
// BLAS is not usable for these types: continue with the large-kernel selector.
5733 selectLargeSubAssignKernel( y, x, A, scalar );
5738#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-backed subtraction-assignment overload, enabled when
// UseBlasKernel_v<VT1,VT2,MT1,ST2> is true (EnableIf_t return type).
//   y      - target vector that is subtracted from
//   x      - left-hand side vector operand
//   A      - right-hand side matrix operand
//   scalar - scaling factor applied to the product
5752 template<
typename VT1
5756 static inline auto selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5757 -> EnableIf_t< UseBlasKernel_v<VT1,VT2,MT1,ST2> >
// Element type of the target; the scalar arguments of gemv are converted to it.
5759 using ET = ElementType_t<VT1>;
// Triangular A: build a temporary from the serially evaluated scalar * x,
// pass it to trmv with the matching lower/upper flag, then subtract it from y.
// NOTE(review): presumably trmv overwrites tmp with the triangular product -
// confirm against the Blaze trmv wrapper.
5761 if( IsTriangular_v<MT1> ) {
5762 ResultType_t<VT1> tmp(
serial( scalar * x ) );
5763 trmv( tmp, A, ( IsLower_v<MT1> )?( CblasLower ):( CblasUpper ) );
5764 subAssign( y, tmp );
// Non-triangular case (the branch keyword itself is not part of this excerpt):
// a single gemv call with the negated scalar and ET(1) as the scalar arguments.
// NOTE(review): these look like the BLAS alpha and beta factors, so negating
// the scalar turns the accumulation into a subtraction - confirm.
5767 gemv( y, x, A,
ET(-scalar),
ET(1) );
5789 template<
typename VT1
5791 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5802 multAssign( *lhs, tmp );
5822 template<
typename VT1
5824 friend inline void divAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5835 divAssign( *lhs, tmp );
5857 template<
typename VT1
5860 -> EnableIf_t< UseSMPAssign_v<VT1> >
5866 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5867 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5869 if( right.rows() == 0UL ||
5870 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5874 else if( right.columns() == 0UL ) {
5904 template<
typename VT1
5907 -> EnableIf_t< UseSMPAssign_v<VT1> >
5936 template<
typename VT1
5939 -> EnableIf_t< UseSMPAssign_v<VT1> >
5945 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5946 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5948 if( right.rows() == 0UL || right.columns() == 0UL ||
5949 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
5983 template<
typename VT1
5986 -> EnableIf_t< UseSMPAssign_v<VT1> >
5992 LeftOperand_t<VMM> left ( rhs.vector_.leftOperand() );
5993 RightOperand_t<VMM> right( rhs.vector_.rightOperand() );
5995 if( right.rows() == 0UL || right.columns() == 0UL ||
5996 ( IsStrictlyTriangular_v<MT> && right.rows() == 1UL ) ) {
6030 template<
typename VT1
6033 -> EnableIf_t< UseSMPAssign_v<VT1> >
6066 template<
typename VT1
6069 -> EnableIf_t< UseSMPAssign_v<VT1> >
6142template<
typename VT
6144inline decltype(
auto)
6151 if( (*vec).size() != (*mat).rows() ) {
6156 return ReturnType( *vec, *mat );
// IsAligned specialization for TDVecTDMatMultExpr: the resulting constant is
// true only when both operand types, VT and MT, are themselves reported as
// aligned by IsAligned_v.
6171template<
typename VT,
typename MT >
6172struct IsAligned< TDVecTDMatMultExpr<VT,MT> >
6173 :
public BoolConstant< IsAligned_v<VT> && IsAligned_v<MT> >
Header file for auxiliary alias declarations.
typename T::CompositeType CompositeType_t
Alias declaration for nested CompositeType type definitions.
Definition: Aliases.h:110
typename T::ResultType ResultType_t
Alias declaration for nested ResultType type definitions.
Definition: Aliases.h:450
typename T::ElementType ElementType_t
Alias declaration for nested ElementType type definitions.
Definition: Aliases.h:190
typename T::TransposeType TransposeType_t
Alias declaration for nested TransposeType type definitions.
Definition: Aliases.h:550
Header file for run time assertion macros.
Header file for the blaze::checked and blaze::unchecked instances.
Constraints on the storage order of matrix types.
Header file for the complex data type.
Header file for the EnableIf class template.
Header file for the function trace functionality.
Header file for the HasConstDataAccess type trait.
Header file for the HasMutableDataAccess type trait.
Header file for the HasSIMDAdd type trait.
Header file for the HasSIMDMult type trait.
Header file for the If class template.
Header file for the IntegralConstant class template.
Header file for the IsAligned type trait.
Header file for the IsBLASCompatible type trait.
Header file for the IsBuiltin type trait.
Header file for the IsComplexDouble type trait.
Header file for the IsComplexFloat type trait.
Header file for the IsComplex type trait.
Header file for the IsComputation type trait class.
Header file for the IsContiguous type trait.
Header file for the IsDiagonal type trait.
Header file for the IsDouble type trait.
Header file for the IsExpression type trait class.
Header file for the IsFloat type trait.
Header file for the IsLower type trait.
Header file for the IsPadded type trait.
Header file for the IsSIMDCombinable type trait.
Header file for the IsSame and IsStrictlySame type traits.
Header file for the IsStrictlyLower type trait.
Header file for the IsStrictlyTriangular type trait.
Header file for the IsStrictlyUpper type trait.
Header file for the IsTriangular type trait.
Header file for the IsUpper type trait.
Deactivation of problematic macros.
Header file for the multiplication trait.
Header file for the prevMultiple shim.
Constraint on the transpose flag of vector types.
Header file for all SIMD functionality.
Constraint on the data type.
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: DVecScalarMultExpr.h:530
LeftOperand vector_
Left-hand side dense vector of the multiplication expression.
Definition: DVecScalarMultExpr.h:591
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: DVecScalarMultExpr.h:430
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DVecScalarMultExpr.h:461
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: DVecScalarMultExpr.h:520
RightOperand rightOperand() const noexcept
Returns the right-hand side scalar operand.
Definition: DVecScalarMultExpr.h:540
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: DVecScalarMultExpr.h:584
DVecScalarMultExpr< VT, ST, TF > This
Type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:163
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: DVecScalarMultExpr.h:169
VecScalarMultExpr< DenseVector< This, TF > > BaseType
Base type of this DVecScalarMultExpr instance.
Definition: DVecScalarMultExpr.h:166
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: DVecScalarMultExpr.h:440
ResultType_t< VT > RT
Result type of the dense vector expression.
Definition: DVecScalarMultExpr.h:110
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DVecScalarMultExpr.h:474
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: DVecScalarMultExpr.h:552
If_t< useAssign, const ResultType, const DVecScalarMultExpr & > CompositeType
Data type for composite expression templates.
Definition: DVecScalarMultExpr.h:176
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: DVecScalarMultExpr.h:179
ElementType_t< VT > ET
Element type of the dense vector expression.
Definition: DVecScalarMultExpr.h:112
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: DVecScalarMultExpr.h:170
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: DVecScalarMultExpr.h:435
DVecScalarMultExpr(const VT &vector, ST scalar) noexcept
Constructor for the DVecScalarMultExpr class.
Definition: DVecScalarMultExpr.h:449
ST RightOperand
Composite type of the right-hand side scalar value.
Definition: DVecScalarMultExpr.h:182
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: DVecScalarMultExpr.h:574
MultTrait_t< RT, ST > ResultType
Result type for expression template evaluations.
Definition: DVecScalarMultExpr.h:168
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: DVecScalarMultExpr.h:564
RightOperand scalar_
Right-hand side scalar of the multiplication expression.
Definition: DVecScalarMultExpr.h:592
const If_t< returnExpr, ExprReturnType, ElementType > ReturnType
Return type for expression template evaluations.
Definition: DVecScalarMultExpr.h:173
Base class for dense matrices.
Definition: DenseMatrix.h:82
Base class for N-dimensional dense vectors.
Definition: DenseVector.h:77
SIMD characteristics of data types.
Definition: SIMDTrait.h:297
Expression object for transpose dense vector-transpose dense matrix multiplications.
Definition: TDVecTDMatMultExpr.h:126
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:313
ElementType_t< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:131
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:300
ResultType_t< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:129
TransposeType_t< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:209
static constexpr size_t SIMDSIZE
The number of elements packed within a single SIMD element.
Definition: TDVecTDMatMultExpr.h:243
CompositeType_t< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:134
static constexpr bool smpAssignable
Compilation switch for the expression template assignment strategy.
Definition: TDVecTDMatMultExpr.h:237
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:212
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:389
If_t< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:222
ElementType_t< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:132
ResultType_t< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:130
static constexpr bool simdEnabled
Compilation switch for the expression template evaluation strategy.
Definition: TDVecTDMatMultExpr.h:230
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:323
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:357
static constexpr bool evaluateMatrix
Compilation switch for the composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:144
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:345
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:266
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:367
If_t< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:225
CompositeType_t< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:133
MultTrait_t< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:208
ElementType_t< ResultType > ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:210
If_t< IsExpression_v< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:219
RightOperand rightOperand() const noexcept
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:333
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:213
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:390
static constexpr bool evaluateVector
Compilation switch for the composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:139
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:377
SIMDTrait_t< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecTDMatMultExpr.h:211
If_t< IsExpression_v< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:216
TDVecTDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:252
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Constraint on the data type.
Header file for the Computation base class.
Header file for the DenseVector base class.
Header file for the TVecMatMultExpr base class.
Header file for the VecScalarMultExpr base class.
Header file for BLAS general matrix/vector multiplication functions (gemv).
decltype(auto) column(Matrix< MT, SO > &matrix, RCAs... args)
Creating a view on a specific column of the given matrix.
Definition: Column.h:137
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.
Definition: BLAS.h:68
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.
Definition: BLAS.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).
Definition: BLAS.h:169
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.
Definition: SameType.h:71
decltype(auto) serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:812
decltype(auto) sum(const DenseMatrix< MT, SO > &dm)
Reduces the given dense matrix by means of addition.
Definition: DMatReduceExpr.h:2156
#define BLAZE_CONSTRAINT_MUST_NOT_REQUIRE_EVALUATION(T)
Constraint on the data type.
Definition: RequiresEvaluation.h:81
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.
Definition: TVecMatMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_NOT_BE_MATMATMULTEXPR_TYPE(T)
Constraint on the data type.
Definition: MatMatMultExpr.h:83
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.
Definition: DenseMatrix.h:61
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_SCALAR_TYPE(T)
Constraint on the data type.
Definition: Scalar.h:61
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.
Definition: RowVector.h:61
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.
Definition: ColumnMajorMatrix.h:61
typename MultTrait< T1, T2 >::Type MultTrait_t
Auxiliary alias declaration for the MultTrait class template.
Definition: MultTrait.h:165
BLAZE_ALWAYS_INLINE constexpr auto prevMultiple(T1 value, T2 factor) noexcept
Rounds down an integral value to the previous multiple of a given factor.
Definition: PrevMultiple.h:68
constexpr void reset(Matrix< MT, SO > &matrix)
Resetting the given matrix.
Definition: Matrix.h:806
MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:518
constexpr size_t size(const Matrix< MT, SO > &matrix) noexcept
Returns the total number of elements of the matrix.
Definition: Matrix.h:676
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.
Definition: Assert.h:101
typename SIMDTrait< T >::Type SIMDTrait_t
Auxiliary alias declaration for the SIMDTrait class template.
Definition: SIMDTrait.h:315
auto smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:221
auto smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:162
auto smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:100
auto smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs) -> EnableIf_t< IsDenseVector_v< VT1 > >
Default implementation of the SMP multiplication assignment of a vector to a dense vector.
Definition: DenseVector.h:192
auto smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs) -> EnableIf_t< IsDenseMatrix_v< MT1 > >
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:131
decltype(auto) subvector(Vector< VT, TF > &, RSAs...)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:158
typename If< Condition >::template Type< T1, T2 > If_t
Auxiliary alias template for the If class template.
Definition: If.h:108
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.
Definition: IntegralConstant.h:110
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.
Definition: Exception.h:331
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.
Definition: Exception.h:235
#define BLAZE_FUNCTION_TRACE
Function trace macro.
Definition: FunctionTrace.h:94
constexpr Unchecked unchecked
Global Unchecked instance.
Definition: Check.h:146
Header file for the exception macros of the math module.
Header file for all forward declarations for expression class templates.
Header file for the reset shim.
Header file for the serial shim.
Base class for all compute expression templates.
Definition: Computation.h:68
Base class for all vector/matrix multiplication expression templates.
Definition: TVecMatMultExpr.h:69
System settings for the BLAS mode.
System settings for performance optimizations.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
Header file for BLAS triangular matrix/vector multiplication functions (trmv).
Header file for the RequiresEvaluation type trait.
Header file for basic type definitions.