35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
124 template<
typename VT
126 class TDVecDMatMultExpr :
public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
127 ,
private TVecMatMultExpr
128 ,
private Computation
157 template<
typename T1 >
158 struct UseSMPAssign {
159 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
169 template<
typename T1,
typename T2,
typename T3 >
170 struct UseBlasKernel {
172 HasMutableDataAccess<T1>::value &&
173 HasConstDataAccess<T2>::value &&
174 HasConstDataAccess<T3>::value &&
175 !IsDiagonal<T3>::value &&
176 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
177 IsBLASCompatible< ElementType_<T1> >::value &&
178 IsBLASCompatible< ElementType_<T2> >::value &&
179 IsBLASCompatible< ElementType_<T3> >::value &&
180 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
181 IsSame< ElementType_<T1>, ElementType_<T3> >::value };
192 template<
typename T1,
typename T2,
typename T3 >
193 struct UseVectorizedDefaultKernel {
195 !IsDiagonal<T3>::value &&
196 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
197 AreSIMDCombinable< ElementType_<T1>
199 , ElementType_<T3> >::value &&
200 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
201 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
232 VT::simdEnabled && MT::simdEnabled &&
237 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
238 !evaluateMatrix && MT::smpAssignable };
271 return vec_[index] *
mat_(index,index);
298 inline ReturnType
at(
size_t index )
const {
299 if( index >=
mat_.columns() ) {
302 return (*
this)[index];
311 inline size_t size() const noexcept {
312 return mat_.columns();
342 template<
typename T >
343 inline bool canAlias(
const T* alias )
const noexcept {
344 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
354 template<
typename T >
355 inline bool isAliased(
const T* alias )
const noexcept {
356 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
366 return vec_.isAligned() &&
mat_.isAligned();
378 (
mat_.rows() *
mat_.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
379 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
402 template<
typename VT1 >
409 if( rhs.mat_.rows() == 0UL ) {
413 else if( rhs.mat_.columns() == 0UL ) {
417 LT x(
serial( rhs.vec_ ) );
418 RT A(
serial( rhs.mat_ ) );
425 TDVecDMatMultExpr::selectAssignKernel( ~lhs, x, A );
441 template<
typename VT1
444 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
448 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
449 selectSmallAssignKernel( y, x, A );
451 selectBlasAssignKernel( y, x, A );
470 template<
typename VT1
473 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
475 const size_t M( A.rows() );
476 const size_t N( A.columns() );
478 if( IsStrictlyUpper<MT1>::value ) {
482 if( !IsLower<MT1>::value )
484 const size_t jbegin( IsStrictlyUpper<MT1>::value ? 1UL : 0UL );
485 for(
size_t j=jbegin; j<N; ++j ) {
486 y[j] = x[0UL] * A(0UL,j);
490 for(
size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
492 if( IsDiagonal<MT1>::value )
494 y[i] = x[i] * A(i,i);
498 const size_t jbegin( ( IsUpper<MT1>::value )
499 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
501 const size_t jend( ( IsLower<MT1>::value )
502 ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
506 const size_t jnum( jend - jbegin );
507 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
509 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
510 y[j ] += x[i] * A(i,j );
511 y[j+1UL] += x[i] * A(i,j+1UL);
514 y[jpos] += x[i] * A(i,jpos);
516 if( IsLower<MT1>::value ) {
517 y[jend] = x[i] * A(i,jend);
522 if( IsStrictlyLower<MT1>::value ) {
543 template<
typename VT1
546 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
547 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
549 selectDefaultAssignKernel( y, x, A );
568 template<
typename VT1
571 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
572 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
574 const size_t M( A.rows() );
575 const size_t N( A.columns() );
577 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
579 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
584 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
586 const size_t ibegin( ( IsLower<MT1>::value )
587 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
589 const size_t iend( ( IsUpper<MT1>::value )
590 ?(
min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
594 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
596 for(
size_t i=ibegin; i<iend; ++i ) {
597 const SIMDType x1(
set( x[i] ) );
598 xmm1 = xmm1 + x1 * A.load(i,j );
599 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
600 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
601 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
602 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
603 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
604 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
605 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
609 y.store( j+SIMDSIZE , xmm2 );
610 y.store( j+SIMDSIZE*2UL, xmm3 );
611 y.store( j+SIMDSIZE*3UL, xmm4 );
612 y.store( j+SIMDSIZE*4UL, xmm5 );
613 y.store( j+SIMDSIZE*5UL, xmm6 );
614 y.store( j+SIMDSIZE*6UL, xmm7 );
615 y.store( j+SIMDSIZE*7UL, xmm8 );
618 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
620 const size_t ibegin( ( IsLower<MT1>::value )
621 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
623 const size_t iend( ( IsUpper<MT1>::value )
624 ?(
min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
628 SIMDType xmm1, xmm2, xmm3, xmm4;
630 for(
size_t i=ibegin; i<iend; ++i ) {
631 const SIMDType x1(
set( x[i] ) );
632 xmm1 = xmm1 + x1 * A.load(i,j );
633 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
634 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
635 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
639 y.store( j+SIMDSIZE , xmm2 );
640 y.store( j+SIMDSIZE*2UL, xmm3 );
641 y.store( j+SIMDSIZE*3UL, xmm4 );
644 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
646 const size_t ibegin( ( IsLower<MT1>::value )
647 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
649 const size_t iend( ( IsUpper<MT1>::value )
650 ?(
min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
654 SIMDType xmm1, xmm2, xmm3;
656 for(
size_t i=ibegin; i<iend; ++i ) {
657 const SIMDType x1(
set( x[i] ) );
658 xmm1 = xmm1 + x1 * A.load(i,j );
659 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
660 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
664 y.store( j+SIMDSIZE , xmm2 );
665 y.store( j+SIMDSIZE*2UL, xmm3 );
668 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
670 const size_t ibegin( ( IsLower<MT1>::value )
671 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
673 const size_t iend( ( IsUpper<MT1>::value )
674 ?(
min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
680 for(
size_t i=ibegin; i<iend; ++i ) {
681 const SIMDType x1(
set( x[i] ) );
682 xmm1 = xmm1 + x1 * A.load(i,j );
683 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
687 y.store( j+SIMDSIZE, xmm2 );
690 for( ; j<jpos; j+=SIMDSIZE )
692 const size_t ibegin( ( IsLower<MT1>::value )
693 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
695 const size_t iend( ( IsUpper<MT1>::value )
696 ?(
min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
702 for(
size_t i=ibegin; i<iend; ++i ) {
703 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
709 for( ; remainder && j<N; ++j )
711 const size_t ibegin( ( IsLower<MT1>::value )
712 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
714 const size_t iend( ( IsUpper<MT1>::value )
715 ?(
min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
721 for(
size_t i=ibegin; i<iend; ++i ) {
722 value += x[i] * A(i,j);
745 template<
typename VT1
748 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
749 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
751 selectDefaultAssignKernel( y, x, A );
770 template<
typename VT1
773 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
774 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
776 const size_t M( A.rows() );
777 const size_t N( A.columns() );
779 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
781 const size_t jblock( 32768UL /
sizeof( ElementType ) );
782 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
788 for(
size_t jj=0U; jj<N; jj+=jblock ) {
789 for(
size_t ii=0UL; ii<M; ii+=iblock )
791 const size_t iend(
min( ii+iblock, M ) );
792 const size_t jtmp(
min( jj+jblock, N ) );
793 const size_t jend( ( IsLower<MT1>::value )
794 ?(
min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
797 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
798 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
800 size_t j( ( IsUpper<MT1>::value )
801 ?(
max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) &
size_t(-SIMDSIZE) ) )
804 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
806 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
808 for(
size_t i=ii; i<iend; ++i ) {
809 const SIMDType x1(
set( x[i] ) );
810 xmm1 = xmm1 + x1 * A.load(i,j );
811 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
812 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
813 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
814 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
815 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
816 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
817 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
820 y.store( j , y.load(j ) + xmm1 );
821 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
822 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
823 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
824 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
825 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
826 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
827 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
830 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
832 SIMDType xmm1, xmm2, xmm3, xmm4;
834 for(
size_t i=ii; i<iend; ++i ) {
835 const SIMDType x1(
set( x[i] ) );
836 xmm1 = xmm1 + x1 * A.load(i,j );
837 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
838 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
839 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
842 y.store( j , y.load(j ) + xmm1 );
843 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
844 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
845 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
848 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
850 SIMDType xmm1, xmm2, xmm3;
852 for(
size_t i=ii; i<iend; ++i ) {
853 const SIMDType x1(
set( x[i] ) );
854 xmm1 = xmm1 + x1 * A.load(i,j );
855 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
856 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
859 y.store( j , y.load(j ) + xmm1 );
860 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
861 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
864 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
868 for(
size_t i=ii; i<iend; ++i ) {
869 const SIMDType x1(
set( x[i] ) );
870 xmm1 = xmm1 + x1 * A.load(i,j );
871 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
874 y.store( j , y.load(j ) + xmm1 );
875 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
878 for( ; j<jpos; j+=SIMDSIZE )
882 for(
size_t i=ii; i<iend; ++i ) {
883 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
886 y.store( j, y.load(j) + xmm1 );
889 for( ; remainder && j<jend; ++j )
893 for(
size_t i=ii; i<iend; ++i ) {
894 value += x[i] * A(i,j);
919 template<
typename VT1
922 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
923 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
925 selectLargeAssignKernel( y, x, A );
931 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
945 template<
typename VT1
948 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
949 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
951 typedef ElementType_<VT1> ET;
953 if( IsTriangular<MT1>::value ) {
955 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
958 gemv( y, x, A, ET(1), ET(0) );
978 template<
typename VT1 >
979 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
989 const ResultType tmp(
serial( rhs ) );
1008 template<
typename VT1 >
1009 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1015 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1019 LT x(
serial( rhs.vec_ ) );
1020 RT A(
serial( rhs.mat_ ) );
1027 TDVecDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
1043 template<
typename VT1
1046 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1048 if( ( IsDiagonal<MT1>::value ) ||
1049 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1050 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1051 selectSmallAddAssignKernel( y, x, A );
1053 selectBlasAddAssignKernel( y, x, A );
1072 template<
typename VT1
1075 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1077 const size_t M( A.rows() );
1078 const size_t N( A.columns() );
1080 for(
size_t i=0UL; i<M; ++i )
1082 if( IsDiagonal<MT1>::value )
1084 y[i] += x[i] * A(i,i);
1088 const size_t jbegin( ( IsUpper<MT1>::value )
1089 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1091 const size_t jend( ( IsLower<MT1>::value )
1092 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1096 const size_t jnum( jend - jbegin );
1097 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1099 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1100 y[j ] += x[i] * A(i,j );
1101 y[j+1UL] += x[i] * A(i,j+1UL);
1104 y[jpos] += x[i] * A(i,jpos);
1126 template<
typename VT1
1129 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1130 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1132 selectDefaultAddAssignKernel( y, x, A );
1151 template<
typename VT1
1154 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1155 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1157 const size_t M( A.rows() );
1158 const size_t N( A.columns() );
1160 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1162 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1167 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1169 const size_t ibegin( ( IsLower<MT1>::value )
1170 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1172 const size_t iend( ( IsUpper<MT1>::value )
1173 ?(
min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1177 SIMDType xmm1( y.load(j ) );
1178 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1179 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1180 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1181 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1182 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1183 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1184 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1186 for(
size_t i=ibegin; i<iend; ++i ) {
1187 const SIMDType x1(
set( x[i] ) );
1188 xmm1 = xmm1 + x1 * A.load(i,j );
1189 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1190 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1191 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1192 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
1193 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
1194 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
1195 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
1198 y.store( j , xmm1 );
1199 y.store( j+SIMDSIZE , xmm2 );
1200 y.store( j+SIMDSIZE*2UL, xmm3 );
1201 y.store( j+SIMDSIZE*3UL, xmm4 );
1202 y.store( j+SIMDSIZE*4UL, xmm5 );
1203 y.store( j+SIMDSIZE*5UL, xmm6 );
1204 y.store( j+SIMDSIZE*6UL, xmm7 );
1205 y.store( j+SIMDSIZE*7UL, xmm8 );
1208 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1210 const size_t ibegin( ( IsLower<MT1>::value )
1211 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1213 const size_t iend( ( IsUpper<MT1>::value )
1214 ?(
min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1218 SIMDType xmm1( y.load(j ) );
1219 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1220 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1221 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1223 for(
size_t i=ibegin; i<iend; ++i ) {
1224 const SIMDType x1(
set( x[i] ) );
1225 xmm1 = xmm1 + x1 * A.load(i,j );
1226 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1227 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1228 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1231 y.store( j , xmm1 );
1232 y.store( j+SIMDSIZE , xmm2 );
1233 y.store( j+SIMDSIZE*2UL, xmm3 );
1234 y.store( j+SIMDSIZE*3UL, xmm4 );
1237 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1239 const size_t ibegin( ( IsLower<MT1>::value )
1240 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1242 const size_t iend( ( IsUpper<MT1>::value )
1243 ?(
min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1247 SIMDType xmm1( y.load(j ) );
1248 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1249 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1251 for(
size_t i=ibegin; i<iend; ++i ) {
1252 const SIMDType x1(
set( x[i] ) );
1253 xmm1 = xmm1 + x1 * A.load(i,j );
1254 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1255 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1258 y.store( j , xmm1 );
1259 y.store( j+SIMDSIZE , xmm2 );
1260 y.store( j+SIMDSIZE*2UL, xmm3 );
1263 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1265 const size_t ibegin( ( IsLower<MT1>::value )
1266 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1268 const size_t iend( ( IsUpper<MT1>::value )
1269 ?(
min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1273 SIMDType xmm1( y.load(j ) );
1274 SIMDType xmm2( y.load(j+SIMDSIZE) );
1276 for(
size_t i=ibegin; i<iend; ++i ) {
1277 const SIMDType x1(
set( x[i] ) );
1278 xmm1 = xmm1 + x1 * A.load(i,j );
1279 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
1282 y.store( j , xmm1 );
1283 y.store( j+SIMDSIZE, xmm2 );
1286 for( ; j<jpos; j+=SIMDSIZE )
1288 const size_t ibegin( ( IsLower<MT1>::value )
1289 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1291 const size_t iend( ( IsUpper<MT1>::value )
1292 ?(
min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1296 SIMDType xmm1( y.load(j) );
1298 for(
size_t i=ibegin; i<iend; ++i ) {
1299 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
1305 for( ; remainder && j<N; ++j )
1307 const size_t ibegin( ( IsLower<MT1>::value )
1308 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1310 const size_t iend( ( IsUpper<MT1>::value )
1311 ?(
min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1317 for(
size_t i=ibegin; i<iend; ++i ) {
1318 value += x[i] * A(i,j);
1341 template<
typename VT1
1344 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1345 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1347 selectDefaultAddAssignKernel( y, x, A );
1366 template<
typename VT1
1369 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1370 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1372 const size_t M( A.rows() );
1373 const size_t N( A.columns() );
1375 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1377 const size_t jblock( 32768UL /
sizeof( ElementType ) );
1378 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1382 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1383 for(
size_t ii=0UL; ii<M; ii+=iblock )
1385 const size_t iend(
min( ii+iblock, M ) );
1386 const size_t jtmp(
min( jj+jblock, N ) );
1387 const size_t jend( ( IsLower<MT1>::value )
1388 ?(
min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1391 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1392 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
1394 size_t j( ( IsUpper<MT1>::value )
1395 ?(
max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) &
size_t(-SIMDSIZE) ) )
1398 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1400 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1402 for(
size_t i=ii; i<iend; ++i ) {
1403 const SIMDType x1(
set( x[i] ) );
1404 xmm1 = xmm1 + x1 * A.load(i,j );
1405 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1406 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1407 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1408 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
1409 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
1410 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
1411 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
1414 y.store( j , y.load(j ) + xmm1 );
1415 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1416 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1417 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1418 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5 );
1419 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6 );
1420 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7 );
1421 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8 );
1424 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1426 SIMDType xmm1, xmm2, xmm3, xmm4;
1428 for(
size_t i=ii; i<iend; ++i ) {
1429 const SIMDType x1(
set( x[i] ) );
1430 xmm1 = xmm1 + x1 * A.load(i,j );
1431 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1432 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1433 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1436 y.store( j , y.load(j ) + xmm1 );
1437 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1438 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1439 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4 );
1442 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1444 SIMDType xmm1, xmm2, xmm3;
1446 for(
size_t i=ii; i<iend; ++i ) {
1447 const SIMDType x1(
set( x[i] ) );
1448 xmm1 = xmm1 + x1 * A.load(i,j );
1449 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1450 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1453 y.store( j , y.load(j ) + xmm1 );
1454 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2 );
1455 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3 );
1458 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1460 SIMDType xmm1, xmm2;
1462 for(
size_t i=ii; i<iend; ++i ) {
1463 const SIMDType x1(
set( x[i] ) );
1464 xmm1 = xmm1 + x1 * A.load(i,j );
1465 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
1468 y.store( j , y.load(j ) + xmm1 );
1469 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2 );
1472 for( ; j<jpos; j+=SIMDSIZE )
1476 for(
size_t i=ii; i<iend; ++i ) {
1477 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
1480 y.store( j, y.load(j) + xmm1 );
1483 for( ; remainder && j<jend; ++j )
1487 for(
size_t i=ii; i<iend; ++i ) {
1488 value += x[i] * A(i,j);
1513 template<
typename VT1
1516 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
1517 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1519 selectLargeAddAssignKernel( y, x, A );
1525 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1539 template<
typename VT1
1542 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
1543 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1545 typedef ElementType_<VT1> ET;
1547 if( IsTriangular<MT1>::value ) {
1548 ResultType_<VT1> tmp(
serial( x ) );
1549 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1550 addAssign( y, tmp );
1553 gemv( y, x, A, ET(1), ET(1) );
1577 template<
typename VT1 >
1578 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
1584 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1588 LT x(
serial( rhs.vec_ ) );
1589 RT A(
serial( rhs.mat_ ) );
1596 TDVecDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
1612 template<
typename VT1
1615 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1617 if( ( IsDiagonal<MT1>::value ) ||
1618 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1619 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
1620 selectSmallSubAssignKernel( y, x, A );
1622 selectBlasSubAssignKernel( y, x, A );
1641 template<
typename VT1
1644 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1646 const size_t M( A.rows() );
1647 const size_t N( A.columns() );
1649 for(
size_t i=0UL; i<M; ++i )
1651 if( IsDiagonal<MT1>::value )
1653 y[i] -= x[i] * A(i,i);
1657 const size_t jbegin( ( IsUpper<MT1>::value )
1658 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1660 const size_t jend( ( IsLower<MT1>::value )
1661 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1665 const size_t jnum( jend - jbegin );
1666 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) );
1668 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
1669 y[j ] -= x[i] * A(i,j );
1670 y[j+1UL] -= x[i] * A(i,j+1UL);
1673 y[jpos] -= x[i] * A(i,jpos);
1695 template<
typename VT1
1698 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1699 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1701 selectDefaultSubAssignKernel( y, x, A );
1721 template<
typename VT1
1724 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1725 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1727 const size_t M( A.rows() );
1728 const size_t N( A.columns() );
1730 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1732 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
1737 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1739 const size_t ibegin( ( IsLower<MT1>::value )
1740 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1742 const size_t iend( ( IsUpper<MT1>::value )
1743 ?(
min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1747 SIMDType xmm1( y.load(j ) );
1748 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1749 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1750 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1751 SIMDType xmm5( y.load(j+SIMDSIZE*4UL) );
1752 SIMDType xmm6( y.load(j+SIMDSIZE*5UL) );
1753 SIMDType xmm7( y.load(j+SIMDSIZE*6UL) );
1754 SIMDType xmm8( y.load(j+SIMDSIZE*7UL) );
1756 for(
size_t i=ibegin; i<iend; ++i ) {
1757 const SIMDType x1(
set( x[i] ) );
1758 xmm1 = xmm1 - x1 * A.load(i,j );
1759 xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE );
1760 xmm3 = xmm3 - x1 * A.load(i,j+SIMDSIZE*2UL);
1761 xmm4 = xmm4 - x1 * A.load(i,j+SIMDSIZE*3UL);
1762 xmm5 = xmm5 - x1 * A.load(i,j+SIMDSIZE*4UL);
1763 xmm6 = xmm6 - x1 * A.load(i,j+SIMDSIZE*5UL);
1764 xmm7 = xmm7 - x1 * A.load(i,j+SIMDSIZE*6UL);
1765 xmm8 = xmm8 - x1 * A.load(i,j+SIMDSIZE*7UL);
1768 y.store( j , xmm1 );
1769 y.store( j+SIMDSIZE , xmm2 );
1770 y.store( j+SIMDSIZE*2UL, xmm3 );
1771 y.store( j+SIMDSIZE*3UL, xmm4 );
1772 y.store( j+SIMDSIZE*4UL, xmm5 );
1773 y.store( j+SIMDSIZE*5UL, xmm6 );
1774 y.store( j+SIMDSIZE*6UL, xmm7 );
1775 y.store( j+SIMDSIZE*7UL, xmm8 );
1778 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1780 const size_t ibegin( ( IsLower<MT1>::value )
1781 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1783 const size_t iend( ( IsUpper<MT1>::value )
1784 ?(
min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1788 SIMDType xmm1( y.load(j ) );
1789 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1790 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1791 SIMDType xmm4( y.load(j+SIMDSIZE*3UL) );
1793 for(
size_t i=ibegin; i<iend; ++i ) {
1794 const SIMDType x1(
set( x[i] ) );
1795 xmm1 = xmm1 - x1 * A.load(i,j );
1796 xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE );
1797 xmm3 = xmm3 - x1 * A.load(i,j+SIMDSIZE*2UL);
1798 xmm4 = xmm4 - x1 * A.load(i,j+SIMDSIZE*3UL);
1801 y.store( j , xmm1 );
1802 y.store( j+SIMDSIZE , xmm2 );
1803 y.store( j+SIMDSIZE*2UL, xmm3 );
1804 y.store( j+SIMDSIZE*3UL, xmm4 );
1807 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
1809 const size_t ibegin( ( IsLower<MT1>::value )
1810 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1812 const size_t iend( ( IsUpper<MT1>::value )
1813 ?(
min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1817 SIMDType xmm1( y.load(j ) );
1818 SIMDType xmm2( y.load(j+SIMDSIZE ) );
1819 SIMDType xmm3( y.load(j+SIMDSIZE*2UL) );
1821 for(
size_t i=ibegin; i<iend; ++i ) {
1822 const SIMDType x1(
set( x[i] ) );
1823 xmm1 = xmm1 - x1 * A.load(i,j );
1824 xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE );
1825 xmm3 = xmm3 - x1 * A.load(i,j+SIMDSIZE*2UL);
1828 y.store( j , xmm1 );
1829 y.store( j+SIMDSIZE , xmm2 );
1830 y.store( j+SIMDSIZE*2UL, xmm3 );
1833 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
1835 const size_t ibegin( ( IsLower<MT1>::value )
1836 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1838 const size_t iend( ( IsUpper<MT1>::value )
1839 ?(
min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1843 SIMDType xmm1( y.load(j ) );
1844 SIMDType xmm2( y.load(j+SIMDSIZE) );
1846 for(
size_t i=ibegin; i<iend; ++i ) {
1847 const SIMDType x1(
set( x[i] ) );
1848 xmm1 = xmm1 - x1 * A.load(i,j );
1849 xmm2 = xmm2 - x1 * A.load(i,j+SIMDSIZE);
1852 y.store( j , xmm1 );
1853 y.store( j+SIMDSIZE, xmm2 );
1856 for( ; j<jpos; j+=SIMDSIZE )
1858 const size_t ibegin( ( IsLower<MT1>::value )
1859 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1861 const size_t iend( ( IsUpper<MT1>::value )
1862 ?(
min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1866 SIMDType xmm1( y.load(j) );
1868 for(
size_t i=ibegin; i<iend; ++i ) {
1869 xmm1 = xmm1 -
set( x[i] ) * A.load(i,j);
1875 for( ; remainder && j<N; ++j )
1877 const size_t ibegin( ( IsLower<MT1>::value )
1878 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1880 const size_t iend( ( IsUpper<MT1>::value )
1881 ?(
min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
1887 for(
size_t i=ibegin; i<iend; ++i ) {
1888 value += x[i] * A(i,j);
1911 template<
typename VT1
1914 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1915 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1917 selectDefaultSubAssignKernel( y, x, A );
1937 template<
typename VT1
1940 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1> >
1941 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1943 const size_t M( A.rows() );
1944 const size_t N( A.columns() );
1946 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
1948 const size_t jblock( 32768UL /
sizeof( ElementType ) );
1949 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
1953 for(
size_t jj=0U; jj<N; jj+=jblock ) {
1954 for(
size_t ii=0UL; ii<M; ii+=iblock )
1956 const size_t iend(
min( ii+iblock, M ) );
1957 const size_t jtmp(
min( jj+jblock, N ) );
1958 const size_t jend( ( IsLower<MT1>::value )
1959 ?(
min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
1962 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
1963 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
1965 size_t j( ( IsUpper<MT1>::value )
1966 ?(
max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) &
size_t(-SIMDSIZE) ) )
1969 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
1971 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1973 for(
size_t i=ii; i<iend; ++i ) {
1974 const SIMDType x1(
set( x[i] ) );
1975 xmm1 = xmm1 + x1 * A.load(i,j );
1976 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
1977 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
1978 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
1979 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
1980 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
1981 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
1982 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
1985 y.store( j , y.load(j ) - xmm1 );
1986 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
1987 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
1988 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
1989 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5 );
1990 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6 );
1991 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7 );
1992 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8 );
1995 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
1997 SIMDType xmm1, xmm2, xmm3, xmm4;
1999 for(
size_t i=ii; i<iend; ++i ) {
2000 const SIMDType x1(
set( x[i] ) );
2001 xmm1 = xmm1 + x1 * A.load(i,j );
2002 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2003 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2004 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
2007 y.store( j , y.load(j ) - xmm1 );
2008 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2009 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2010 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4 );
2013 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2015 SIMDType xmm1, xmm2, xmm3;
2017 for(
size_t i=ii; i<iend; ++i ) {
2018 const SIMDType x1(
set( x[i] ) );
2019 xmm1 = xmm1 + x1 * A.load(i,j );
2020 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2021 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2024 y.store( j , y.load(j ) - xmm1 );
2025 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2 );
2026 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3 );
2029 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
2031 SIMDType xmm1, xmm2;
2033 for(
size_t i=ii; i<iend; ++i ) {
2034 const SIMDType x1(
set( x[i] ) );
2035 xmm1 = xmm1 + x1 * A.load(i,j );
2036 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
2039 y.store( j , y.load(j ) - xmm1 );
2040 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2 );
2043 for( ; j<jpos; j+=SIMDSIZE )
2047 for(
size_t i=ii; i<iend; ++i ) {
2048 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
2051 y.store( j, y.load(j) - xmm1 );
2054 for( ; remainder && j<jend; ++j )
2058 for(
size_t i=ii; i<iend; ++i ) {
2059 value += x[i] * A(i,j);
// Fallback subtraction-assignment kernel, selected (via SFINAE/DisableIf)
// whenever no suitable BLAS routine is available for the given types.
// Simply delegates to the blocked, vectorized "large" default kernel.
// NOTE(review): extraction dropped some template parameters/braces here —
// the original declares VT2 and MT1 as well; confirm against upstream.
2084 template<
typename VT1
2087 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1> >
2088 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Delegate: y -= x * A via the large default kernel.
2090 selectLargeSubAssignKernel( y, x, A );
// BLAS-based subtraction-assignment kernel (y -= x * A), compiled in only
// when BLAS mode and BLAS matrix/vector multiplication are enabled.
2096 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
2110 template<
typename VT1
2113 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1> >
2114 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2116 typedef ElementType_<VT1> ET;
// Triangular matrices use trmv (which works in place on a copy of x);
// the product is then subtracted from y in a second step.
2118 if( IsTriangular<MT1>::value ) {
2119 ResultType_<VT1> tmp(
serial( x ) );
2120 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2121 subAssign( y, tmp );
// General case: gemv with alpha = -1, beta = 1 computes y = y - x*A directly.
2124 gemv( y, x, A, ET(-1), ET(1) );
// Multiplication-assignment of the vector/matrix product to a dense vector
// (lhs *= rhs). The product is first evaluated serially into a temporary,
// since an element-wise multiplication cannot be fused with the kernel.
2148 template<
typename VT1 >
2149 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2159 const ResultType tmp(
serial( rhs ) );
2160 multAssign( ~lhs, tmp );
// Division-assignment of the vector/matrix product to a dense vector
// (lhs /= rhs). Mirrors multAssign: evaluate serially into a temporary,
// then perform the element-wise division on the unwrapped target.
2182 template<
typename VT1 >
2183 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const TDVecDMatMultExpr& rhs )
2193 const ResultType tmp(
serial( rhs ) );
2194 divAssign( ~lhs, tmp );
// SMP (parallel) assignment, enabled only when at least one operand requires
// evaluation (see UseSMPAssign). Visible here: the early-outs for an empty
// matrix — body of this friend function is truncated in this extraction.
2218 template<
typename VT1 >
2219 friend inline EnableIf_< UseSMPAssign<VT1> >
// A matrix with zero rows yields a zero result; zero columns yields an
// empty target — both cases skip the kernel entirely.
2226 if( rhs.mat_.rows() == 0UL ) {
2230 else if( rhs.mat_.columns() == 0UL ) {
// SMP assignment variant (presumably targeting a sparse vector — TODO
// confirm against upstream; the signature line is cut off here). The
// expression is evaluated into a temporary before being assigned.
2262 template<
typename VT1 >
2263 friend inline EnableIf_< UseSMPAssign<VT1> >
2274 const ResultType tmp( rhs );
// SMP addition-assignment (presumably; the name line is missing from this
// extraction — confirm upstream). Empty matrix means nothing to add.
2295 template<
typename VT1 >
2296 friend inline EnableIf_< UseSMPAssign<VT1> >
2303 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// SMP subtraction-assignment (presumably; name line missing here — confirm
// upstream). Empty matrix means nothing to subtract.
2339 template<
typename VT1 >
2340 friend inline EnableIf_< UseSMPAssign<VT1> >
2347 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// SMP multiplication-assignment (presumably; name line missing — confirm
// upstream). Evaluates the product into a temporary first.
2383 template<
typename VT1 >
2384 friend inline EnableIf_< UseSMPAssign<VT1> >
2395 const ResultType tmp( rhs );
// SMP division-assignment (presumably; name line missing — confirm
// upstream). Evaluates the product into a temporary first.
2420 template<
typename VT1 >
2421 friend inline EnableIf_< UseSMPAssign<VT1> >
2432 const ResultType tmp( rhs );
// Specialization of DVecScalarMultExpr for the product of a (transpose
// dense vector * dense matrix) expression with a scalar: s * (x^T * A).
// Deriving privately from VecScalarMultExpr / Computation tags the class
// for Blaze's expression-template dispatch.
2471 template<
typename VT
2475 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
2476 ,
private VecScalarMultExpr
2477 ,
private Computation
// Shorthands for the wrapped vector/matrix product and its operand traits.
2481 typedef TDVecDMatMultExpr<VT,MT> VMM;
2482 typedef ResultType_<VMM> RES;
2483 typedef ResultType_<VT>
VRT;
2484 typedef ResultType_<MT>
MRT;
2485 typedef ElementType_<VRT>
VET;
2486 typedef ElementType_<MRT>
MET;
2487 typedef CompositeType_<VT>
VCT;
2488 typedef CompositeType_<MT>
MCT;
// Compile-time flags: whether each operand must be evaluated into a
// temporary before the kernels can run on it.
2493 enum :
bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
2498 enum :
bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2499 IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// SMP assignment is only worthwhile when a temporary is created anyway.
2507 template<
typename T1 >
2508 struct UseSMPAssign {
2509 enum :
bool { value = ( evaluateVector || evaluateMatrix ) };
// Conditions under which a BLAS kernel may be used: raw data access,
// SIMD-enabled operands, BLAS-compatible and matching element types; the
// extra T4 clause rules out a complex scalar with builtin elements.
2517 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2518 struct UseBlasKernel {
2520 HasMutableDataAccess<T1>::value &&
2521 HasConstDataAccess<T2>::value &&
2522 HasConstDataAccess<T3>::value &&
2523 !IsDiagonal<T3>::value &&
2524 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2525 IsBLASCompatible< ElementType_<T1> >::value &&
2526 IsBLASCompatible< ElementType_<T2> >::value &&
2527 IsBLASCompatible< ElementType_<T3> >::value &&
2528 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2529 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2530 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Conditions for the hand-vectorized (SIMD intrinsic) default kernels.
2539 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2540 struct UseVectorizedDefaultKernel {
2542 !IsDiagonal<T3>::value &&
2543 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2544 AreSIMDCombinable< ElementType_<T1>
2548 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2549 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Public type aliases required by the DenseVector expression interface.
2555 typedef DVecScalarMultExpr<VMM,ST,true>
This;
2559 typedef SIMDTrait_<ElementType>
SIMDType;
2564 typedef const TDVecDMatMultExpr<VT,MT>
LeftOperand;
// LT/RT: operand composite types, forced to evaluated result types when a
// temporary is required.
2570 typedef IfTrue_< evaluateVector, const VRT, VCT >
LT;
2573 typedef IfTrue_< evaluateMatrix, const MRT, MCT >
RT;
// The whole expression is SIMD-capable only if all three element types
// combine in SIMD registers and support SIMD add/mult.
2578 enum :
bool { simdEnabled = !IsDiagonal<MT>::value &&
2579 VT::simdEnabled && MT::simdEnabled &&
2580 AreSIMDCombinable<VET,MET,ST>::value &&
2581 HasSIMDAdd<VET,MET>::value &&
2582 HasSIMDMult<VET,MET>::value };
// SMP assignment requires that no operand needs a temporary.
2585 enum :
bool { smpAssignable = !evaluateVector && VT::smpAssignable &&
2586 !evaluateMatrix && MT::smpAssignable };
// Constructor: wraps the vector/matrix product expression together with the
// scalar factor (body truncated in this extraction).
2600 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Unchecked element access: scales the corresponding element of the wrapped
// vector/matrix product by the stored scalar.
2612 inline ReturnType
operator[](
size_t index )
const {
2614 return vector_[index] * scalar_;
// Checked element access: validates the index against the expression size
// (the throw statement for the out-of-range case is cut off in this
// extraction), then forwards to operator[].
2625 inline ReturnType
at(
size_t index )
const {
2626 if( index >= vector_.size() ) {
2629 return (*
this)[index];
// Returns the size of the expression, i.e. the size of the wrapped
// vector/matrix product (= number of matrix columns).
2638 inline size_t size()
const {
2639 return vector_.size();
// Returns whether the expression can alias with the given address; the
// scalar cannot alias, so the query is forwarded to the wrapped product.
2669 template<
typename T >
2670 inline bool canAlias(
const T* alias )
const {
2671 return vector_.canAlias( alias );
// Returns whether the expression is currently aliased with the given
// address; forwarded to the wrapped vector/matrix product.
2681 template<
typename T >
2682 inline bool isAliased(
const T* alias )
const {
2683 return vector_.isAliased( alias );
// isAligned(): alignment of the expression equals that of the wrapped
// product (surrounding signature lines truncated in this extraction).
2693 return vector_.isAligned();
// canSMPAssign(): parallel assignment pays off only for small-ish kernels
// (below the serial threshold) on sufficiently large targets.
2703 RightOperand_<VMM> A( vector_.rightOperand() );
2705 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2706 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) ) &&
2707 (
size() > SMP_TDVECDMATMULT_THRESHOLD );
// Data members: the wrapped product expression and the scalar factor.
2713 LeftOperand vector_;
2714 RightOperand scalar_;
// Assignment of the scaled product to a dense vector: unpack the operands
// of the inner product, handle the empty-matrix early-outs (rows()==0
// yields a zero result, columns()==0 an empty target), then dispatch to
// the kernel selector with the scalar.
2729 template<
typename VT1 >
2730 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2736 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
2737 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
2739 if( right.rows() == 0UL ) {
2743 else if( right.columns() == 0UL ) {
// x and A are the (possibly evaluated) operands; ~lhs unwraps the target.
2755 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatch for assignment: small problems (diagonal matrix, fused
// computations, or below the size threshold) go to the small kernel,
// everything else to the BLAS kernel (which itself falls back to the
// blocked large kernel when BLAS is unusable).
2770 template<
typename VT1
2774 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2776 if( ( IsDiagonal<MT1>::value ) ||
2777 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2778 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
2779 selectSmallAssignKernel( y, x, A, scalar );
2781 selectBlasAssignKernel( y, x, A, scalar );
// Scalar (non-SIMD) default kernel for y = (x*A)*scalar. Exploits the
// matrix's triangular/diagonal structure to skip provably-zero regions.
// NOTE(review): several lines of this function (braces, some assignments)
// are missing from this extraction; comments below describe only what is
// visible.
2799 template<
typename VT1
2803 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2805 const size_t M( A.rows() );
2806 const size_t N( A.columns() );
2808 if( IsStrictlyUpper<MT1>::value ) {
// Initialize y from row 0 of A (skipped for lower matrices, whose row 0
// only touches y[0]).
2812 if( !IsLower<MT1>::value )
2814 for(
size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<N; ++j ) {
2815 y[j] = x[0UL] * A(0UL,j);
// Accumulate the remaining rows; diagonal matrices reduce to one product
// per element.
2819 for(
size_t i=( IsLower<MT1>::value && !IsStrictlyLower<MT1>::value ? 0UL : 1UL ); i<M; ++i )
2821 if( IsDiagonal<MT1>::value )
2823 y[i] = x[i] * A(i,i) * scalar;
// Column window [jbegin, jend) restricted to the structurally non-zero
// part of row i; the inner loop is unrolled by two.
2827 const size_t jbegin( ( IsUpper<MT1>::value )
2828 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2830 const size_t jend( ( IsLower<MT1>::value )
2831 ?( IsStrictlyLower<MT1>::value ? i-1UL : i )
2835 const size_t jnum( jend - jbegin );
2836 const size_t jpos( jbegin + ( jnum &
size_t(-2) ) )
2838 for(
size_t j=jbegin; j<jpos; j+=2UL ) {
2839 y[j ] += x[i] * A(i,j );
2840 y[j+1UL] += x[i] * A(i,j+1UL);
2843 y[jpos] += x[i] * A(i,jpos);
// For lower matrices, row i is the first contribution to y[jend].
2845 if( IsLower<MT1>::value ) {
2846 y[jend] = x[i] * A(i,jend);
2851 if( IsStrictlyLower<MT1>::value ) {
// Final pass: apply the scalar factor to the accumulated elements
// (skipped for diagonal, which scaled inline above).
2855 if( !IsDiagonal<MT1>::value )
2857 const size_t iend( IsStrictlyLower<MT1>::value ? N-1UL : N );
2858 for(
size_t j=( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ); j<iend; ++j ) {
// Small-kernel entry for types that cannot use the vectorized path
// (DisableIf on UseVectorizedDefaultKernel): falls back to the scalar
// default kernel.
2879 template<
typename VT1
2883 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
2884 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2886 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized small kernel for y = (x*A)*scalar. Processes columns in SIMD
// panels of width 8/4/3/2/1 SIMD vectors, accumulating the dot product of
// x with each column strip in registers, then storing the result scaled by
// `factor`. For triangular A, each panel restricts the row range
// [ibegin, iend) to the structurally non-zero part.
2904 template<
typename VT1
2908 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
2909 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2911 const size_t M( A.rows() );
2912 const size_t N( A.columns() );
// Unpadded operands need a scalar remainder loop past the last full SIMD
// block; jpos is N rounded down to a SIMD-size multiple in that case.
2914 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
2916 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
2919 const SIMDType factor(
set( scalar ) );
// --- panel of 8 SIMD vectors ---
2923 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
2925 const size_t ibegin( ( IsLower<MT1>::value )
2926 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2928 const size_t iend( ( IsUpper<MT1>::value )
2929 ?(
min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2933 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2935 for(
size_t i=ibegin; i<iend; ++i ) {
2936 const SIMDType x1(
set( x[i] ) );
2937 xmm1 = xmm1 + x1 * A.load(i,j );
2938 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2939 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2940 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
2941 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
2942 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
2943 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
2944 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
2947 y.store( j , xmm1*factor );
2948 y.store( j+SIMDSIZE , xmm2*factor );
2949 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2950 y.store( j+SIMDSIZE*3UL, xmm4*factor );
2951 y.store( j+SIMDSIZE*4UL, xmm5*factor );
2952 y.store( j+SIMDSIZE*5UL, xmm6*factor );
2953 y.store( j+SIMDSIZE*6UL, xmm7*factor );
2954 y.store( j+SIMDSIZE*7UL, xmm8*factor );
// --- panel of 4 SIMD vectors ---
2957 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
2959 const size_t ibegin( ( IsLower<MT1>::value )
2960 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2962 const size_t iend( ( IsUpper<MT1>::value )
2963 ?(
min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2967 SIMDType xmm1, xmm2, xmm3, xmm4;
2969 for(
size_t i=ibegin; i<iend; ++i ) {
2970 const SIMDType x1(
set( x[i] ) );
2971 xmm1 = xmm1 + x1 * A.load(i,j );
2972 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2973 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
2974 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
2977 y.store( j , xmm1*factor );
2978 y.store( j+SIMDSIZE , xmm2*factor );
2979 y.store( j+SIMDSIZE*2UL, xmm3*factor );
2980 y.store( j+SIMDSIZE*3UL, xmm4*factor );
// --- panel of 3 SIMD vectors ---
2983 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
2985 const size_t ibegin( ( IsLower<MT1>::value )
2986 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2988 const size_t iend( ( IsUpper<MT1>::value )
2989 ?(
min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
2993 SIMDType xmm1, xmm2, xmm3;
2995 for(
size_t i=ibegin; i<iend; ++i ) {
2996 const SIMDType x1(
set( x[i] ) );
2997 xmm1 = xmm1 + x1 * A.load(i,j );
2998 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
2999 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3002 y.store( j , xmm1*factor );
3003 y.store( j+SIMDSIZE , xmm2*factor );
3004 y.store( j+SIMDSIZE*2UL, xmm3*factor );
// --- panel of 2 SIMD vectors ---
3007 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3009 const size_t ibegin( ( IsLower<MT1>::value )
3010 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3012 const size_t iend( ( IsUpper<MT1>::value )
3013 ?(
min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3017 SIMDType xmm1, xmm2;
3019 for(
size_t i=ibegin; i<iend; ++i ) {
3020 const SIMDType x1(
set( x[i] ) );
3021 xmm1 = xmm1 + x1 * A.load(i,j );
3022 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3025 y.store( j , xmm1*factor );
3026 y.store( j+SIMDSIZE, xmm2*factor );
// --- single SIMD vector ---
3029 for( ; j<jpos; j+=SIMDSIZE )
3031 const size_t ibegin( ( IsLower<MT1>::value )
3032 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3034 const size_t iend( ( IsUpper<MT1>::value )
3035 ?(
min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3041 for(
size_t i=ibegin; i<iend; ++i ) {
3042 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
3045 y.store( j, xmm1*factor );
// --- scalar remainder (only for unpadded operands) ---
3048 for( ; remainder && j<N; ++j )
3050 const size_t ibegin( ( IsLower<MT1>::value )
3051 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3053 const size_t iend( ( IsUpper<MT1>::value )
3054 ?(
min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3060 for(
size_t i=ibegin; i<iend; ++i ) {
3061 value += x[i] * A(i,j);
3064 y[j] = value * scalar;
// Large-kernel entry for types without SIMD support: falls back to the
// scalar default assignment kernel.
3083 template<
typename VT1
3087 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3088 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3090 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized large (cache-blocked) kernel for y = (x*A)*scalar. The matrix
// is traversed in jblock x iblock tiles for cache locality; within a tile,
// columns are processed in SIMD panels of 8/4/3/2/1 vectors and partial
// results are accumulated into y (hence the read-modify-write stores;
// y is presumably zero-initialized by code preceding this view — confirm
// upstream).
3108 template<
typename VT1
3112 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3113 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3115 const size_t M( A.rows() );
3116 const size_t N( A.columns() );
3118 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
// Block sizes: ~32KiB of columns per tile; fewer rows per tile when the
// column dimension is large.
3120 const size_t jblock( 32768UL /
sizeof( ElementType ) );
3121 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3123 const SIMDType factor(
set( scalar ) );
3129 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3130 for(
size_t ii=0UL; ii<M; ii+=iblock )
3132 const size_t iend(
min( ii+iblock, M ) );
3133 const size_t jtmp(
min( jj+jblock, N ) );
// Column range of the tile, clipped to the non-zero band of a
// lower/upper-triangular matrix; j starts SIMD-aligned.
3134 const size_t jend( ( IsLower<MT1>::value )
3135 ?(
min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3138 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3139 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3141 size_t j( ( IsUpper<MT1>::value )
3142 ?(
max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) &
size_t(-SIMDSIZE) ) )
// --- panel of 8 SIMD vectors ---
3145 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t i=ii; i<iend; ++i ) {
3150 const SIMDType x1(
set( x[i] ) );
3151 xmm1 = xmm1 + x1 * A.load(i,j );
3152 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3153 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3154 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3155 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
3156 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
3157 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
3158 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
3161 y.store( j , y.load(j ) + xmm1*factor );
3162 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3163 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3164 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3165 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3166 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3167 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3168 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
// --- panel of 4 SIMD vectors ---
3171 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t i=ii; i<iend; ++i ) {
3176 const SIMDType x1(
set( x[i] ) );
3177 xmm1 = xmm1 + x1 * A.load(i,j );
3178 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3179 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3180 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3183 y.store( j , y.load(j ) + xmm1*factor );
3184 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3185 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3186 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
// --- panel of 3 SIMD vectors ---
3189 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t i=ii; i<iend; ++i ) {
3194 const SIMDType x1(
set( x[i] ) );
3195 xmm1 = xmm1 + x1 * A.load(i,j );
3196 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3197 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3200 y.store( j , y.load(j ) + xmm1*factor );
3201 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3202 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
// --- panel of 2 SIMD vectors ---
3205 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3207 SIMDType xmm1, xmm2;
3209 for(
size_t i=ii; i<iend; ++i ) {
3210 const SIMDType x1(
set( x[i] ) );
3211 xmm1 = xmm1 + x1 * A.load(i,j );
3212 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3215 y.store( j , y.load(j ) + xmm1*factor );
3216 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
// --- single SIMD vector ---
3219 for( ; j<jpos; j+=SIMDSIZE )
3223 for(
size_t i=ii; i<iend; ++i ) {
3224 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
3227 y.store( j, y.load(j) + xmm1*factor );
// --- scalar remainder (only for unpadded operands) ---
3230 for( ; remainder && j<jend; ++j )
3234 for(
size_t i=ii; i<iend; ++i ) {
3235 value += x[i] * A(i,j);
3238 y[j] += value * scalar;
// Fallback "BLAS" assignment kernel, selected when no suitable BLAS
// routine exists for the given types: delegates to the blocked,
// vectorized large kernel.
3258 template<
typename VT1
3262 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3263 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3265 selectLargeAssignKernel( y, x, A, scalar );
// BLAS-based assignment kernel (y = scalar * x * A), compiled in only when
// BLAS mode and BLAS matrix/vector multiplication are enabled.
3270 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3284 template<
typename VT1
3288 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3289 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3291 typedef ElementType_<VT1> ET;
// Triangular matrices: pre-scale x into y, then multiply in place via trmv.
3293 if( IsTriangular<MT1>::value ) {
3294 assign( y, scalar * x );
3295 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// General case: gemv with alpha = scalar, beta = 0.
3298 gemv( y, x, A, ET(scalar), ET(0) );
// Assignment of the scaled product to a sparse vector: evaluate the dense
// result serially into a temporary, then assign that to the sparse target.
3316 template<
typename VT1 >
3317 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3327 const ResultType tmp(
serial( rhs ) );
3328 assign( ~lhs, tmp );
// Addition-assignment of the scaled product to a dense vector: unpack the
// inner product's operands, return early for an empty matrix (nothing to
// add), then dispatch to the add-assign kernel selector with the scalar.
3344 template<
typename VT1 >
3345 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3351 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3352 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3354 if( right.rows() == 0UL || right.columns() == 0UL ) {
3366 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatch for addition-assignment: small problems go to the small
// kernel, large problems to the BLAS kernel (with large-kernel fallback).
3381 template<
typename VT1
3385 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3387 if( ( IsDiagonal<MT1>::value ) ||
3388 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3389 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3390 selectSmallAddAssignKernel( y, x, A, scalar );
3392 selectBlasAddAssignKernel( y, x, A, scalar );
// Default addition-assignment kernel: re-expresses the operation as an
// expression-template add-assign, letting Blaze pick the element-wise path.
3410 template<
typename VT1
3414 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3416 y.addAssign( x * A * scalar );
// Small add-assign entry for types without SIMD support: falls back to the
// default (expression-based) kernel.
3434 template<
typename VT1
3438 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3439 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3441 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized small kernel for y += (x*A)*scalar. Same SIMD panel structure
// as the assignment kernel (widths 8/4/3/2/1), but stores accumulate into
// the existing contents of y. For triangular A, each panel restricts the
// row range [ibegin, iend) to the structurally non-zero part.
3460 template<
typename VT1
3464 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3465 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3467 const size_t M( A.rows() );
3468 const size_t N( A.columns() );
// Unpadded operands need a scalar remainder loop past jpos.
3470 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
3472 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
3475 const SIMDType factor(
set( scalar ) );
// --- panel of 8 SIMD vectors ---
3479 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3481 const size_t ibegin( ( IsLower<MT1>::value )
3482 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3484 const size_t iend( ( IsUpper<MT1>::value )
3485 ?(
min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3489 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3491 for(
size_t i=ibegin; i<iend; ++i ) {
3492 const SIMDType x1(
set( x[i] ) );
3493 xmm1 = xmm1 + x1 * A.load(i,j );
3494 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3495 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3496 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3497 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
3498 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
3499 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
3500 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
3503 y.store( j , y.load(j ) + xmm1*factor );
3504 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3505 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3506 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3507 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3508 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3509 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3510 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
// --- panel of 4 SIMD vectors ---
3513 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3515 const size_t ibegin( ( IsLower<MT1>::value )
3516 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3518 const size_t iend( ( IsUpper<MT1>::value )
3519 ?(
min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3523 SIMDType xmm1, xmm2, xmm3, xmm4;
3525 for(
size_t i=ibegin; i<iend; ++i ) {
3526 const SIMDType x1(
set( x[i] ) );
3527 xmm1 = xmm1 + x1 * A.load(i,j );
3528 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3529 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3530 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3533 y.store( j , y.load(j ) + xmm1*factor );
3534 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3535 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3536 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
// --- panel of 3 SIMD vectors ---
3539 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3541 const size_t ibegin( ( IsLower<MT1>::value )
3542 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3544 const size_t iend( ( IsUpper<MT1>::value )
3545 ?(
min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3549 SIMDType xmm1, xmm2, xmm3;
3551 for(
size_t i=ibegin; i<iend; ++i ) {
3552 const SIMDType x1(
set( x[i] ) );
3553 xmm1 = xmm1 + x1 * A.load(i,j );
3554 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3555 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3558 y.store( j , y.load(j ) + xmm1*factor );
3559 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3560 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
// --- panel of 2 SIMD vectors ---
3563 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3565 const size_t ibegin( ( IsLower<MT1>::value )
3566 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3568 const size_t iend( ( IsUpper<MT1>::value )
3569 ?(
min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3573 SIMDType xmm1, xmm2;
3575 for(
size_t i=ibegin; i<iend; ++i ) {
3576 const SIMDType x1(
set( x[i] ) );
3577 xmm1 = xmm1 + x1 * A.load(i,j );
3578 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3581 y.store( j , y.load(j ) + xmm1*factor );
3582 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
// --- single SIMD vector ---
3585 for( ; j<jpos; j+=SIMDSIZE )
3587 const size_t ibegin( ( IsLower<MT1>::value )
3588 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3590 const size_t iend( ( IsUpper<MT1>::value )
3591 ?(
min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3597 for(
size_t i=ibegin; i<iend; ++i ) {
3598 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
3601 y.store( j, y.load(j) + xmm1*factor );
// --- scalar remainder (only for unpadded operands) ---
3604 for( ; remainder && j<N; ++j )
3606 const size_t ibegin( ( IsLower<MT1>::value )
3607 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
3609 const size_t iend( ( IsUpper<MT1>::value )
3610 ?(
min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
3616 for(
size_t i=ibegin; i<iend; ++i ) {
3617 value += x[i] * A(i,j);
3620 y[j] += value * scalar;
// Large add-assign entry for types without SIMD support: falls back to the
// default (expression-based) kernel.
3639 template<
typename VT1
3643 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3644 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3646 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized large (cache-blocked) kernel for y += (x*A)*scalar. Identical
// tiling and SIMD panel structure to the large assignment kernel; every
// store accumulates into y.
3665 template<
typename VT1
3669 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3670 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3672 const size_t M( A.rows() );
3673 const size_t N( A.columns() );
3675 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
// Tile sizes: ~32KiB of columns; fewer rows per tile for wide matrices.
3677 const size_t jblock( 32768UL /
sizeof( ElementType ) );
3678 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
3680 const SIMDType factor(
set( scalar ) );
3684 for(
size_t jj=0U; jj<N; jj+=jblock ) {
3685 for(
size_t ii=0UL; ii<M; ii+=iblock )
3687 const size_t iend(
min( ii+iblock, M ) );
3688 const size_t jtmp(
min( jj+jblock, N ) );
// Clip the tile's columns to the triangular band; j starts SIMD-aligned.
3689 const size_t jend( ( IsLower<MT1>::value )
3690 ?(
min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
3693 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
3694 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
3696 size_t j( ( IsUpper<MT1>::value )
3697 ?(
max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) &
size_t(-SIMDSIZE) ) )
// --- panel of 8 SIMD vectors ---
3700 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
3702 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3704 for(
size_t i=ii; i<iend; ++i ) {
3705 const SIMDType x1(
set( x[i] ) );
3706 xmm1 = xmm1 + x1 * A.load(i,j );
3707 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3708 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3709 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3710 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
3711 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
3712 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
3713 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
3716 y.store( j , y.load(j ) + xmm1*factor );
3717 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3718 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3719 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
3720 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) + xmm5*factor );
3721 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) + xmm6*factor );
3722 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) + xmm7*factor );
3723 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) + xmm8*factor );
// --- panel of 4 SIMD vectors ---
3726 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
3728 SIMDType xmm1, xmm2, xmm3, xmm4;
3730 for(
size_t i=ii; i<iend; ++i ) {
3731 const SIMDType x1(
set( x[i] ) );
3732 xmm1 = xmm1 + x1 * A.load(i,j );
3733 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3734 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3735 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
3738 y.store( j , y.load(j ) + xmm1*factor );
3739 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3740 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
3741 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) + xmm4*factor );
// --- panel of 3 SIMD vectors ---
3744 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
3746 SIMDType xmm1, xmm2, xmm3;
3748 for(
size_t i=ii; i<iend; ++i ) {
3749 const SIMDType x1(
set( x[i] ) );
3750 xmm1 = xmm1 + x1 * A.load(i,j );
3751 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
3752 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
3755 y.store( j , y.load(j ) + xmm1*factor );
3756 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) + xmm2*factor );
3757 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) + xmm3*factor );
// --- panel of 2 SIMD vectors ---
3760 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
3762 SIMDType xmm1, xmm2;
3764 for(
size_t i=ii; i<iend; ++i ) {
3765 const SIMDType x1(
set( x[i] ) );
3766 xmm1 = xmm1 + x1 * A.load(i,j );
3767 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
3770 y.store( j , y.load(j ) + xmm1*factor );
3771 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) + xmm2*factor );
// --- single SIMD vector ---
3774 for( ; j<jpos; j+=SIMDSIZE )
3778 for(
size_t i=ii; i<iend; ++i ) {
3779 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
3782 y.store( j, y.load(j) + xmm1*factor );
// --- scalar remainder (only for unpadded operands) ---
3785 for( ; remainder && j<jend; ++j )
3789 for(
size_t i=ii; i<iend; ++i ) {
3790 value += x[i] * A(i,j);
3793 y[j] += value * scalar;
// Fallback "BLAS" add-assign kernel, selected when no suitable BLAS routine
// exists for the given types: delegates to the blocked large kernel.
3814 template<
typename VT1
3818 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3819 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3821 selectLargeAddAssignKernel( y, x, A, scalar );
// BLAS-based add-assign kernel (y += scalar * x * A), compiled in only when
// BLAS mode and BLAS matrix/vector multiplication are enabled.
3826 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
3840 template<
typename VT1
3844 static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
3845 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3847 typedef ElementType_<VT1> ET;
// Triangular matrices: trmv on a pre-scaled copy of x, then add to y.
3849 if( IsTriangular<MT1>::value ) {
3850 ResultType_<VT1> tmp(
serial( scalar * x ) );
3851 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3852 addAssign( y, tmp );
// General case: gemv with alpha = scalar, beta = 1 accumulates into y.
3855 gemv( y, x, A, ET(scalar), ET(1) );
// Subtraction-assignment of the scaled product from a dense vector: unpack
// the inner product's operands, return early for an empty matrix (nothing
// to subtract), then dispatch to the sub-assign kernel selector.
3877 template<
typename VT1 >
3878 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
3884 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
3885 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
3887 if( right.rows() == 0UL || right.columns() == 0UL ) {
3899 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatch for subtraction-assignment: small problems go to the
// small kernel, large problems to the BLAS kernel (with fallback).
3914 template<
typename VT1
3918 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3920 if( ( IsDiagonal<MT1>::value ) ||
3921 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3922 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
3923 selectSmallSubAssignKernel( y, x, A, scalar );
3925 selectBlasSubAssignKernel( y, x, A, scalar );
// Default subtraction-assignment kernel: re-expresses the operation as an
// expression-template sub-assign, letting Blaze pick the element-wise path.
3943 template<
typename VT1
3947 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3949 y.subAssign( x * A * scalar );
// Small sub-assign entry for types without SIMD support: falls back to the
// default (expression-based) kernel.
3967 template<
typename VT1
3971 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3972 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3974 selectDefaultSubAssignKernel( y, x, A, scalar );
// Vectorized small kernel for y -= (x*A)*scalar. Same SIMD panel structure
// as the small add-assign kernel (widths 8/4/3/2/1), but the scaled
// accumulators are subtracted from the existing contents of y.
3993 template<
typename VT1
3997 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
3998 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4000 const size_t M( A.rows() );
4001 const size_t N( A.columns() );
// Unpadded operands need a scalar remainder loop past jpos.
4003 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4005 const size_t jpos( remainder ? ( N &
size_t(-SIMDSIZE) ) : N );
4008 const SIMDType factor(
set( scalar ) );
// --- panel of 8 SIMD vectors ---
4012 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4014 const size_t ibegin( ( IsLower<MT1>::value )
4015 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4017 const size_t iend( ( IsUpper<MT1>::value )
4018 ?(
min( j+SIMDSIZE*8UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4022 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4024 for(
size_t i=ibegin; i<iend; ++i ) {
4025 const SIMDType x1(
set( x[i] ) );
4026 xmm1 = xmm1 + x1 * A.load(i,j );
4027 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4028 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4029 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4030 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
4031 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
4032 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
4033 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
4036 y.store( j , y.load(j ) - xmm1*factor );
4037 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4038 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4039 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4040 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4041 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4042 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4043 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
// --- panel of 4 SIMD vectors ---
4046 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4048 const size_t ibegin( ( IsLower<MT1>::value )
4049 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4051 const size_t iend( ( IsUpper<MT1>::value )
4052 ?(
min( j+SIMDSIZE*4UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4056 SIMDType xmm1, xmm2, xmm3, xmm4;
4058 for(
size_t i=ibegin; i<iend; ++i ) {
4059 const SIMDType x1(
set( x[i] ) );
4060 xmm1 = xmm1 + x1 * A.load(i,j );
4061 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4062 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4063 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4066 y.store( j , y.load(j ) - xmm1*factor );
4067 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4068 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4069 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
// --- panel of 3 SIMD vectors ---
4072 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4074 const size_t ibegin( ( IsLower<MT1>::value )
4075 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4077 const size_t iend( ( IsUpper<MT1>::value )
4078 ?(
min( j+SIMDSIZE*3UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4082 SIMDType xmm1, xmm2, xmm3;
4084 for(
size_t i=ibegin; i<iend; ++i ) {
4085 const SIMDType x1(
set( x[i] ) );
4086 xmm1 = xmm1 + x1 * A.load(i,j );
4087 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4088 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4091 y.store( j , y.load(j ) - xmm1*factor );
4092 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4093 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
// --- panel of 2 SIMD vectors ---
4096 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4098 const size_t ibegin( ( IsLower<MT1>::value )
4099 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4101 const size_t iend( ( IsUpper<MT1>::value )
4102 ?(
min( j+SIMDSIZE*2UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4106 SIMDType xmm1, xmm2;
4108 for(
size_t i=ibegin; i<iend; ++i ) {
4109 const SIMDType x1(
set( x[i] ) );
4110 xmm1 = xmm1 + x1 * A.load(i,j );
4111 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
4114 y.store( j , y.load(j ) - xmm1*factor );
4115 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
// --- single SIMD vector ---
4118 for( ; j<jpos; j+=SIMDSIZE )
4120 const size_t ibegin( ( IsLower<MT1>::value )
4121 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4123 const size_t iend( ( IsUpper<MT1>::value )
4124 ?(
min( j+SIMDSIZE, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4130 for(
size_t i=ibegin; i<iend; ++i ) {
4131 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
4134 y.store( j, y.load(j) - xmm1*factor );
// --- scalar remainder (only for unpadded operands) ---
4137 for( ; remainder && j<N; ++j )
4139 const size_t ibegin( ( IsLower<MT1>::value )
4140 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
4142 const size_t iend( ( IsUpper<MT1>::value )
4143 ?(
min( j+1UL, M ) - ( IsStrictlyUpper<MT1>::value ? 1UL : 0UL ) )
4149 for(
size_t i=ibegin; i<iend; ++i ) {
4150 value += x[i] * A(i,j);
4153 y[j] -= value * scalar;
// Large sub-assign entry for types without SIMD support: falls back to the
// default (expression-based) kernel.
4172 template<
typename VT1
4176 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4177 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4179 selectDefaultSubAssignKernel( y, x, A, scalar );
4198 template<
typename VT1
4202 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >
4203 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4205 const size_t M( A.rows() );
4206 const size_t N( A.columns() );
4208 const bool remainder( !IsPadded<VT1>::value || !IsPadded<MT1>::value );
4210 const size_t jblock( 32768UL /
sizeof( ElementType ) );
4211 const size_t iblock( ( N < jblock )?( 8UL ):( 4UL ) );
4213 const SIMDType factor(
set( scalar ) );
4217 for(
size_t jj=0U; jj<N; jj+=jblock ) {
4218 for(
size_t ii=0UL; ii<M; ii+=iblock )
4220 const size_t iend(
min( ii+iblock, M ) );
4221 const size_t jtmp(
min( jj+jblock, N ) );
4222 const size_t jend( ( IsLower<MT1>::value )
4223 ?(
min( jtmp, ( IsStrictlyLower<MT1>::value ? iend-1UL : iend ) ) )
4226 const size_t jpos( remainder ? ( jend &
size_t(-SIMDSIZE) ) : jend );
4227 BLAZE_INTERNAL_ASSERT( !remainder || ( jend - ( jend % SIMDSIZE ) ) == jpos,
"Invalid end calculation" );
4229 size_t j( ( IsUpper<MT1>::value )
4230 ?(
max( jj, ( IsStrictlyUpper<MT1>::value ? ii+1UL : ii ) &
size_t(-SIMDSIZE) ) )
4233 for( ; (j+SIMDSIZE*7UL) < jpos; j+=SIMDSIZE*8UL )
4235 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4237 for(
size_t i=ii; i<iend; ++i ) {
4238 const SIMDType x1(
set( x[i] ) );
4239 xmm1 = xmm1 + x1 * A.load(i,j );
4240 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4241 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4242 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4243 xmm5 = xmm5 + x1 * A.load(i,j+SIMDSIZE*4UL);
4244 xmm6 = xmm6 + x1 * A.load(i,j+SIMDSIZE*5UL);
4245 xmm7 = xmm7 + x1 * A.load(i,j+SIMDSIZE*6UL);
4246 xmm8 = xmm8 + x1 * A.load(i,j+SIMDSIZE*7UL);
4249 y.store( j , y.load(j ) - xmm1*factor );
4250 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4251 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4252 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4253 y.store( j+SIMDSIZE*4UL, y.load(j+SIMDSIZE*4UL) - xmm5*factor );
4254 y.store( j+SIMDSIZE*5UL, y.load(j+SIMDSIZE*5UL) - xmm6*factor );
4255 y.store( j+SIMDSIZE*6UL, y.load(j+SIMDSIZE*6UL) - xmm7*factor );
4256 y.store( j+SIMDSIZE*7UL, y.load(j+SIMDSIZE*7UL) - xmm8*factor );
4259 for( ; (j+SIMDSIZE*3UL) < jpos; j+=SIMDSIZE*4UL )
4261 SIMDType xmm1, xmm2, xmm3, xmm4;
4263 for(
size_t i=ii; i<iend; ++i ) {
4264 const SIMDType x1(
set( x[i] ) );
4265 xmm1 = xmm1 + x1 * A.load(i,j );
4266 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4267 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4268 xmm4 = xmm4 + x1 * A.load(i,j+SIMDSIZE*3UL);
4271 y.store( j , y.load(j ) - xmm1*factor );
4272 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4273 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4274 y.store( j+SIMDSIZE*3UL, y.load(j+SIMDSIZE*3UL) - xmm4*factor );
4277 for( ; (j+SIMDSIZE*2UL) < jpos; j+=SIMDSIZE*3UL )
4279 SIMDType xmm1, xmm2, xmm3;
4281 for(
size_t i=ii; i<iend; ++i ) {
4282 const SIMDType x1(
set( x[i] ) );
4283 xmm1 = xmm1 + x1 * A.load(i,j );
4284 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE );
4285 xmm3 = xmm3 + x1 * A.load(i,j+SIMDSIZE*2UL);
4288 y.store( j , y.load(j ) - xmm1*factor );
4289 y.store( j+SIMDSIZE , y.load(j+SIMDSIZE ) - xmm2*factor );
4290 y.store( j+SIMDSIZE*2UL, y.load(j+SIMDSIZE*2UL) - xmm3*factor );
4293 for( ; (j+SIMDSIZE) < jpos; j+=SIMDSIZE*2UL )
4295 SIMDType xmm1, xmm2;
4297 for(
size_t i=ii; i<iend; ++i ) {
4298 const SIMDType x1(
set( x[i] ) );
4299 xmm1 = xmm1 + x1 * A.load(i,j );
4300 xmm2 = xmm2 + x1 * A.load(i,j+SIMDSIZE);
4303 y.store( j , y.load(j ) - xmm1*factor );
4304 y.store( j+SIMDSIZE, y.load(j+SIMDSIZE) - xmm2*factor );
4307 for( ; j<jpos; j+=SIMDSIZE )
4311 for(
size_t i=ii; i<iend; ++i ) {
4312 xmm1 = xmm1 +
set( x[i] ) * A.load(i,j);
4315 y.store( j, y.load(j) - xmm1*factor );
4318 for( ; remainder && j<jend; ++j )
4322 for(
size_t i=ii; i<iend; ++i ) {
4323 value += x[i] * A(i,j);
4326 y[j] -= value * scalar;
4347 template<
typename VT1
4351 static inline DisableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
4352 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4354 selectLargeSubAssignKernel( y, x, A, scalar );
//**********************************************************************************************
#if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
/*!\brief BLAS-based subtraction assignment of a scaled transpose dense vector-dense matrix
//        multiplication (\f$ \vec{y}^T-=s*\vec{x}^T*A \f$).
// \ingroup dense_vector
//
// \param y The target left-hand side dense vector.
// \param x The left-hand side dense vector operand.
// \param A The right-hand side dense matrix operand.
// \param scalar The scaling factor.
// \return void
//
// Delegates the scaled multiplication to BLAS. For triangular matrices trmv is used on
// a serially evaluated scaled copy of x, which is then subtracted from y; otherwise a
// single gemv call performs y = (-scalar)*(x*A) + 1*y.
//
// NOTE(review): reconstructed from doxygen-extraction residue — the 'else' branch
// wrapping the gemv call, the braces, and the closing '#endif' were dropped by the
// extraction and have been restored; confirm against the full source.
*/
template< typename VT1    // Type of the left-hand side target vector
        , typename VT2    // Type of the left-hand side vector operand
        , typename MT1    // Type of the right-hand side matrix operand
        , typename ST2 >  // Type of the scalar value
static inline EnableIf_< UseBlasKernel<VT1,VT2,MT1,ST2> >
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   typedef ElementType_<VT1>  ET;

   if( IsTriangular<MT1>::value ) {
      // Triangular matrix: scale x into a temporary, multiply in place via trmv,
      // then subtract the temporary from the target vector.
      ResultType_<VT1> tmp( serial( scalar * x ) );
      trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
      subAssign( y, tmp );
   }
   else {
      // General matrix: gemv fuses the scaling and the subtraction (alpha=-scalar, beta=1).
      gemv( y, x, A, ET(-scalar), ET(1) );
   }
}
#endif
//**********************************************************************************************
// NOTE(review): doxygen-extraction residue — the leading numeric tokens (4410, ...)
// are the original file's line numbers fused into the code, and the braces plus the
// internal constraint/assertion lines of this friend function were dropped. Code is
// kept byte-identical below; only comments were added.
//
// Multiplication assignment of the scaled vector-matrix product to a dense vector:
// the expression is forced into a serial temporary first, then multiplied into lhs
// (presumably because no dedicated multAssign kernel exists and the temporary also
// avoids aliasing between lhs and rhs — confirm against the full source).
4410 template<
typename VT1 >
4411 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Serial evaluation of the whole expression into a ResultType temporary ...
4421 const ResultType tmp(
serial( rhs ) );
// ... then element-wise multiplication of the temporary into the target vector.
4422 multAssign( ~lhs, tmp );
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// braces and internal assertion lines were dropped. Code kept byte-identical.
//
// Division assignment of the scaled vector-matrix product to a dense vector: mirrors
// multAssign above — serial evaluation into a temporary, then element-wise division.
4442 template<
typename VT1 >
4443 friend inline void divAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Serial evaluation into a ResultType temporary ...
4453 const ResultType tmp(
serial( rhs ) );
// ... then element-wise division of the target vector by the temporary.
4454 divAssign( ~lhs, tmp );
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// the branch bodies and the main SMP assignment path after the two guards were dropped
// by the extraction. Code kept byte-identical; only comments added.
//
// SMP assignment of the scaled vector-matrix product to a dense vector. Enabled only
// when the expression requires intermediate evaluation (UseSMPAssign).
4476 template<
typename VT1 >
4477 friend inline EnableIf_< UseSMPAssign<VT1> >
4478 smpAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Split the inner vector-matrix multiplication (rhs.vector_) into its operands;
// 'left' is the dense vector, 'right' the dense matrix.
4484 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4485 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
// Degenerate-size guards on the matrix operand; both branch bodies (and the regular
// assignment path that follows) were dropped by the extraction — confirm against the
// full source.
4487 if( right.rows() == 0UL ) {
4491 else if( right.columns() == 0UL ) {
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// braces and the statements following the temporary were dropped. Code kept byte-identical.
//
// SMP assignment of the scaled vector-matrix product to a SPARSE vector: the dense
// result is built in a temporary first (the smpAssign of tmp into lhs was dropped by
// the extraction — confirm against the full source).
4521 template<
typename VT1 >
4522 friend inline EnableIf_< UseSMPAssign<VT1> >
4523 smpAssign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4533 const ResultType tmp( rhs );
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// the guard's body and the SMP addition path were dropped. Code kept byte-identical.
//
// SMP addition assignment of the scaled vector-matrix product to a dense vector.
4552 template<
typename VT1 >
4553 friend inline EnableIf_< UseSMPAssign<VT1> >
4554 smpAddAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the inner vector-matrix multiplication (rhs.vector_).
4560 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4561 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
// Empty-matrix guard: with no rows or columns there is nothing to add (branch body
// dropped by the extraction).
4563 if( right.rows() == 0UL || right.columns() == 0UL ) {
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// the guard's body and the SMP subtraction path were dropped. Code kept byte-identical.
//
// SMP subtraction assignment of the scaled vector-matrix product to a dense vector;
// mirrors smpAddAssign above.
4597 template<
typename VT1 >
4598 friend inline EnableIf_< UseSMPAssign<VT1> >
4599 smpSubAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Operands of the inner vector-matrix multiplication (rhs.vector_).
4605 LeftOperand_<VMM> left ( rhs.vector_.leftOperand() );
4606 RightOperand_<VMM> right( rhs.vector_.rightOperand() );
// Empty-matrix guard: nothing to subtract (branch body dropped by the extraction).
4608 if( right.rows() == 0UL || right.columns() == 0UL ) {
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// braces and the statements after the temporary were dropped. Code kept byte-identical.
//
// SMP multiplication assignment: the expression is evaluated into a temporary first
// (the smpMultAssign of tmp into lhs was dropped by the extraction).
4643 template<
typename VT1 >
4644 friend inline EnableIf_< UseSMPAssign<VT1> >
4645 smpMultAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4655 const ResultType tmp( rhs );
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers;
// braces and the statements after the temporary were dropped. Code kept byte-identical.
//
// SMP division assignment: the expression is evaluated into a temporary first
// (the smpDivAssign of tmp into lhs was dropped by the extraction).
4678 template<
typename VT1 >
4679 friend inline EnableIf_< UseSMPAssign<VT1> >
4680 smpDivAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
4690 const ResultType tmp( rhs );
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers.
// Both operator* overloads below are almost entirely gutted: the second template
// parameter, the full parameter lists, the exception throw, and both bodies were
// dropped by the extraction. Code kept byte-identical; only comments added.
//
// Free operator* for a transpose dense vector times a dense matrix. Disabled when the
// matrix operand is itself a matrix-matrix product (that case is handled by the
// restructuring overload further below); checks that the vector size matches the
// matrix row count (the thrown exception was dropped — presumably
// BLAZE_THROW_INVALID_ARGUMENT, per the Exception.h reference in this file).
4753 template<
typename T1
4755 inline const DisableIf_< IsMatMatMultExpr<T2>, TDVecDMatMultExpr<T1,T2> >
4760 if( (~vec).
size() != (~mat).
rows() ) {
// Restructuring overload: for vec * (A*B) the expression is rebuilt (body dropped;
// presumably as (vec*A)*B — confirm against the full source).
4790 template<
typename T1
4793 inline const EnableIf_< IsMatMatMultExpr<T2>, MultExprTrait_<T1,T2> >
// NOTE(review): doxygen-extraction residue — numeric tokens are original line numbers.
// These are compile-time trait specializations for TDVecDMatMultExpr<VT,MT>; every
// 'struct ...< TDVecDMatMultExpr<VT,MT> >' header line and all braces were dropped by
// the extraction. Code kept byte-identical; only comments added.
//
// First specialization: struct header and body dropped entirely (per the Size.h
// include above, presumably the Size trait — confirm against the full source).
4815 template<
typename VT,
typename MT >
// IsAligned specialization: the expression is aligned iff both its vector and its
// matrix operand are aligned.
4832 template<
typename VT,
typename MT >
4834 :
public BoolConstant< And< IsAligned<VT>, IsAligned<MT> >::value >
// SubvectorExprTrait specialization: a subvector of the product is expressed as the
// product of a subvector of the vector with a submatrix of the matrix, preserving the
// alignment flag AF.
4850 template<
typename VT,
typename MT,
bool AF >
4855 using Type = MultExprTrait_< SubvectorExprTrait_<const VT,AF>
4856 , SubmatrixExprTrait_<const MT,AF> >;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception.This macro encapsulates the default way o...
Definition: Exception.h:235
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template.This type trait clas...
Definition: IsComputation.h:72
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:213
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels.This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:7800
Header file for basic type definitions.
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDVecDMatMultExpr.h:211
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types.Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the DenseVector base class.
Compile time check for lower triangular matrices.This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types.Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecDMatMultExpr.h:385
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices.This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
RightOperand rightOperand() const noexcept
Returns the right-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:331
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
ResultType_< MT > MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:133
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, ColumnExprTrait_< MT > > column(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific column of the given matrix.
Definition: Column.h:126
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecDMatMultExpr.h:214
Header file for the IsComplexDouble type trait.
CompositeType_< VT > VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:136
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecDMatMultExpr.h:266
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
ElementType_< MRT > MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:135
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
If_< IsExpression< VT >, const VT, const VT & > LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:217
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
If_< IsExpression< MT >, const MT, const MT & > RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:220
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
Header file for the Columns type trait.
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDVecDMatMultExpr.h:355
CompositeType_< MT > MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecDMatMultExpr.h:137
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type.In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
Constraint on the data type.
ElementType_< VRT > VET
Element type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:134
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecDMatMultExpr.h:386
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
TDVecDMatMultExpr(const VT &vec, const MT &mat) noexcept
Constructor for the TDVecDMatMultExpr class.
Definition: TDVecDMatMultExpr.h:252
TDVecDMatMultExpr< VT, MT > This
Type of this TDVecDMatMultExpr instance.
Definition: TDVecDMatMultExpr.h:208
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:79
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:61
Header file for the HasSIMDMult type trait.
Header file for run time assertion macros.
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDVecDMatMultExpr.h:212
LeftOperand leftOperand() const noexcept
Returns the left-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:321
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraint on the data type.
IfTrue_< evaluateMatrix, const MRT, MCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDVecDMatMultExpr.h:226
Header file for the TVecMatMultExpr base class.
Constraint on the data type.
Expression object for transpose dense vector-dense matrix multiplications.The TDVecDMatMultExpr class...
Definition: Forward.h:136
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value.The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecDMatMultExpr.h:298
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of rows of the matrix.
Definition: Matrix.h:314
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:110
Header file for the AreSIMDCombinable type trait.
MultTrait_< VRT, MRT > ResultType
Result type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:209
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
IfTrue_< evaluateVector, const VRT, VCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDVecDMatMultExpr.h:223
ResultType_< VT > VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecDMatMultExpr.h:132
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDVecDMatMultExpr.h:210
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDVecDMatMultExpr.h:311
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDVecDMatMultExpr.h:343
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:61
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks.In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDVecDMatMultExpr.h:375
Header file for the FunctionTrace class.
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecDMatMultExpr.h:365