35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
123 template<
typename MT
125 class TDMatDVecMultExpr :
public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
126 ,
private MatVecMultExpr
127 ,
private Computation
156 template<
typename T1 >
157 struct UseSMPAssign {
158 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
168 template<
typename T1,
typename T2,
typename T3 >
169 struct UseBlasKernel {
175 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
191 template<
typename T1,
typename T2,
typename T3 >
192 struct UseVectorizedDefaultKernel {
195 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
231 MT::simdEnabled && VT::simdEnabled &&
236 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
237 !evaluateVector && VT::smpAssignable };
270 return mat_(index,index) *
vec_[index];
280 const size_t n (
mat_.columns() -
begin );
297 inline ReturnType
at(
size_t index )
const {
298 if( index >=
mat_.rows() ) {
301 return (*
this)[index];
310 inline size_t size() const noexcept {
341 template<
typename T >
342 inline bool canAlias(
const T* alias )
const noexcept {
343 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
353 template<
typename T >
354 inline bool isAliased(
const T* alias )
const noexcept {
355 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
365 return mat_.isAligned() &&
vec_.isAligned();
377 (
mat_.rows() *
mat_.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
378 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
401 template<
typename VT1 >
408 if( rhs.mat_.rows() == 0UL ) {
411 else if( rhs.mat_.columns() == 0UL ) {
416 LT A(
serial( rhs.mat_ ) );
417 RT x(
serial( rhs.vec_ ) );
424 TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
440 template<
typename VT1
443 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
447 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
448 selectSmallAssignKernel( y, A, x );
450 selectBlasAssignKernel( y, A, x );
469 template<
typename VT1
472 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
474 const size_t M( A.rows() );
475 const size_t N( A.columns() );
477 if( IsStrictlyLower<MT1>::value ) {
481 if( !IsUpper<MT1>::value )
483 for(
size_t i=( IsStrictlyLower<MT1>::value ? 1UL : 0UL ); i<M; ++i ) {
484 y[i] = A(i,0UL) * x[0UL];
488 for(
size_t j=( IsUpper<MT1>::value && !IsStrictlyUpper<MT1>::value ? 0UL : 1UL ); j<N; ++j )
490 if( IsDiagonal<MT1>::value )
492 y[j] = A(j,j) * x[j];
496 const size_t ibegin( ( IsLower<MT1>::value )
497 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
499 const size_t iend( ( IsUpper<MT1>::value )
500 ?( IsStrictlyUpper<MT1>::value ? j-1UL : j )
504 const size_t inum( iend - ibegin );
505 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
507 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
508 y[i ] += A(i ,j) * x[j];
509 y[i+1UL] += A(i+1UL,j) * x[j];
512 y[ipos] += A(ipos,j) * x[j];
514 if( IsUpper<MT1>::value ) {
515 y[iend] = A(iend,j) * x[j];
520 if( IsStrictlyUpper<MT1>::value ) {
541 template<
typename VT1
544 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
545 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
547 selectDefaultAssignKernel( y, A, x );
566 template<
typename VT1
569 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
570 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
572 const size_t M( A.rows() );
573 const size_t N( A.columns() );
575 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
577 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
582 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
584 const size_t jbegin( ( IsUpper<MT1>::value )
585 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
587 const size_t jend( ( IsLower<MT1>::value )
588 ?(
min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
592 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
594 for(
size_t j=jbegin; j<jend; ++j ) {
595 const SIMDType x1(
set( x[j] ) );
596 xmm1 = xmm1 + A.load(i ,j) * x1;
597 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
598 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
599 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
600 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
601 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
602 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
603 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
607 y.store( i+SIMDSIZE , xmm2 );
608 y.store( i+SIMDSIZE*2UL, xmm3 );
609 y.store( i+SIMDSIZE*3UL, xmm4 );
610 y.store( i+SIMDSIZE*4UL, xmm5 );
611 y.store( i+SIMDSIZE*5UL, xmm6 );
612 y.store( i+SIMDSIZE*6UL, xmm7 );
613 y.store( i+SIMDSIZE*7UL, xmm8 );
616 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
618 const size_t jbegin( ( IsUpper<MT1>::value )
619 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
621 const size_t jend( ( IsLower<MT1>::value )
622 ?(
min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
626 SIMDType xmm1, xmm2, xmm3, xmm4;
628 for(
size_t j=jbegin; j<jend; ++j ) {
629 const SIMDType x1(
set( x[j] ) );
630 xmm1 = xmm1 + A.load(i ,j) * x1;
631 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
632 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
633 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
637 y.store( i+SIMDSIZE , xmm2 );
638 y.store( i+SIMDSIZE*2UL, xmm3 );
639 y.store( i+SIMDSIZE*3UL, xmm4 );
642 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
644 const size_t jbegin( ( IsUpper<MT1>::value )
645 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
647 const size_t jend( ( IsLower<MT1>::value )
648 ?(
min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
652 SIMDType xmm1, xmm2, xmm3;
654 for(
size_t j=jbegin; j<jend; ++j ) {
655 const SIMDType x1(
set( x[j] ) );
656 xmm1 = xmm1 + A.load(i ,j) * x1;
657 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
658 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
662 y.store( i+SIMDSIZE , xmm2 );
663 y.store( i+SIMDSIZE*2UL, xmm3 );
666 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
668 const size_t jbegin( ( IsUpper<MT1>::value )
669 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
671 const size_t jend( ( IsLower<MT1>::value )
672 ?(
min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
678 for(
size_t j=jbegin; j<jend; ++j ) {
679 const SIMDType x1(
set( x[j] ) );
680 xmm1 = xmm1 + A.load(i ,j) * x1;
681 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
685 y.store( i+SIMDSIZE, xmm2 );
688 for( ; i<ipos; i+=SIMDSIZE )
690 const size_t jbegin( ( IsUpper<MT1>::value )
691 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
693 const size_t jend( ( IsLower<MT1>::value )
694 ?(
min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
700 for(
size_t j=jbegin; j<jend; ++j ) {
701 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
707 for( ; remainder && i<M; ++i )
709 const size_t jbegin( ( IsUpper<MT1>::value )
710 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
712 const size_t jend( ( IsLower<MT1>::value )
713 ?(
min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
719 for(
size_t j=jbegin; j<jend; ++j ) {
720 value += A(i,j) * x[j];
743 template<
typename VT1
746 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
747 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
749 selectDefaultAssignKernel( y, A, x );
768 template<
typename VT1
771 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
772 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
774 const size_t M( A.rows() );
775 const size_t N( A.columns() );
777 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
779 const size_t iblock( 32768UL /
sizeof( ElementType ) );
780 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
786 for(
size_t ii=0U; ii<M; ii+=iblock ) {
787 for(
size_t jj=0UL; jj<N; jj+=jblock )
789 const size_t jend(
min( jj+jblock, N ) );
790 const size_t itmp(
min( ii+iblock, M ) );
791 const size_t iend( ( IsUpper<MT1>::value )
792 ?(
min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
795 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
796 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
798 size_t i( ( IsLower<MT1>::value )
799 ?(
max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) &
size_t(-SIMDSIZE) ) )
802 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
804 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
806 for(
size_t j=jj; j<jend; ++j ) {
807 const SIMDType x1(
set( x[j] ) );
808 xmm1 = xmm1 + A.load(i ,j) * x1;
809 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
810 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
811 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
812 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
813 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
814 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
815 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
818 y.store( i , y.load(i ) + xmm1 );
819 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
820 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
821 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
822 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
823 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
824 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
825 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
828 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
830 SIMDType xmm1, xmm2, xmm3, xmm4;
832 for(
size_t j=jj; j<jend; ++j ) {
833 const SIMDType x1(
set( x[j] ) );
834 xmm1 = xmm1 + A.load(i ,j) * x1;
835 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
836 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
837 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
840 y.store( i , y.load(i ) + xmm1 );
841 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
842 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
843 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
846 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
848 SIMDType xmm1, xmm2, xmm3;
850 for(
size_t j=jj; j<jend; ++j ) {
851 const SIMDType x1(
set( x[j] ) );
852 xmm1 = xmm1 + A.load(i ,j) * x1;
853 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
854 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
857 y.store( i , y.load(i ) + xmm1 );
858 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
859 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
862 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
866 for(
size_t j=jj; j<jend; ++j ) {
867 const SIMDType x1(
set( x[j] ) );
868 xmm1 = xmm1 + A.load(i ,j) * x1;
869 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
872 y.store( i , y.load(i ) + xmm1 );
873 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
876 for( ; i<ipos; i+=SIMDSIZE )
880 for(
size_t j=jj; j<jend; ++j ) {
881 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
884 y.store( i, y.load(i) + xmm1 );
887 for( ; remainder && i<iend; ++i )
891 for(
size_t j=jj; j<jend; ++j ) {
892 value += A(i,j) * x[j];
917 template<
typename VT1
920 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
921 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
923 selectLargeAssignKernel( y, A, x );
929 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
943 template<
typename VT1
946 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
947 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
949 typedef ElementType_<VT1> ET;
951 if( IsTriangular<MT1>::value ) {
953 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
956 gemv( y, A, x, ET(1), ET(0) );
976 template<
typename VT1 >
977 friend inline void assign( SparseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
987 const ResultType tmp(
serial( rhs ) );
1006 template<
typename VT1 >
1007 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
1013 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1017 LT A(
serial( rhs.mat_ ) );
1018 RT x(
serial( rhs.vec_ ) );
1025 TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1041 template<
typename VT1
1044 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1046 if( ( IsDiagonal<MT1>::value ) ||
1047 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1048 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1049 selectSmallAddAssignKernel( y, A, x );
1051 selectBlasAddAssignKernel( y, A, x );
1070 template<
typename VT1
1073 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1075 const size_t M( A.rows() );
1076 const size_t N( A.columns() );
1078 for(
size_t j=0UL; j<N; ++j )
1080 if( IsDiagonal<MT1>::value )
1082 y[j] += A(j,j) * x[j];
1086 const size_t ibegin( ( IsLower<MT1>::value )
1087 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1089 const size_t iend( ( IsUpper<MT1>::value )
1090 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1094 const size_t inum( iend - ibegin );
1095 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1097 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1098 y[i ] += A(i ,j) * x[j];
1099 y[i+1UL] += A(i+1UL,j) * x[j];
1102 y[ipos] += A(ipos,j) * x[j];
1124 template<
typename VT1
1127 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1128 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1130 selectDefaultAddAssignKernel( y, A, x );
1149 template<
typename VT1
1152 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1153 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1155 const size_t M( A.rows() );
1156 const size_t N( A.columns() );
1158 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1160 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1165 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1167 const size_t jbegin( ( IsUpper<MT1>::value )
1168 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1170 const size_t jend( ( IsLower<MT1>::value )
1171 ?(
min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1175 SIMDType xmm1( y.load(i ) );
1176 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1177 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1178 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1179 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1180 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1181 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1182 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1184 for(
size_t j=jbegin; j<jend; ++j ) {
1185 const SIMDType x1(
set( x[j] ) );
1186 xmm1 = xmm1 + A.load(i ,j) * x1;
1187 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1188 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1189 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1190 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
1191 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
1192 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
1193 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
1196 y.store( i , xmm1 );
1197 y.store( i+SIMDSIZE , xmm2 );
1198 y.store( i+SIMDSIZE*2UL, xmm3 );
1199 y.store( i+SIMDSIZE*3UL, xmm4 );
1200 y.store( i+SIMDSIZE*4UL, xmm5 );
1201 y.store( i+SIMDSIZE*5UL, xmm6 );
1202 y.store( i+SIMDSIZE*6UL, xmm7 );
1203 y.store( i+SIMDSIZE*7UL, xmm8 );
1206 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1208 const size_t jbegin( ( IsUpper<MT1>::value )
1209 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1211 const size_t jend( ( IsLower<MT1>::value )
1212 ?(
min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1216 SIMDType xmm1( y.load(i ) );
1217 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1218 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1219 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1221 for(
size_t j=jbegin; j<jend; ++j ) {
1222 const SIMDType x1(
set( x[j] ) );
1223 xmm1 = xmm1 + A.load(i ,j) * x1;
1224 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1225 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1226 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1229 y.store( i , xmm1 );
1230 y.store( i+SIMDSIZE , xmm2 );
1231 y.store( i+SIMDSIZE*2UL, xmm3 );
1232 y.store( i+SIMDSIZE*3UL, xmm4 );
1235 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1237 const size_t jbegin( ( IsUpper<MT1>::value )
1238 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1240 const size_t jend( ( IsLower<MT1>::value )
1241 ?(
min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1245 SIMDType xmm1( y.load(i ) );
1246 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1247 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1249 for(
size_t j=jbegin; j<jend; ++j ) {
1250 const SIMDType x1(
set( x[j] ) );
1251 xmm1 = xmm1 + A.load(i ,j) * x1;
1252 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1253 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1256 y.store( i , xmm1 );
1257 y.store( i+SIMDSIZE , xmm2 );
1258 y.store( i+SIMDSIZE*2UL, xmm3 );
1261 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1263 const size_t jbegin( ( IsUpper<MT1>::value )
1264 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1266 const size_t jend( ( IsLower<MT1>::value )
1267 ?(
min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1271 SIMDType xmm1( y.load(i ) );
1272 SIMDType xmm2( y.load(i+SIMDSIZE) );
1274 for(
size_t j=jbegin; j<jend; ++j ) {
1275 const SIMDType x1(
set( x[j] ) );
1276 xmm1 = xmm1 + A.load(i ,j) * x1;
1277 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
1280 y.store( i , xmm1 );
1281 y.store( i+SIMDSIZE, xmm2 );
1284 for( ; i<ipos; i+=SIMDSIZE )
1286 const size_t jbegin( ( IsUpper<MT1>::value )
1287 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1289 const size_t jend( ( IsLower<MT1>::value )
1290 ?(
min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1294 SIMDType xmm1( y.load(i) );
1296 for(
size_t j=jbegin; j<jend; ++j ) {
1297 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
1303 for( ; remainder && i<M; ++i )
1305 const size_t jbegin( ( IsUpper<MT1>::value )
1306 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1308 const size_t jend( ( IsLower<MT1>::value )
1309 ?(
min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1315 for(
size_t j=jbegin; j<jend; ++j ) {
1316 value += A(i,j) * x[j];
1339 template<
typename VT1
1342 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1343 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1345 selectDefaultAddAssignKernel( y, A, x );
1364 template<
typename VT1
1367 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1368 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1370 const size_t M( A.rows() );
1371 const size_t N( A.columns() );
1373 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1375 const size_t iblock( 32768UL /
sizeof( ElementType ) );
1376 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1380 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1381 for(
size_t jj=0UL; jj<N; jj+=jblock )
1383 const size_t jend(
min( jj+jblock, N ) );
1384 const size_t itmp(
min( ii+iblock, M ) );
1385 const size_t iend( ( IsUpper<MT1>::value )
1386 ?(
min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
1389 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1390 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1392 size_t i( ( IsLower<MT1>::value )
1393 ?(
max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) &
size_t(-SIMDSIZE) ) )
1396 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1398 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1400 for(
size_t j=jj; j<jend; ++j ) {
1401 const SIMDType x1(
set( x[j] ) );
1402 xmm1 = xmm1 + A.load(i ,j) * x1;
1403 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1404 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1405 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1406 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
1407 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
1408 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
1409 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
1412 y.store( i , y.load(i ) + xmm1 );
1413 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1414 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1415 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1416 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5 );
1417 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6 );
1418 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7 );
1419 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8 );
1422 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1424 SIMDType xmm1, xmm2, xmm3, xmm4;
1426 for(
size_t j=jj; j<jend; ++j ) {
1427 const SIMDType x1(
set( x[j] ) );
1428 xmm1 = xmm1 + A.load(i ,j) * x1;
1429 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1430 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1431 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1434 y.store( i , y.load(i ) + xmm1 );
1435 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1436 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1437 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4 );
1440 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1442 SIMDType xmm1, xmm2, xmm3;
1444 for(
size_t j=jj; j<jend; ++j ) {
1445 const SIMDType x1(
set( x[j] ) );
1446 xmm1 = xmm1 + A.load(i ,j) * x1;
1447 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1448 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1451 y.store( i , y.load(i ) + xmm1 );
1452 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2 );
1453 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3 );
1456 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1458 SIMDType xmm1, xmm2;
1460 for(
size_t j=jj; j<jend; ++j ) {
1461 const SIMDType x1(
set( x[j] ) );
1462 xmm1 = xmm1 + A.load(i ,j) * x1;
1463 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
1466 y.store( i , y.load(i ) + xmm1 );
1467 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2 );
1470 for( ; i<ipos; i+=SIMDSIZE )
1474 for(
size_t j=jj; j<jend; ++j ) {
1475 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
1478 y.store( i, y.load(i) + xmm1 );
1481 for( ; remainder && i<iend; ++i )
1485 for(
size_t j=jj; j<jend; ++j ) {
1486 value += A(i,j) * x[j];
1511 template<
typename VT1
1514 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
1515 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1517 selectLargeAddAssignKernel( y, A, x );
1523 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
1537 template<
typename VT1
1540 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
1541 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1543 typedef ElementType_<VT1> ET;
1545 if( IsTriangular<MT1>::value ) {
1546 ResultType_<VT1> tmp(
serial( x ) );
1547 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1548 addAssign( y, tmp );
1551 gemv( y, A, x, ET(1), ET(1) );
1575 template<
typename VT1 >
1576 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
1582 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1586 LT A(
serial( rhs.mat_ ) );
1587 RT x(
serial( rhs.vec_ ) );
1594 TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1610 template<
typename VT1
1613 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1615 if( ( IsDiagonal<MT1>::value ) ||
1616 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1617 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
1618 selectSmallSubAssignKernel( y, A, x );
1620 selectBlasSubAssignKernel( y, A, x );
1639 template<
typename VT1
1642 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1644 const size_t M( A.rows() );
1645 const size_t N( A.columns() );
1647 for(
size_t j=0UL; j<N; ++j )
1649 if( IsDiagonal<MT1>::value )
1651 y[j] -= A(j,j) * x[j];
1655 const size_t ibegin( ( IsLower<MT1>::value )
1656 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
1658 const size_t iend( ( IsUpper<MT1>::value )
1659 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1663 const size_t inum( iend - ibegin );
1664 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
1666 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
1667 y[i ] -= A(i ,j) * x[j];
1668 y[i+1UL] -= A(i+1UL,j) * x[j];
1671 y[ipos] -= A(ipos,j) * x[j];
1693 template<
typename VT1
1696 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1697 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1699 selectDefaultSubAssignKernel( y, A, x );
1719 template<
typename VT1
1722 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1723 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1725 const size_t M( A.rows() );
1726 const size_t N( A.columns() );
1728 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1730 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
1735 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1737 const size_t jbegin( ( IsUpper<MT1>::value )
1738 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1740 const size_t jend( ( IsLower<MT1>::value )
1741 ?(
min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1745 SIMDType xmm1( y.load(i ) );
1746 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1747 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1748 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1749 SIMDType xmm5( y.load(i+SIMDSIZE*4UL) );
1750 SIMDType xmm6( y.load(i+SIMDSIZE*5UL) );
1751 SIMDType xmm7( y.load(i+SIMDSIZE*6UL) );
1752 SIMDType xmm8( y.load(i+SIMDSIZE*7UL) );
1754 for(
size_t j=jbegin; j<jend; ++j ) {
1755 const SIMDType x1(
set( x[j] ) );
1756 xmm1 = xmm1 - A.load(i ,j) * x1;
1757 xmm2 = xmm2 - A.load(i+SIMDSIZE ,j) * x1;
1758 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,j) * x1;
1759 xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,j) * x1;
1760 xmm5 = xmm5 - A.load(i+SIMDSIZE*4UL,j) * x1;
1761 xmm6 = xmm6 - A.load(i+SIMDSIZE*5UL,j) * x1;
1762 xmm7 = xmm7 - A.load(i+SIMDSIZE*6UL,j) * x1;
1763 xmm8 = xmm8 - A.load(i+SIMDSIZE*7UL,j) * x1;
1766 y.store( i , xmm1 );
1767 y.store( i+SIMDSIZE , xmm2 );
1768 y.store( i+SIMDSIZE*2UL, xmm3 );
1769 y.store( i+SIMDSIZE*3UL, xmm4 );
1770 y.store( i+SIMDSIZE*4UL, xmm5 );
1771 y.store( i+SIMDSIZE*5UL, xmm6 );
1772 y.store( i+SIMDSIZE*6UL, xmm7 );
1773 y.store( i+SIMDSIZE*7UL, xmm8 );
1776 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1778 const size_t jbegin( ( IsUpper<MT1>::value )
1779 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1781 const size_t jend( ( IsLower<MT1>::value )
1782 ?(
min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1786 SIMDType xmm1( y.load(i ) );
1787 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1788 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1789 SIMDType xmm4( y.load(i+SIMDSIZE*3UL) );
1791 for(
size_t j=jbegin; j<jend; ++j ) {
1792 const SIMDType x1(
set( x[j] ) );
1793 xmm1 = xmm1 - A.load(i ,j) * x1;
1794 xmm2 = xmm2 - A.load(i+SIMDSIZE ,j) * x1;
1795 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,j) * x1;
1796 xmm4 = xmm4 - A.load(i+SIMDSIZE*3UL,j) * x1;
1799 y.store( i , xmm1 );
1800 y.store( i+SIMDSIZE , xmm2 );
1801 y.store( i+SIMDSIZE*2UL, xmm3 );
1802 y.store( i+SIMDSIZE*3UL, xmm4 );
1805 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
1807 const size_t jbegin( ( IsUpper<MT1>::value )
1808 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1810 const size_t jend( ( IsLower<MT1>::value )
1811 ?(
min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1815 SIMDType xmm1( y.load(i ) );
1816 SIMDType xmm2( y.load(i+SIMDSIZE ) );
1817 SIMDType xmm3( y.load(i+SIMDSIZE*2UL) );
1819 for(
size_t j=jbegin; j<jend; ++j ) {
1820 const SIMDType x1(
set( x[j] ) );
1821 xmm1 = xmm1 - A.load(i ,j) * x1;
1822 xmm2 = xmm2 - A.load(i+SIMDSIZE ,j) * x1;
1823 xmm3 = xmm3 - A.load(i+SIMDSIZE*2UL,j) * x1;
1826 y.store( i , xmm1 );
1827 y.store( i+SIMDSIZE , xmm2 );
1828 y.store( i+SIMDSIZE*2UL, xmm3 );
1831 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
1833 const size_t jbegin( ( IsUpper<MT1>::value )
1834 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1836 const size_t jend( ( IsLower<MT1>::value )
1837 ?(
min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1841 SIMDType xmm1( y.load(i ) );
1842 SIMDType xmm2( y.load(i+SIMDSIZE) );
1844 for(
size_t j=jbegin; j<jend; ++j ) {
1845 const SIMDType x1(
set( x[j] ) );
1846 xmm1 = xmm1 - A.load(i ,j) * x1;
1847 xmm2 = xmm2 - A.load(i+SIMDSIZE,j) * x1;
1850 y.store( i , xmm1 );
1851 y.store( i+SIMDSIZE, xmm2 );
1854 for( ; i<ipos; i+=SIMDSIZE )
1856 const size_t jbegin( ( IsUpper<MT1>::value )
1857 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1859 const size_t jend( ( IsLower<MT1>::value )
1860 ?(
min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1864 SIMDType xmm1( y.load(i) );
1866 for(
size_t j=jbegin; j<jend; ++j ) {
1867 xmm1 = xmm1 - A.load(i,j) *
set( x[j] );
1873 for( ; remainder && i<M; ++i )
1875 const size_t jbegin( ( IsUpper<MT1>::value )
1876 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
1878 const size_t jend( ( IsLower<MT1>::value )
1879 ?(
min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
1885 for(
size_t j=jbegin; j<jend; ++j ) {
1886 value += A(i,j) * x[j];
1909 template<
typename VT1
1912 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1913 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1915 selectDefaultSubAssignKernel( y, A, x );
1935 template<
typename VT1
1938 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2> >
1939 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1941 const size_t M( A.rows() );
1942 const size_t N( A.columns() );
1944 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
1946 const size_t iblock( 32768UL /
sizeof( ElementType ) );
1947 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
1951 for(
size_t ii=0U; ii<M; ii+=iblock ) {
1952 for(
size_t jj=0UL; jj<N; jj+=jblock )
1954 const size_t jend(
min( jj+jblock, N ) );
1955 const size_t itmp(
min( ii+iblock, M ) );
1956 const size_t iend( ( IsUpper<MT1>::value )
1957 ?(
min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
1960 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
1961 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
1963 size_t i( ( IsLower<MT1>::value )
1964 ?(
max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) &
size_t(-SIMDSIZE) ) )
1967 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
1969 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1971 for(
size_t j=jj; j<jend; ++j ) {
1972 const SIMDType x1(
set( x[j] ) );
1973 xmm1 = xmm1 + A.load(i ,j) * x1;
1974 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
1975 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
1976 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
1977 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
1978 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
1979 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
1980 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
1983 y.store( i , y.load(i ) - xmm1 );
1984 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
1985 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
1986 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
1987 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5 );
1988 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6 );
1989 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7 );
1990 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8 );
1993 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
1995 SIMDType xmm1, xmm2, xmm3, xmm4;
1997 for(
size_t j=jj; j<jend; ++j ) {
1998 const SIMDType x1(
set( x[j] ) );
1999 xmm1 = xmm1 + A.load(i ,j) * x1;
2000 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2001 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2002 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
2005 y.store( i , y.load(i ) - xmm1 );
2006 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2007 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2008 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4 );
2011 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2013 SIMDType xmm1, xmm2, xmm3;
2015 for(
size_t j=jj; j<jend; ++j ) {
2016 const SIMDType x1(
set( x[j] ) );
2017 xmm1 = xmm1 + A.load(i ,j) * x1;
2018 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2019 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2022 y.store( i , y.load(i ) - xmm1 );
2023 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2 );
2024 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3 );
2027 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
2029 SIMDType xmm1, xmm2;
2031 for(
size_t j=jj; j<jend; ++j ) {
2032 const SIMDType x1(
set( x[j] ) );
2033 xmm1 = xmm1 + A.load(i ,j) * x1;
2034 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
2037 y.store( i , y.load(i ) - xmm1 );
2038 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2 );
2041 for( ; i<ipos; i+=SIMDSIZE )
2045 for(
size_t j=jj; j<jend; ++j ) {
2046 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
2049 y.store( i, y.load(i) - xmm1 );
2052 for( ; remainder && i<iend; ++i )
2056 for(
size_t j=jj; j<jend; ++j ) {
2057 value += A(i,j) * x[j];
// Fallback subtraction-assignment kernel selection (y -= A*x).
// Chosen when the BLAS kernel is disabled for this type combination
// (DisableIf_<UseBlasKernel>); simply forwards to the large kernel.
// NOTE(review): the leading integers on each line are original source line
// numbers left behind by a listing extraction; the jump 2086 -> 2088 shows
// the function's opening brace line was dropped.
2082 template<
typename VT1
2085 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2> >
2086 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2088 selectLargeSubAssignKernel( y, A, x );
2094 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based subtraction-assignment kernel (y -= A*x), compiled only inside
// the BLAZE_BLAS_MODE guard on the line above.
// Triangular matrices: trmv has no accumulate form, so x is copied into a
// temporary, multiplied in place, and then subtracted from y.
// General matrices: a single gemv with alpha = -1 and beta = 1 performs the
// fused y = y - A*x.
// NOTE(review): braces and assertion lines were dropped by extraction
// (numbering jumps 2112 -> 2114, 2119 -> 2122).
2108 template<
typename VT1
2111 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2> >
2112 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2114 typedef ElementType_<VT1> ET;
2116 if( IsTriangular<MT1>::value ) {
2117 ResultType_<VT1> tmp(
serial( x ) );
2118 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2119 subAssign( y, tmp );
2122 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication-assignment of the matrix/vector product to a dense vector
// (lhs *= A*x). There is no dedicated kernel for element-wise multiplication:
// the product is first evaluated serially into a temporary of ResultType and
// then multiplied into the target element-wise.
2146 template<
typename VT1 >
2147 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
2157 const ResultType tmp(
serial( rhs ) );
2158 multAssign( ~lhs, tmp );
// Division-assignment of the matrix/vector product to a dense vector
// (lhs /= A*x). Mirrors multAssign above: evaluate the product serially into
// a temporary, then divide the target element-wise.
2180 template<
typename VT1 >
2181 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const TDMatDVecMultExpr& rhs )
2191 const ResultType tmp(
serial( rhs ) );
2192 divAssign( ~lhs, tmp );
// --- SMP assignment friend functions (extraction fragments) ---------------
// Six SMP variants (assign, sparse assign, addAssign, subAssign, multAssign,
// divAssign), each enabled via EnableIf_<UseSMPAssign<VT1>>. Only their
// template headers and one or two body lines survived extraction (the
// numbering jumps, e.g. 2227 -> 2260, show whole bodies were dropped), so
// behavior beyond the visible guard checks cannot be stated here.

// smpAssign for dense vectors: early-out guards on an empty matrix.
2216 template<
typename VT1 >
2217 friend inline EnableIf_< UseSMPAssign<VT1> >
2224 if( rhs.mat_.rows() == 0UL ) {
2227 else if( rhs.mat_.columns() == 0UL ) {
// smpAssign for sparse targets: evaluates into a temporary first.
2260 template<
typename VT1 >
2261 friend inline EnableIf_< UseSMPAssign<VT1> >
2272 const ResultType tmp( rhs );
// smpAddAssign: nothing to do when the matrix is empty.
2293 template<
typename VT1 >
2294 friend inline EnableIf_< UseSMPAssign<VT1> >
2301 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// smpSubAssign: same empty-matrix early-out.
2337 template<
typename VT1 >
2338 friend inline EnableIf_< UseSMPAssign<VT1> >
2345 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// smpMultAssign: via temporary evaluation of the product.
2381 template<
typename VT1 >
2382 friend inline EnableIf_< UseSMPAssign<VT1> >
2393 const ResultType tmp( rhs );
// smpDivAssign: via temporary evaluation of the product.
2418 template<
typename VT1 >
2419 friend inline EnableIf_< UseSMPAssign<VT1> >
2430 const ResultType tmp( rhs );
// ==========================================================================
// Specialization of DVecScalarMultExpr for (scalar * (tdmat * dvec))
// expressions: restructures s*(A*x) so the scalar can be folded into the
// multiplication kernels instead of requiring a second pass.
// The class body continues past this chunk; only head declarations are here.
// ==========================================================================
2470 template<
typename MT
2474 :
public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
2475 ,
private VecScalarMultExpr
2476 ,
private Computation
// Abbreviations for the wrapped matrix/vector product and its operand types.
2480 typedef TDMatDVecMultExpr<MT,VT> MVM;
2481 typedef ResultType_<MVM> RES;
2482 typedef ResultType_<MT>
MRT;
2483 typedef ResultType_<VT>
VRT;
2484 typedef ElementType_<MRT>
MET;
2485 typedef ElementType_<VRT>
VET;
2486 typedef CompositeType_<MT>
MCT;
2487 typedef CompositeType_<VT>
VCT;
// Compile-time flags: whether the matrix/vector operand must be evaluated
// into a temporary before the kernels run.
2492 enum :
bool { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2493 IsBLASCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
2498 enum :
bool { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Trait: SMP assignment is only worthwhile when an operand needs evaluation.
2506 template<
typename T1 >
2507 struct UseSMPAssign {
2508 enum :
bool { value = ( evaluateMatrix || evaluateVector ) };
// Trait: all conditions required to dispatch to a BLAS kernel (data access,
// SIMD support, BLAS-compatible and matching element types; T4 is the
// scalar type, which must not be complex when the elements are builtin).
2516 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2517 struct UseBlasKernel {
2519 HasMutableDataAccess<T1>::value &&
2520 HasConstDataAccess<T2>::value &&
2521 HasConstDataAccess<T3>::value &&
2522 !IsDiagonal<T2>::value &&
2523 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2524 IsBLASCompatible< ElementType_<T1> >::value &&
2525 IsBLASCompatible< ElementType_<T2> >::value &&
2526 IsBLASCompatible< ElementType_<T3> >::value &&
2527 IsSame< ElementType_<T1>, ElementType_<T2> >::value &&
2528 IsSame< ElementType_<T1>, ElementType_<T3> >::value &&
2529 !( IsBuiltin< ElementType_<T1> >::value && IsComplex<T4>::value ) };
// Trait: conditions for the vectorized (SIMD) default kernels.
2538 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2539 struct UseVectorizedDefaultKernel {
2541 !IsDiagonal<T2>::value &&
2542 T1::simdEnabled && T2::simdEnabled && T3::simdEnabled &&
2543 AreSIMDCombinable< ElementType_<T1>
2547 HasSIMDAdd< ElementType_<T2>, ElementType_<T3> >::value &&
2548 HasSIMDMult< ElementType_<T2>, ElementType_<T3> >::value };
// Public type definitions and composite operand types (LT/RT select either
// the evaluated temporary or the lightweight composite reference).
2554 typedef DVecScalarMultExpr<MVM,ST,false>
This;
2558 typedef SIMDTrait_<ElementType>
SIMDType;
2563 typedef const TDMatDVecMultExpr<MT,VT>
LeftOperand;
2569 typedef IfTrue_< evaluateMatrix, const MRT, MCT >
LT;
2572 typedef IfTrue_< evaluateVector, const VRT, VCT >
RT;
// SIMD is available only for non-diagonal matrices with combinable types.
2577 enum :
bool { simdEnabled = !IsDiagonal<MT>::value &&
2578 MT::simdEnabled && VT::simdEnabled &&
2579 AreSIMDCombinable<MET,VET,ST>::value &&
2580 HasSIMDAdd<MET,VET>::value &&
2581 HasSIMDMult<MET,VET>::value };
2584 enum :
bool { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2585 !evaluateVector && VT::smpAssignable };
// Constructor: stores the wrapped product expression and the scalar.
// NOTE(review): the constructor body was dropped by extraction.
2599 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
// Subscript access: element i of the product, scaled by the stored scalar.
2611 inline ReturnType
operator[](
size_t index )
const {
2613 return vector_[index] * scalar_;
// Checked element access: validates the index against the vector size before
// delegating to operator[].
// NOTE(review): the statement between the bounds check and the return
// (source lines 2626-2627, presumably the out-of-range throw) was dropped
// by extraction.
2624 inline ReturnType
at(
size_t index )
const {
2625 if( index >= vector_.size() ) {
2628 return (*
this)[index];
// Size of the expression: identical to the size of the wrapped product.
2637 inline size_t size()
const {
2638 return vector_.size();
// Alias query (possible aliasing): forwarded to the wrapped product.
2668 template<
typename T >
2669 inline bool canAlias(
const T* alias )
const {
2670 return vector_.canAlias( alias );
// Alias query (definite aliasing): forwarded to the wrapped product.
2680 template<
typename T >
2681 inline bool isAliased(
const T* alias )
const {
2682 return vector_.isAliased( alias );
// isAligned(): forwarded to the wrapped product (signature line was dropped
// by extraction).
2692 return vector_.isAligned();
// canSMPAssign() fragment: SMP pays off for small computed products or when
// the size exceeds the SMP threshold (surrounding lines dropped).
2702 LeftOperand_<MVM> A( vector_.leftOperand() );
2704 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2705 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) ) &&
2706 (
size() > SMP_TDMATDVECMULT_THRESHOLD );
// Data members: the wrapped matrix/vector product and the scalar factor.
2712 LeftOperand vector_;
2713 RightOperand scalar_;
// Assignment of the scaled product to a dense vector (lhs = s*(A*x)).
// Unpacks the wrapped product's operands, early-outs on empty matrices, and
// dispatches to the kernel selector with the scalar forwarded.
// NOTE(review): the operand-evaluation lines (LT A / RT x, source 2743-2752)
// were dropped by extraction.
2728 template<
typename VT1 >
2729 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2735 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
2736 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
2738 if( left.rows() == 0UL ) {
2741 else if( left.columns() == 0UL ) {
2754 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatch for assignment: small kernel for diagonal matrices,
// non-evaluated computations, or products below TDMATDVECMULT_THRESHOLD;
// otherwise the BLAS kernel (which itself falls back to the large kernel).
2769 template<
typename VT1
2773 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2775 if( ( IsDiagonal<MT1>::value ) ||
2776 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2777 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
2778 selectSmallAssignKernel( y, A, x, scalar );
2780 selectBlasAssignKernel( y, A, x, scalar );
// Scalar (non-SIMD) default assignment kernel for y = (A*x)*scalar on a
// column-major matrix: iterates column-by-column, accumulating partial sums
// into y with a 2-way unrolled inner loop, exploiting lower/upper/diagonal
// structure to skip the structurally-zero region of each column.
// NOTE(review): extraction dropped multiple lines here (initialization of y
// for strictly-lower matrices, the final scaling pass, several braces), so
// the visible code is not the complete kernel.
2798 template<
typename VT1
2802 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2804 const size_t M( A.rows() );
2805 const size_t N( A.columns() );
2807 if( IsStrictlyLower<MT1>::value ) {
2811 if( !IsUpper<MT1>::value )
// First column: direct assignment initializes y.
2813 for(
size_t i=( IsStrictlyLower<MT1>::value ? 1UL : 0UL ); i<M; ++i ) {
2814 y[i] = A(i,0UL) * x[0UL];
// Remaining columns: accumulate into y.
2818 for(
size_t j=( IsUpper<MT1>::value && !IsStrictlyUpper<MT1>::value ? 0UL : 1UL ); j<N; ++j )
2820 if( IsDiagonal<MT1>::value )
2822 y[j] = A(j,j) * x[j] * scalar;
// Row range [ibegin, iend) restricted by the matrix's triangular structure.
2826 const size_t ibegin( ( IsLower<MT1>::value )
2827 ?( IsStrictlyLower<MT1>::value ? j+1UL : j )
2829 const size_t iend( ( IsUpper<MT1>::value )
2830 ?( IsStrictlyUpper<MT1>::value ? j-1UL : j )
2834 const size_t inum( iend - ibegin );
2835 const size_t ipos( ibegin + ( inum &
size_t(-2) ) );
// 2-way unrolled accumulation over the column.
2837 for(
size_t i=ibegin; i<ipos; i+=2UL ) {
2838 y[i ] += A(i ,j) * x[j];
2839 y[i+1UL] += A(i+1UL,j) * x[j];
2842 y[ipos] += A(ipos,j) * x[j];
2844 if( IsUpper<MT1>::value ) {
2845 y[iend] = A(iend,j) * x[j];
2850 if( IsStrictlyUpper<MT1>::value ) {
// Final scaling of the accumulated sums (loop body dropped by extraction).
2854 if( !IsDiagonal<MT1>::value )
2856 const size_t iend( IsStrictlyUpper<MT1>::value ? M-1UL : M );
2857 for(
size_t i=( IsStrictlyLower<MT1>::value ? 1UL : 0UL ); i<iend; ++i ) {
// Small-kernel fallback: when the vectorized default kernel is not available
// for this type combination, delegate to the scalar default kernel.
2878 template<
typename VT1
2882 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
2883 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2885 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized small assignment kernel for y = (A*x)*scalar.
// Processes 8/4/3/2/1 SIMD lanes of rows per outer step; for each strip the
// inner j-loop accumulates A(i..,j)*x[j] into SIMD registers, and the result
// is scaled once by the broadcast `factor` on store. jbegin/jend restrict
// each strip's column range to the structurally non-zero part for
// triangular matrices. `remainder` handles unpadded types element-wise.
// NOTE(review): extraction dropped the declaration of loop index i and some
// braces; embedded integers are original source line numbers.
2903 template<
typename VT1
2907 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
2908 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2910 const size_t M( A.rows() );
2911 const size_t N( A.columns() );
2913 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
2915 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
2918 const SIMDType factor(
set( scalar ) );
// 8-wide SIMD strip.
2922 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
2924 const size_t jbegin( ( IsUpper<MT1>::value )
2925 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2927 const size_t jend( ( IsLower<MT1>::value )
2928 ?(
min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
2932 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2934 for(
size_t j=jbegin; j<jend; ++j ) {
2935 const SIMDType x1(
set( x[j] ) );
2936 xmm1 = xmm1 + A.load(i ,j) * x1;
2937 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2938 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2939 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
2940 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
2941 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
2942 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
2943 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
2946 y.store( i , xmm1*factor );
2947 y.store( i+SIMDSIZE , xmm2*factor );
2948 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2949 y.store( i+SIMDSIZE*3UL, xmm4*factor );
2950 y.store( i+SIMDSIZE*4UL, xmm5*factor );
2951 y.store( i+SIMDSIZE*5UL, xmm6*factor );
2952 y.store( i+SIMDSIZE*6UL, xmm7*factor );
2953 y.store( i+SIMDSIZE*7UL, xmm8*factor );
// 4-wide SIMD strip.
2956 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
2958 const size_t jbegin( ( IsUpper<MT1>::value )
2959 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2961 const size_t jend( ( IsLower<MT1>::value )
2962 ?(
min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
2966 SIMDType xmm1, xmm2, xmm3, xmm4;
2968 for(
size_t j=jbegin; j<jend; ++j ) {
2969 const SIMDType x1(
set( x[j] ) );
2970 xmm1 = xmm1 + A.load(i ,j) * x1;
2971 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2972 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
2973 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
2976 y.store( i , xmm1*factor );
2977 y.store( i+SIMDSIZE , xmm2*factor );
2978 y.store( i+SIMDSIZE*2UL, xmm3*factor );
2979 y.store( i+SIMDSIZE*3UL, xmm4*factor );
// 3-wide SIMD strip.
2982 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
2984 const size_t jbegin( ( IsUpper<MT1>::value )
2985 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
2987 const size_t jend( ( IsLower<MT1>::value )
2988 ?(
min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
2992 SIMDType xmm1, xmm2, xmm3;
2994 for(
size_t j=jbegin; j<jend; ++j ) {
2995 const SIMDType x1(
set( x[j] ) );
2996 xmm1 = xmm1 + A.load(i ,j) * x1;
2997 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
2998 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3001 y.store( i , xmm1*factor );
3002 y.store( i+SIMDSIZE , xmm2*factor );
3003 y.store( i+SIMDSIZE*2UL, xmm3*factor );
// 2-wide SIMD strip.
3006 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3008 const size_t jbegin( ( IsUpper<MT1>::value )
3009 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3011 const size_t jend( ( IsLower<MT1>::value )
3012 ?(
min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3016 SIMDType xmm1, xmm2;
3018 for(
size_t j=jbegin; j<jend; ++j ) {
3019 const SIMDType x1(
set( x[j] ) );
3020 xmm1 = xmm1 + A.load(i ,j) * x1;
3021 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3024 y.store( i , xmm1*factor );
3025 y.store( i+SIMDSIZE, xmm2*factor );
// 1-wide SIMD strip.
3028 for( ; i<ipos; i+=SIMDSIZE )
3030 const size_t jbegin( ( IsUpper<MT1>::value )
3031 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3033 const size_t jend( ( IsLower<MT1>::value )
3034 ?(
min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3040 for(
size_t j=jbegin; j<jend; ++j ) {
3041 const SIMDType x1(
set( x[j] ) );
3042 xmm1 = xmm1 + A.load(i,j) * x1;
3045 y.store( i, xmm1*factor );
// Scalar remainder loop for unpadded operands.
3048 for( ; remainder && i<M; ++i )
3050 const size_t jbegin( ( IsUpper<MT1>::value )
3051 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3053 const size_t jend( ( IsLower<MT1>::value )
3054 ?(
min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3060 for(
size_t j=jbegin; j<jend; ++j ) {
3061 value += A(i,j) * x[j];
3064 y[i] = value * scalar;
// Large-kernel fallback: without SIMD support, the large kernel is the same
// as the scalar default kernel.
3083 template<
typename VT1
3087 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3088 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3090 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized large (cache-blocked) assignment kernel for y = (A*x)*scalar.
// Tiles the iteration space into iblock x jblock panels (iblock sized from a
// 32KB budget) and, within each panel, runs the same 8/4/3/2/1-wide SIMD
// strips as the small kernel, accumulating into y (y.load + ... on store)
// since each panel contributes partial sums.
// NOTE(review): extraction dropped the initial reset of y and several
// braces; `ii<M` with iblock derived from rows and `jj<N` with jblock
// suggests the embedded loop bounds survived intact, but the full control
// flow cannot be confirmed from this listing.
3108 template<
typename VT1
3112 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3113 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3115 const size_t M( A.rows() );
3116 const size_t N( A.columns() );
3118 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
3120 const size_t iblock( 32768UL /
sizeof( ElementType ) );
3121 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3125 const SIMDType factor(
set( scalar ) );
// Panel loops over row blocks (ii) and column blocks (jj).
3129 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3130 for(
size_t jj=0UL; jj<N; jj+=jblock )
3132 const size_t jend(
min( jj+jblock, N ) );
3133 const size_t itmp(
min( ii+iblock, M ) );
3134 const size_t iend( ( IsUpper<MT1>::value )
3135 ?(
min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
3138 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3139 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
// Row start aligned down to a SIMD boundary, respecting lower structure.
3141 size_t i( ( IsLower<MT1>::value )
3142 ?(
max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) &
size_t(-SIMDSIZE) ) )
// 8-wide strip within the panel.
3145 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3147 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3149 for(
size_t j=jj; j<jend; ++j ) {
3150 const SIMDType x1(
set( x[j] ) );
3151 xmm1 = xmm1 + A.load(i ,j) * x1;
3152 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3153 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3154 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3155 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
3156 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
3157 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
3158 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
3161 y.store( i , y.load(i ) + xmm1*factor );
3162 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3163 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3164 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3165 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3166 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3167 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3168 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
// 4-wide strip.
3171 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3173 SIMDType xmm1, xmm2, xmm3, xmm4;
3175 for(
size_t j=jj; j<jend; ++j ) {
3176 const SIMDType x1(
set( x[j] ) );
3177 xmm1 = xmm1 + A.load(i ,j) * x1;
3178 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3179 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3180 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3183 y.store( i , y.load(i ) + xmm1*factor );
3184 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3185 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3186 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
// 3-wide strip.
3189 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3191 SIMDType xmm1, xmm2, xmm3;
3193 for(
size_t j=jj; j<jend; ++j ) {
3194 const SIMDType x1(
set( x[j] ) );
3195 xmm1 = xmm1 + A.load(i ,j) * x1;
3196 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3197 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3200 y.store( i , y.load(i ) + xmm1*factor );
3201 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3202 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
// 2-wide strip.
3205 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3207 SIMDType xmm1, xmm2;
3209 for(
size_t j=jj; j<jend; ++j ) {
3210 const SIMDType x1(
set( x[j] ) );
3211 xmm1 = xmm1 + A.load(i ,j) * x1;
3212 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3215 y.store( i , y.load(i ) + xmm1*factor );
3216 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
// 1-wide strip.
3219 for( ; i<ipos; i+=SIMDSIZE )
3223 for(
size_t j=jj; j<jend; ++j ) {
3224 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
3227 y.store( i, y.load(i) + xmm1*factor );
// Scalar remainder rows of the panel.
3230 for( ; remainder && i<iend; ++i )
3234 for(
size_t j=jj; j<jend; ++j ) {
3235 value += A(i,j) * x[j];
3238 y[i] += value * scalar;
// BLAS-kernel fallback for assignment: when UseBlasKernel is disabled for
// this type combination, delegate to the large kernel.
3259 template<
typename VT1
3263 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3264 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3266 selectLargeAssignKernel( y, A, x, scalar );
3271 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based assignment kernel (y = (A*x)*scalar), only inside the
// BLAZE_BLAS_MODE guard above.
// Triangular matrices: y is seeded with scalar*x, then trmv multiplies by A
// in place. General matrices: gemv with alpha = scalar, beta = 0.
3285 template<
typename VT1
3289 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3290 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3292 typedef ElementType_<VT1> ET;
3294 if( IsTriangular<MT1>::value ) {
3295 assign( y, scalar * x );
3296 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3299 gemv( y, A, x, ET(scalar), ET(0) );
// Assignment to a sparse vector: no sparse kernel exists, so the scaled
// product is evaluated serially into a dense temporary and then assigned.
3317 template<
typename VT1 >
3318 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3328 const ResultType tmp(
serial( rhs ) );
3329 assign( ~lhs, tmp );
// Addition-assignment to a dense vector (lhs += (A*x)*scalar): early-out on
// an empty matrix, then dispatch to the add-assign kernel selector.
// NOTE(review): operand-evaluation lines (LT A / RT x) were dropped by
// extraction (numbering jumps 3355 -> 3367).
3345 template<
typename VT1 >
3346 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3352 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3353 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3355 if( left.rows() == 0UL || left.columns() == 0UL ) {
3367 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatch for addition-assignment: same size/structure heuristic as
// selectAssignKernel (small kernel below TDMATDVECMULT_THRESHOLD, else BLAS).
3382 template<
typename VT1
3386 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3388 if( ( IsDiagonal<MT1>::value ) ||
3389 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3390 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3391 selectSmallAddAssignKernel( y, A, x, scalar );
3393 selectBlasAddAssignKernel( y, A, x, scalar );
// Default addition-assignment kernel: delegates to the target's own
// addAssign with the re-formed expression A * x * scalar.
3411 template<
typename VT1
3415 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3417 y.addAssign( A * x * scalar );
// Small add-assign fallback: no SIMD support -> scalar default kernel.
3435 template<
typename VT1
3439 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3440 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3442 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized small addition-assignment kernel (y += (A*x)*scalar).
// Identical strip structure to the small assignment kernel (8/4/3/2/1 SIMD
// lanes plus scalar remainder), but stores accumulate into y
// (y.load(...) + xmmN*factor) instead of overwriting it.
// NOTE(review): declaration of loop index i and some braces were dropped by
// extraction; embedded integers are original source line numbers.
3461 template<
typename VT1
3465 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3466 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3468 const size_t M( A.rows() );
3469 const size_t N( A.columns() );
3471 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
3473 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
3476 const SIMDType factor(
set( scalar ) );
// 8-wide SIMD strip.
3480 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3482 const size_t jbegin( ( IsUpper<MT1>::value )
3483 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3485 const size_t jend( ( IsLower<MT1>::value )
3486 ?(
min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3490 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3492 for(
size_t j=jbegin; j<jend; ++j ) {
3493 const SIMDType x1(
set( x[j] ) );
3494 xmm1 = xmm1 + A.load(i ,j) * x1;
3495 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3496 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3497 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3498 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
3499 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
3500 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
3501 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
3504 y.store( i , y.load(i ) + xmm1*factor );
3505 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3506 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3507 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3508 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3509 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3510 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3511 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
// 4-wide SIMD strip.
3514 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3516 const size_t jbegin( ( IsUpper<MT1>::value )
3517 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3519 const size_t jend( ( IsLower<MT1>::value )
3520 ?(
min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3524 SIMDType xmm1, xmm2, xmm3, xmm4;
3526 for(
size_t j=jbegin; j<jend; ++j ) {
3527 const SIMDType x1(
set( x[j] ) );
3528 xmm1 = xmm1 + A.load(i ,j) * x1;
3529 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3530 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3531 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3534 y.store( i , y.load(i ) + xmm1*factor );
3535 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3536 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3537 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
// 3-wide SIMD strip.
3540 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3542 const size_t jbegin( ( IsUpper<MT1>::value )
3543 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3545 const size_t jend( ( IsLower<MT1>::value )
3546 ?(
min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3550 SIMDType xmm1, xmm2, xmm3;
3552 for(
size_t j=jbegin; j<jend; ++j ) {
3553 const SIMDType x1(
set( x[j] ) );
3554 xmm1 = xmm1 + A.load(i ,j) * x1;
3555 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3556 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3559 y.store( i , y.load(i ) + xmm1*factor );
3560 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3561 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
// 2-wide SIMD strip.
3564 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3566 const size_t jbegin( ( IsUpper<MT1>::value )
3567 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3569 const size_t jend( ( IsLower<MT1>::value )
3570 ?(
min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3574 SIMDType xmm1, xmm2;
3576 for(
size_t j=jbegin; j<jend; ++j ) {
3577 const SIMDType x1(
set( x[j] ) );
3578 xmm1 = xmm1 + A.load(i ,j) * x1;
3579 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3582 y.store( i , y.load(i ) + xmm1*factor );
3583 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
// 1-wide SIMD strip.
3586 for( ; i<ipos; i+=SIMDSIZE )
3588 const size_t jbegin( ( IsUpper<MT1>::value )
3589 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3591 const size_t jend( ( IsLower<MT1>::value )
3592 ?(
min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3598 for(
size_t j=jbegin; j<jend; ++j ) {
3599 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
3602 y.store( i, y.load(i) + xmm1*factor );
// Scalar remainder loop for unpadded operands.
3605 for( ; remainder && i<M; ++i )
3607 const size_t jbegin( ( IsUpper<MT1>::value )
3608 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
3610 const size_t jend( ( IsLower<MT1>::value )
3611 ?(
min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
3617 for(
size_t j=jbegin; j<jend; ++j ) {
3618 value += A(i,j) * x[j];
3621 y[i] += value * scalar;
// Large add-assign fallback: no SIMD support -> scalar default kernel.
3640 template<
typename VT1
3644 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3645 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3647 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized large (cache-blocked) addition-assignment kernel
// (y += (A*x)*scalar). Same iblock x jblock panel tiling and SIMD strip
// structure as the large assignment kernel; every store accumulates into y,
// which is already correct for += semantics (no reset pass is needed).
// NOTE(review): some braces were dropped by extraction; embedded integers
// are original source line numbers.
3666 template<
typename VT1
3670 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3671 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3673 const size_t M( A.rows() );
3674 const size_t N( A.columns() );
3676 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
3678 const size_t iblock( 32768UL /
sizeof( ElementType ) );
3679 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
3683 const SIMDType factor(
set( scalar ) );
// Panel loops over row blocks (ii) and column blocks (jj).
3685 for(
size_t ii=0U; ii<M; ii+=iblock ) {
3686 for(
size_t jj=0UL; jj<N; jj+=jblock )
3688 const size_t jend(
min( jj+jblock, N ) );
3689 const size_t itmp(
min( ii+iblock, M ) );
3690 const size_t iend( ( IsUpper<MT1>::value )
3691 ?(
min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
3694 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
3695 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
3697 size_t i( ( IsLower<MT1>::value )
3698 ?(
max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) &
size_t(-SIMDSIZE) ) )
// 8-wide strip within the panel.
3701 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
3703 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3705 for(
size_t j=jj; j<jend; ++j ) {
3706 const SIMDType x1(
set( x[j] ) );
3707 xmm1 = xmm1 + A.load(i ,j) * x1;
3708 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3709 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3710 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3711 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
3712 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
3713 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
3714 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
3717 y.store( i , y.load(i ) + xmm1*factor );
3718 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3719 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3720 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
3721 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) + xmm5*factor );
3722 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) + xmm6*factor );
3723 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) + xmm7*factor );
3724 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) + xmm8*factor );
// 4-wide strip.
3727 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
3729 SIMDType xmm1, xmm2, xmm3, xmm4;
3731 for(
size_t j=jj; j<jend; ++j ) {
3732 const SIMDType x1(
set( x[j] ) );
3733 xmm1 = xmm1 + A.load(i ,j) * x1;
3734 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3735 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3736 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
3739 y.store( i , y.load(i ) + xmm1*factor );
3740 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3741 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
3742 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) + xmm4*factor );
// 3-wide strip.
3745 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
3747 SIMDType xmm1, xmm2, xmm3;
3749 for(
size_t j=jj; j<jend; ++j ) {
3750 const SIMDType x1(
set( x[j] ) );
3751 xmm1 = xmm1 + A.load(i ,j) * x1;
3752 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
3753 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
3756 y.store( i , y.load(i ) + xmm1*factor );
3757 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) + xmm2*factor );
3758 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) + xmm3*factor );
// 2-wide strip.
3761 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
3763 SIMDType xmm1, xmm2;
3765 for(
size_t j=jj; j<jend; ++j ) {
3766 const SIMDType x1(
set( x[j] ) );
3767 xmm1 = xmm1 + A.load(i ,j) * x1;
3768 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
3771 y.store( i , y.load(i ) + xmm1*factor );
3772 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) + xmm2*factor );
// 1-wide strip.
3775 for( ; i<ipos; i+=SIMDSIZE )
3779 for(
size_t j=jj; j<jend; ++j ) {
3780 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
3783 y.store( i, y.load(i) + xmm1*factor );
// Scalar remainder rows of the panel.
3786 for( ; remainder && i<iend; ++i )
3790 for(
size_t j=jj; j<jend; ++j ) {
3791 value += A(i,j) * x[j];
3794 y[i] += value * scalar;
// BLAS add-assign fallback: when UseBlasKernel is disabled, use the large
// kernel instead.
3815 template<
typename VT1
3819 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3820 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3822 selectLargeAddAssignKernel( y, A, x, scalar );
3827 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
// BLAS-based addition-assignment kernel (y += (A*x)*scalar), only inside the
// BLAZE_BLAS_MODE guard above.
// Triangular matrices: trmv cannot accumulate, so scalar*x is evaluated into
// a temporary, multiplied by A in place, and added to y.
// General matrices: gemv with alpha = scalar, beta = 1 accumulates directly.
3841 template<
typename VT1
3845 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
3846 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3848 typedef ElementType_<VT1> ET;
3850 if( IsTriangular<MT1>::value ) {
3851 ResultType_<VT1> tmp(
serial( scalar * x ) );
3852 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3853 addAssign( y, tmp );
3856 gemv( y, A, x, ET(scalar), ET(1) );
// Subtraction-assignment to a dense vector (lhs -= (A*x)*scalar): early-out
// on an empty matrix, then dispatch to the sub-assign kernel selector.
// NOTE(review): operand-evaluation lines (LT A / RT x) were dropped by
// extraction (numbering jumps 3888 -> 3900).
3878 template<
typename VT1 >
3879 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3885 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
3886 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
3888 if( left.rows() == 0UL || left.columns() == 0UL ) {
3900 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatch for subtraction-assignment: same heuristic as the other
// selectors (small kernel for diagonal/computed/small products, else BLAS).
3915 template<
typename VT1
3919 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3921 if( ( IsDiagonal<MT1>::value ) ||
3922 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3923 ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
3924 selectSmallSubAssignKernel( y, A, x, scalar );
3926 selectBlasSubAssignKernel( y, A, x, scalar );
// Default subtraction-assignment kernel: delegates to the target's own
// subAssign with the re-formed expression A * x * scalar.
3944 template<
typename VT1
3948 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3950 y.subAssign( A * x * scalar );
// Small sub-assign fallback: no SIMD support -> scalar default kernel.
3968 template<
typename VT1
3972 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3973 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3975 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized small subtraction-assignment kernel (y -= (A*x)*scalar).
// Same 8/4/3/2/1-wide strip structure as the small add-assign kernel, but
// stores subtract the scaled accumulators (y.load(...) - xmmN*factor).
// NOTE(review): declaration of loop index i and some braces were dropped by
// extraction; embedded integers are original source line numbers.
3994 template<
typename VT1
3998 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
3999 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4001 const size_t M( A.rows() );
4002 const size_t N( A.columns() );
4004 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
4006 const size_t ipos( remainder ? ( M &
size_t(-SIMDSIZE) ) : M );
4009 const SIMDType factor(
set( scalar ) );
// 8-wide SIMD strip.
4013 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4015 const size_t jbegin( ( IsUpper<MT1>::value )
4016 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4018 const size_t jend( ( IsLower<MT1>::value )
4019 ?(
min( i+SIMDSIZE*8UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4023 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4025 for(
size_t j=jbegin; j<jend; ++j ) {
4026 const SIMDType x1(
set( x[j] ) );
4027 xmm1 = xmm1 + A.load(i ,j) * x1;
4028 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4029 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4030 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4031 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
4032 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
4033 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
4034 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
4037 y.store( i , y.load(i ) - xmm1*factor );
4038 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4039 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4040 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4041 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4042 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4043 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4044 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
// 4-wide SIMD strip.
4047 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4049 const size_t jbegin( ( IsUpper<MT1>::value )
4050 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4052 const size_t jend( ( IsLower<MT1>::value )
4053 ?(
min( i+SIMDSIZE*4UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4057 SIMDType xmm1, xmm2, xmm3, xmm4;
4059 for(
size_t j=jbegin; j<jend; ++j ) {
4060 const SIMDType x1(
set( x[j] ) );
4061 xmm1 = xmm1 + A.load(i ,j) * x1;
4062 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4063 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4064 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4067 y.store( i , y.load(i ) - xmm1*factor );
4068 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4069 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4070 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
// 3-wide SIMD strip.
4073 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4075 const size_t jbegin( ( IsUpper<MT1>::value )
4076 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4078 const size_t jend( ( IsLower<MT1>::value )
4079 ?(
min( i+SIMDSIZE*3UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4083 SIMDType xmm1, xmm2, xmm3;
4085 for(
size_t j=jbegin; j<jend; ++j ) {
4086 const SIMDType x1(
set( x[j] ) );
4087 xmm1 = xmm1 + A.load(i ,j) * x1;
4088 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4089 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4092 y.store( i , y.load(i ) - xmm1*factor );
4093 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4094 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
// 2-wide SIMD strip.
4097 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4099 const size_t jbegin( ( IsUpper<MT1>::value )
4100 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4102 const size_t jend( ( IsLower<MT1>::value )
4103 ?(
min( i+SIMDSIZE*2UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4107 SIMDType xmm1, xmm2;
4109 for(
size_t j=jbegin; j<jend; ++j ) {
4110 const SIMDType x1(
set( x[j] ) );
4111 xmm1 = xmm1 + A.load(i ,j) * x1;
4112 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
4115 y.store( i , y.load(i ) - xmm1*factor );
4116 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
// 1-wide SIMD strip.
4119 for( ; i<ipos; i+=SIMDSIZE )
4121 const size_t jbegin( ( IsUpper<MT1>::value )
4122 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4124 const size_t jend( ( IsLower<MT1>::value )
4125 ?(
min( i+SIMDSIZE, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4131 for(
size_t j=jbegin; j<jend; ++j ) {
4132 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
4135 y.store( i, y.load(i) - xmm1*factor );
// Scalar remainder loop for unpadded operands.
4138 for( ; remainder && i<M; ++i )
4140 const size_t jbegin( ( IsUpper<MT1>::value )
4141 ?( IsStrictlyUpper<MT1>::value ? i+1UL : i )
4143 const size_t jend( ( IsLower<MT1>::value )
4144 ?(
min( i+1UL, N ) - ( IsStrictlyLower<MT1>::value ? 1UL : 0UL ) )
4150 for(
size_t j=jbegin; j<jend; ++j ) {
4151 value += A(i,j) * x[j];
4154 y[i] -= value * scalar;
// Large sub-assign fallback: no SIMD support -> scalar default kernel.
4173 template<
typename VT1
4177 static inline DisableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4178 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4180 selectDefaultSubAssignKernel( y, A, x, scalar );
4199 template<
typename VT1
4203 static inline EnableIf_< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >
4204 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4206 const size_t M( A.rows() );
4207 const size_t N( A.columns() );
4209 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT1>::value );
4211 const size_t iblock( 32768UL /
sizeof( ElementType ) );
4212 const size_t jblock( ( N < iblock )?( 8UL ):( 4UL ) );
4216 const SIMDType factor(
set( scalar ) );
4218 for(
size_t ii=0U; ii<M; ii+=iblock ) {
4219 for(
size_t jj=0UL; jj<N; jj+=jblock )
4221 const size_t jend(
min( jj+jblock, N ) );
4222 const size_t itmp(
min( ii+iblock, M ) );
4223 const size_t iend( ( IsUpper<MT1>::value )
4224 ?(
min( itmp, ( IsStrictlyUpper<MT1>::value ? jend-1UL : jend ) ) )
4227 const size_t ipos( remainder ? ( iend &
size_t(-SIMDSIZE) ) : iend );
4228 BLAZE_INTERNAL_ASSERT( !remainder || ( iend - ( iend % SIMDSIZE ) ) == ipos,
"Invalid end calculation" );
4230 size_t i( ( IsLower<MT1>::value )
4231 ?(
max( ii, ( IsStrictlyLower<MT1>::value ? jj+1UL : jj ) &
size_t(-SIMDSIZE) ) )
4234 for( ; (i+SIMDSIZE*7UL) < ipos; i+=SIMDSIZE*8UL )
4236 SIMDType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4238 for(
size_t j=jj; j<jend; ++j ) {
4239 const SIMDType x1(
set( x[j] ) );
4240 xmm1 = xmm1 + A.load(i ,j) * x1;
4241 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4242 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4243 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4244 xmm5 = xmm5 + A.load(i+SIMDSIZE*4UL,j) * x1;
4245 xmm6 = xmm6 + A.load(i+SIMDSIZE*5UL,j) * x1;
4246 xmm7 = xmm7 + A.load(i+SIMDSIZE*6UL,j) * x1;
4247 xmm8 = xmm8 + A.load(i+SIMDSIZE*7UL,j) * x1;
4250 y.store( i , y.load(i ) - xmm1*factor );
4251 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4252 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4253 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4254 y.store( i+SIMDSIZE*4UL, y.load(i+SIMDSIZE*4UL) - xmm5*factor );
4255 y.store( i+SIMDSIZE*5UL, y.load(i+SIMDSIZE*5UL) - xmm6*factor );
4256 y.store( i+SIMDSIZE*6UL, y.load(i+SIMDSIZE*6UL) - xmm7*factor );
4257 y.store( i+SIMDSIZE*7UL, y.load(i+SIMDSIZE*7UL) - xmm8*factor );
4260 for( ; (i+SIMDSIZE*3UL) < ipos; i+=SIMDSIZE*4UL )
4262 SIMDType xmm1, xmm2, xmm3, xmm4;
4264 for(
size_t j=jj; j<jend; ++j ) {
4265 const SIMDType x1(
set( x[j] ) );
4266 xmm1 = xmm1 + A.load(i ,j) * x1;
4267 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4268 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4269 xmm4 = xmm4 + A.load(i+SIMDSIZE*3UL,j) * x1;
4272 y.store( i , y.load(i ) - xmm1*factor );
4273 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4274 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4275 y.store( i+SIMDSIZE*3UL, y.load(i+SIMDSIZE*3UL) - xmm4*factor );
4278 for( ; (i+SIMDSIZE*2UL) < ipos; i+=SIMDSIZE*3UL )
4280 SIMDType xmm1, xmm2, xmm3;
4282 for(
size_t j=jj; j<jend; ++j ) {
4283 const SIMDType x1(
set( x[j] ) );
4284 xmm1 = xmm1 + A.load(i ,j) * x1;
4285 xmm2 = xmm2 + A.load(i+SIMDSIZE ,j) * x1;
4286 xmm3 = xmm3 + A.load(i+SIMDSIZE*2UL,j) * x1;
4289 y.store( i , y.load(i ) - xmm1*factor );
4290 y.store( i+SIMDSIZE , y.load(i+SIMDSIZE ) - xmm2*factor );
4291 y.store( i+SIMDSIZE*2UL, y.load(i+SIMDSIZE*2UL) - xmm3*factor );
4294 for( ; (i+SIMDSIZE) < ipos; i+=SIMDSIZE*2UL )
4296 SIMDType xmm1, xmm2;
4298 for(
size_t j=jj; j<jend; ++j ) {
4299 const SIMDType x1(
set( x[j] ) );
4300 xmm1 = xmm1 + A.load(i ,j) * x1;
4301 xmm2 = xmm2 + A.load(i+SIMDSIZE,j) * x1;
4304 y.store( i , y.load(i ) - xmm1*factor );
4305 y.store( i+SIMDSIZE, y.load(i+SIMDSIZE) - xmm2*factor );
4308 for( ; i<ipos; i+=SIMDSIZE )
4312 for(
size_t j=jj; j<jend; ++j ) {
4313 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
4316 y.store( i, y.load(i) - xmm1*factor );
4319 for( ; remainder && i<iend; ++i )
4323 for(
size_t j=jj; j<jend; ++j ) {
4324 value += A(i,j) * x[j];
4327 y[i] -= value * scalar;
4348 template<
typename VT1
4352 static inline DisableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4353 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4355 selectLargeSubAssignKernel( y, A, x, scalar );
4360 #if BLAZE_BLAS_MODE && BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
4374 template<
typename VT1
4378 static inline EnableIf_< UseBlasKernel<VT1,MT1,VT2,ST2> >
4379 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4381 typedef ElementType_<VT1> ET;
4383 if( IsTriangular<MT1>::value ) {
4384 ResultType_<VT1> tmp(
serial( scalar * x ) );
4385 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4386 subAssign( y, tmp );
4389 gemv( y, A, x, ET(-scalar), ET(1) );
4411 template<
typename VT1 >
4412 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4422 const ResultType tmp(
serial( rhs ) );
4423 multAssign( ~lhs, tmp );
4443 template<
typename VT1 >
4444 friend inline void divAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4454 const ResultType tmp(
serial( rhs ) );
4455 divAssign( ~lhs, tmp );
4477 template<
typename VT1 >
4478 friend inline EnableIf_< UseSMPAssign<VT1> >
4479 smpAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4485 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4486 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4488 if( left.rows() == 0UL ) {
4491 else if( left.columns() == 0UL ) {
4522 template<
typename VT1 >
4523 friend inline EnableIf_< UseSMPAssign<VT1> >
4524 smpAssign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4534 const ResultType tmp( rhs );
4553 template<
typename VT1 >
4554 friend inline EnableIf_< UseSMPAssign<VT1> >
4555 smpAddAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4561 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4562 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4564 if( left.rows() == 0UL || left.columns() == 0UL ) {
4598 template<
typename VT1 >
4599 friend inline EnableIf_< UseSMPAssign<VT1> >
4600 smpSubAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4606 LeftOperand_<MVM> left ( rhs.vector_.leftOperand() );
4607 RightOperand_<MVM> right( rhs.vector_.rightOperand() );
4609 if( left.rows() == 0UL || left.columns() == 0UL ) {
4644 template<
typename VT1 >
4645 friend inline EnableIf_< UseSMPAssign<VT1> >
4646 smpMultAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4656 const ResultType tmp( rhs );
4679 template<
typename VT1 >
4680 friend inline EnableIf_< UseSMPAssign<VT1> >
4681 smpDivAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4691 const ResultType tmp( rhs );
4754 template<
typename T1
4756 inline const DisableIf_< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >
4780 template<
typename MT,
typename VT >
4797 template<
typename MT,
typename VT >
4799 :
public BoolConstant< And< IsAligned<MT>, IsAligned<VT> >::value >
4815 template<
typename MT,
typename VT,
bool AF >
4820 using Type = MultExprTrait_< SubmatrixExprTrait_<const MT,AF>
4821 , SubvectorExprTrait_<const VT,AF> >;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way o...
Definition: Exception.h:235
ResultType_< MT > MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:131
Header file for auxiliary alias declarations.
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:72
Header file for mathematical functions.
constexpr bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
Header file for the Rows type trait.
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( ).
Definition: DMatDMatMultExpr.h:7800
MultTrait_< MRT, VRT > ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:208
Header file for basic type definitions.
CompositeType_< MT > MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:135
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
bool canSMPAssign() const noexcept
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:374
Header file for the serial shim.
Header file for the IsDiagonal type trait.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector) noexcept
Returns the current size/dimension of the vector.
Definition: Vector.h:258
TDMatDVecMultExpr< MT, VT > This
Type of this TDMatDVecMultExpr instance.
Definition: TDMatDVecMultExpr.h:207
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:61
Header file for the IsSame and IsStrictlySame type traits.
BLAZE_ALWAYS_INLINE MT::Iterator begin(Matrix< MT, SO > &matrix, size_t i)
Returns an iterator to the first element of row/column i.
Definition: Matrix.h:188
Availability of a SIMD multiplication for the given data types. Depending on the available instruction...
Definition: HasSIMDMult.h:162
typename SIMDTrait< T >::Type SIMDTrait_
Auxiliary alias declaration for the SIMDTrait class template.The SIMDTrait_ alias declaration provide...
Definition: SIMDTrait.h:315
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:533
EnableIf_< IsDenseVector< VT1 > > smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:193
Header file for the And class template.
const ElementType_< MT > min(const DenseMatrix< MT, SO > &dm)
Returns the smallest element of the dense matrix.
Definition: DenseMatrix.h:1669
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:88
Availability of a SIMD addition for the given data types. Depending on the available instruction set (...
Definition: HasSIMDAdd.h:162
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:723
typename MultTrait< T1, T2 >::Type MultTrait_
Auxiliary alias declaration for the MultTrait class template.The MultTrait_ alias declaration provide...
Definition: MultTrait.h:245
Header file for the Computation base class.
Type relationship analysis.This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member constant is set to true, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to false, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:138
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:88
If_< IsExpression< MT >, const MT, const MT & > LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:216
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
Header file for the VecScalarMultExpr base class.
Header file for the IsBLASCompatible type trait.
typename T::ResultType ResultType_
Alias declaration for nested ResultType type definitions.The ResultType_ alias declaration provides a...
Definition: Aliases.h:323
const ElementType_< MT > max(const DenseMatrix< MT, SO > &dm)
Returns the largest element of the dense matrix.
Definition: DenseMatrix.h:1716
EnableIf_< IsDenseMatrix< MT1 > > smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Expression object for transpose dense matrix-dense vector multiplications.The TDMatDVecMultExpr class...
Definition: Forward.h:129
TransposeType_< ResultType > TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:209
Constraint on the data type.
typename IfTrue< Condition, T1, T2 >::Type IfTrue_
Auxiliary alias declaration for the IfTrue class template.The IfTrue_ alias declaration provides a co...
Definition: If.h:109
bool isAligned() const noexcept
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:364
Compile time check for data types. This type trait tests whether or not the given types can be combine...
Definition: AreSIMDCombinable.h:121
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data.This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:72
SubvectorExprTrait_< VT, unaligned > subvector(Vector< VT, TF > &vector, size_t index, size_t size)
Creating a view on a specific subvector of the given vector.
Definition: Subvector.h:152
typename T::CompositeType CompositeType_
Alias declaration for nested CompositeType type definitions.The CompositeType_ alias declaration prov...
Definition: Aliases.h:83
ElementType_< ResultType > ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:210
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the If class template.
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:61
EnableIf_< IsDenseMatrix< MT1 > > smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception.This macro encapsulates the default way of Bl...
Definition: Exception.h:331
Header file for the HasSIMDAdd type trait.
ResultType_< VT > VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:132
typename T::ElementType ElementType_
Alias declaration for nested ElementType type definitions.The ElementType_ alias declaration provides...
Definition: Aliases.h:163
Header file for all SIMD functionality.
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:71
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices.This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:90
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix) noexcept
Returns the current number of columns of the matrix.
Definition: Matrix.h:330
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBLASCompatible.h:80
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type.In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:60
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type.In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:110
ElementType_< VRT > VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:134
Constraint on the data type.
Header file for the exception macros of the math module.
Compile time check for strictly upper triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
IfTrue_< evaluateMatrix, const MRT, MCT > LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:222
ElementType_< MRT > MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:133
Header file for all forward declarations for expression class templates.
size_t size() const noexcept
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:310
SIMDTrait_< ElementType > SIMDType
Resulting SIMD element type.
Definition: TDMatDVecMultExpr.h:211
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:265
Header file for the IsNumeric type trait.
BLAZE_ALWAYS_INLINE const EnableIf_< And< IsIntegral< T >, HasSize< T, 1UL > >, If_< IsSigned< T >, SIMDint8, SIMDuint8 > > set(T value) noexcept
Sets all values in the vector to the given 1-byte integral value.
Definition: Set.h:76
Header file for the HasConstDataAccess type trait.
DisableIf_< Or< IsComputation< MT >, IsTransExpr< MT > >, RowExprTrait_< MT > > row(Matrix< MT, SO > &matrix, size_t index)
Creating a view on a specific row of the given matrix.
Definition: Row.h:126
System settings for the BLAS mode.
Header file for the SubmatrixExprTrait class template.
Header file for the HasSIMDMult type trait.
LeftOperand leftOperand() const noexcept
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:320
bool isAliased(const T *alias) const noexcept
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:354
Header file for run time assertion macros.
typename If< T1, T2, T3 >::Type If_
Auxiliary alias declaration for the If class template.The If_ alias declaration provides a convenient...
Definition: If.h:160
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
EnableIf_< IsDenseVector< VT1 > > smpDivAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP division assignment of a vector to a dense vector.
Definition: DenseVector.h:222
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:61
Header file for the reset shim.
Constraints on the storage order of matrix types.
#define BLAZE_USE_BLAS_MATRIX_VECTOR_MULTIPLICATION
Compilation switch for the BLAS matrix/vector multiplication kernels (gemv).This compilation switch e...
Definition: BLAS.h:74
Header file for the HasMutableDataAccess type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:113
IntegralConstant< bool, B > BoolConstant
Generic wrapper for a compile time constant boolean value. The BoolConstant class template represents ...
Definition: IntegralConstant.h:100
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices.This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:61
Header file for the AreSIMDCombinable type trait.
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:55
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDMatDVecMultExpr.h:297
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
Header file for the IntegralConstant class template.
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
typename T::TransposeType TransposeType_
Alias declaration for nested TransposeType type definitions.The TransposeType_ alias declaration prov...
Definition: Aliases.h:403
Header file for the IsUpper type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:384
Header file for the MatVecMultExpr base class.
CompositeType_< VT > VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:136
TDMatDVecMultExpr(const MT &mat, const VT &vec) noexcept
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:251
Header file for the Size type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:385
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:213
If_< IsExpression< VT >, const VT, const VT & > RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:219
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:212
bool canAlias(const T *alias) const noexcept
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:342
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
RightOperand rightOperand() const noexcept
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:330
IfTrue_< evaluateVector, const VRT, VCT > RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:225