35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
111 template<
typename MT
// Compile-time switch: `value` is nonzero when either `evaluateMatrix` or
// `evaluateVector` is set (both flags are declared outside this excerpt).
// NOTE(review): the template parameter T1 is not referenced in the visible lines.
144 template<
typename T1 >
145 struct UseSMPAssign {
146 enum { value = ( evaluateMatrix || evaluateVector ) };
157 template<
typename T1,
typename T2,
typename T3 >
158 struct UseSinglePrecisionKernel {
164 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
178 template<
typename T1,
typename T2,
typename T3 >
179 struct UseDoublePrecisionKernel {
185 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
199 template<
typename T1,
typename T2,
typename T3 >
200 struct UseSinglePrecisionComplexKernel {
201 typedef complex<float> Type;
207 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
221 template<
typename T1,
typename T2,
typename T3 >
222 struct UseDoublePrecisionComplexKernel {
223 typedef complex<double> Type;
229 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Compile-time selector for the non-BLAS code path. `value` is nonzero when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel selectors
// (single/double precision, real/complex) accepts the <T1,T2,T3> combination.
242 template<
typename T1,
typename T2,
typename T3 >
243 struct UseDefaultKernel {
244 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
245 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
246 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
247 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
258 template<
typename T1,
typename T2,
typename T3 >
259 struct UseVectorizedDefaultKernel {
261 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
296 MT::vectorizable && VT::vectorizable &&
// SMP-assignability flag of the expression: set only when both evaluateMatrix
// and evaluateVector are false and both operand types (MT and VT) report
// smpAssignable themselves.
302 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
303 !evaluateVector && VT::smpAssignable };
331 mat_.columns() == 0UL )
335 return mat_(index,index) *
vec_[index];
342 :(
mat_.columns() ) );
// Number of columns in the half-open range [jbegin,jend).
345 const size_t jnum( jend - jbegin );
// size_t(-2) has every bit set except bit 0, so the mask rounds (jnum-1) down
// to an even count; jpos is therefore jbegin+1 plus an even number of columns.
// NOTE(review): (jnum - 1UL) wraps around if jnum is 0 - presumably ruled out
// by a check that is not visible in this excerpt; confirm.
346 const size_t jpos( jbegin + ( ( jnum - 1UL ) &
size_t(-2) ) + 1UL );
// Seed the accumulator with the product for column jbegin.
348 ElementType res(
mat_(index,jbegin) *
vec_[jbegin] );
// Walk the columns after jbegin two at a time, stopping before jpos
// (the loop body is not visible in this excerpt).
350 for(
size_t j=jbegin+1UL; j<jpos; j+=2UL ) {
// Add the product for column jpos (the condition guarding this statement,
// if any, is not visible in this excerpt).
354 res +=
mat_(index,jpos) *
vec_[jpos];
397 template<
typename T >
399 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
409 template<
typename T >
411 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
421 return mat_.isAligned() &&
vec_.isAligned();
457 template<
typename VT1 >
464 if( rhs.
mat_.rows() == 0UL ) {
467 else if( rhs.
mat_.columns() == 0UL ) {
480 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Dispatcher for the plain assignment y = A*x. It forwards to either
// selectSmallAssignKernel or selectBlasAssignKernel; the condition that
// chooses between the two calls is on lines not visible in this excerpt.
496 template<
typename VT1
499 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
504 selectSmallAssignKernel( y, A, x );
506 selectBlasAssignKernel( y, A, x );
525 template<
typename VT1
528 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Fallback overload of the small assignment kernel: participates in overload
// resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2> does NOT hold
// (DisableIf), and simply forwards to selectDefaultAssignKernel.
549 template<
typename VT1
552 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
553 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
555 selectDefaultAssignKernel( y, A, x );
574 template<
typename VT1
577 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
578 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
580 typedef IntrinsicTrait<ElementType> IT;
582 const size_t M( A.rows() );
583 const size_t N( A.columns() );
587 for( ; (i+8UL) <= M; i+=8UL )
589 const size_t jbegin( ( IsUpper<MT1>::value )
590 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
592 const size_t jend( ( IsLower<MT1>::value )
593 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
597 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
599 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
600 const IntrinsicType x1( x.load(j) );
601 xmm1 = xmm1 + A.load(i ,j) * x1;
602 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
603 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
604 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
605 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
606 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
607 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
608 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
612 y[i+1UL] =
sum( xmm2 );
613 y[i+2UL] =
sum( xmm3 );
614 y[i+3UL] =
sum( xmm4 );
615 y[i+4UL] =
sum( xmm5 );
616 y[i+5UL] =
sum( xmm6 );
617 y[i+6UL] =
sum( xmm7 );
618 y[i+7UL] =
sum( xmm8 );
621 for( ; (i+4UL) <= M; i+=4UL )
623 const size_t jbegin( ( IsUpper<MT1>::value )
624 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
626 const size_t jend( ( IsLower<MT1>::value )
627 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
631 IntrinsicType xmm1, xmm2, xmm3, xmm4;
633 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
634 const IntrinsicType x1( x.load(j) );
635 xmm1 = xmm1 + A.load(i ,j) * x1;
636 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
637 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
638 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
642 y[i+1UL] =
sum( xmm2 );
643 y[i+2UL] =
sum( xmm3 );
644 y[i+3UL] =
sum( xmm4 );
647 for( ; (i+3UL) <= M; i+=3UL )
649 const size_t jbegin( ( IsUpper<MT1>::value )
650 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
652 const size_t jend( ( IsLower<MT1>::value )
653 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
657 IntrinsicType xmm1, xmm2, xmm3;
659 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
660 const IntrinsicType x1( x.load(j) );
661 xmm1 = xmm1 + A.load(i ,j) * x1;
662 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
663 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
667 y[i+1UL] =
sum( xmm2 );
668 y[i+2UL] =
sum( xmm3 );
671 for( ; (i+2UL) <= M; i+=2UL )
673 const size_t jbegin( ( IsUpper<MT1>::value )
674 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
676 const size_t jend( ( IsLower<MT1>::value )
677 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
681 IntrinsicType xmm1, xmm2;
683 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
684 const IntrinsicType x1( x.load(j) );
685 xmm1 = xmm1 + A.load(i ,j) * x1;
686 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
690 y[i+1UL] =
sum( xmm2 );
695 const size_t jbegin( ( IsUpper<MT1>::value )
696 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
698 const size_t jend( ( IsLower<MT1>::value )
699 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
705 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
706 xmm1 = xmm1 + A.load(i,j) * x.load(j);
// Fallback overload of the large assignment kernel: participates in overload
// resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2> does NOT hold
// (DisableIf), and simply forwards to selectDefaultAssignKernel.
729 template<
typename VT1
732 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
733 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
735 selectDefaultAssignKernel( y, A, x );
754 template<
typename VT1
757 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
758 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
760 typedef IntrinsicTrait<ElementType> IT;
762 const size_t M( A.rows() );
763 const size_t N( A.columns() );
769 for( ; (i+8UL) <= M; i+=8UL )
771 const size_t jbegin( ( IsUpper<MT1>::value )
772 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
774 const size_t jend( ( IsLower<MT1>::value )
775 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
785 const IntrinsicType x1( x.load(j ) );
786 const IntrinsicType x2( x.load(j1) );
787 const IntrinsicType x3( x.load(j2) );
788 const IntrinsicType x4( x.load(j3) );
789 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
790 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
791 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
792 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
793 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
794 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
795 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
796 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
801 const IntrinsicType x1( x.load(j ) );
802 const IntrinsicType x2( x.load(j1) );
803 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
804 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
805 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
806 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
807 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
808 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
809 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
810 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
814 const IntrinsicType x1( x.load(j) );
815 y[i ] +=
sum( A.load(i ,j) * x1 );
816 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
817 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
818 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
819 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
820 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
821 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
822 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
826 for( ; (i+4UL) <= M; i+=4UL )
828 const size_t jbegin( ( IsUpper<MT1>::value )
829 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
831 const size_t jend( ( IsLower<MT1>::value )
832 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
842 const IntrinsicType x1( x.load(j ) );
843 const IntrinsicType x2( x.load(j1) );
844 const IntrinsicType x3( x.load(j2) );
845 const IntrinsicType x4( x.load(j3) );
846 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
847 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
848 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
849 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
854 const IntrinsicType x1( x.load(j ) );
855 const IntrinsicType x2( x.load(j1) );
856 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
857 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
858 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
859 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
863 const IntrinsicType x1( x.load(j) );
864 y[i ] +=
sum( A.load(i ,j) * x1 );
865 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
866 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
867 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
871 for( ; (i+2UL) <= M; i+=2UL )
873 const size_t jbegin( ( IsUpper<MT1>::value )
874 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
876 const size_t jend( ( IsLower<MT1>::value )
877 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
887 const IntrinsicType x1( x.load(j ) );
888 const IntrinsicType x2( x.load(j1) );
889 const IntrinsicType x3( x.load(j2) );
890 const IntrinsicType x4( x.load(j3) );
891 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
892 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
897 const IntrinsicType x1( x.load(j ) );
898 const IntrinsicType x2( x.load(j1) );
899 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
900 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
904 const IntrinsicType x1( x.load(j) );
905 y[i ] +=
sum( A.load(i ,j) * x1 );
906 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
912 const size_t jbegin( ( IsUpper<MT1>::value )
913 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
915 const size_t jend( ( IsLower<MT1>::value )
916 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
926 const IntrinsicType x1( x.load(j ) );
927 const IntrinsicType x2( x.load(j1) );
928 const IntrinsicType x3( x.load(j2) );
929 const IntrinsicType x4( x.load(j3) );
930 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
935 const IntrinsicType x1( x.load(j ) );
936 const IntrinsicType x2( x.load(j1) );
937 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
941 const IntrinsicType x1( x.load(j) );
942 y[i] +=
sum( A.load(i,j) * x1 );
// Non-BLAS overload of the BLAS assignment dispatcher: enabled when
// UseDefaultKernel<VT1,MT1,VT2> holds, in which case the work is handed to
// selectLargeAssignKernel instead of a BLAS routine.
963 template<
typename VT1
966 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
967 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
969 selectLargeAssignKernel( y, A, x );
// BLAS assignment overload for single-precision real operands, enabled via
// UseSinglePrecisionKernel<VT1,MT1,VT2>.
989 template<
typename VT1
992 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
993 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices go through strmv, passing CblasLower or CblasUpper
// according to IsLower<MT1>. strmv receives only y and A, so y presumably
// already holds a copy of x here (that statement is not visible) - confirm.
995 if( IsTriangular<MT1>::value ) {
997 strmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) sgemv is called with the scalars
// 1.0F and 0.0F - presumably alpha and beta, i.e. y = A*x; confirm against
// the wrapper's signature.
1000 sgemv( y, A, x, 1.0F, 0.0F );
// BLAS assignment overload for double-precision real operands, enabled via
// UseDoublePrecisionKernel<VT1,MT1,VT2>.
1022 template<
typename VT1
1025 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1026 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices go through dtrmv, passing CblasLower or CblasUpper
// according to IsLower<MT1>. dtrmv receives only y and A, so y presumably
// already holds a copy of x here (that statement is not visible) - confirm.
1028 if( IsTriangular<MT1>::value ) {
1030 dtrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) dgemv is called with the scalars
// 1.0 and 0.0 - presumably alpha and beta, i.e. y = A*x; confirm against
// the wrapper's signature.
1033 dgemv( y, A, x, 1.0, 0.0 );
// BLAS assignment overload for single-precision complex operands, enabled via
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2>.
1055 template<
typename VT1
1058 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1059 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices go through ctrmv, passing CblasLower or CblasUpper
// according to IsLower<MT1>. ctrmv receives only y and A, so y presumably
// already holds a copy of x here (that statement is not visible) - confirm.
1061 if( IsTriangular<MT1>::value ) {
1063 ctrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) cgemv is called with the complex
// scalars (1,0) and (0,0) - presumably alpha and beta, i.e. y = A*x; confirm
// against the wrapper's signature.
1066 cgemv( y, A, x, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS assignment overload for double-precision complex operands, enabled via
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2>.
1088 template<
typename VT1
1091 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1092 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices go through ztrmv, passing CblasLower or CblasUpper
// according to IsLower<MT1>. ztrmv receives only y and A, so y presumably
// already holds a copy of x here (that statement is not visible) - confirm.
1094 if( IsTriangular<MT1>::value ) {
1096 ztrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) zgemv is called with the complex
// scalars (1,0) and (0,0) - presumably alpha and beta, i.e. y = A*x; confirm
// against the wrapper's signature.
1099 zgemv( y, A, x, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
1119 template<
typename VT1 >
1130 const ResultType tmp(
serial( rhs ) );
1149 template<
typename VT1 >
1156 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1160 LT A(
serial( rhs.mat_ ) );
1161 RT x(
serial( rhs.vec_ ) );
1168 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Dispatcher for the addition assignment y += A*x.
1184 template<
typename VT1
1187 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// The small kernel is chosen when the matrix is diagonal, when the matrix
// operand type MT is a computation that is not evaluated up front, or when a
// third condition holds that sits on a line not visible in this excerpt.
// Note that the second test inspects MT, not the kernel parameter MT1.
1189 if( ( IsDiagonal<MT1>::value ) ||
1190 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1192 selectSmallAddAssignKernel( y, A, x );
// Presumably the else branch (the else line itself is not visible).
1194 selectBlasAddAssignKernel( y, A, x );
// Default addition-assignment kernel: builds the product expression A * x and
// passes it to y.addAssign(), leaving the actual evaluation to that call.
1213 template<
typename VT1
1216 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1218 y.addAssign( A * x );
// Fallback overload of the small addition-assignment kernel: participates in
// overload resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2> does
// NOT hold (DisableIf), and simply forwards to selectDefaultAddAssignKernel.
1237 template<
typename VT1
1240 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1241 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1243 selectDefaultAddAssignKernel( y, A, x );
// Vectorized small addition-assignment kernel (y += A*x), enabled when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> holds. Rows of A are handled in
// blocks of 8, 4, 3 and 2, followed by a final single-row step. Several lines
// (braces, the declaration of i, the non-triangular alternatives of
// jbegin/jend) are not visible in this excerpt.
1262 template<
typename VT1
1265 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1266 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1268 typedef IntrinsicTrait<ElementType> IT;
1270 const size_t M( A.rows() );
1271 const size_t N( A.columns() );
// --- Blocks of 8 rows ---
1275 for( ; (i+8UL) <= M; i+=8UL )
// For upper matrices the column range starts at i (i+1 if strictly upper),
// masked with size_t(-IT::size); that mask rounds down to a multiple of
// IT::size provided IT::size is a power of two - presumably so; confirm.
1277 const size_t jbegin( ( IsUpper<MT1>::value )
1278 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
// For lower matrices the column range of this row block ends at i+8
// (i+7 if strictly lower).
1280 const size_t jend( ( IsLower<MT1>::value )
1281 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// One accumulator per row of the block.
// NOTE(review): relies on IntrinsicType's default constructor yielding zero - confirm.
1285 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1287 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
// The packed chunk of x is loaded once and reused for all eight rows.
1288 const IntrinsicType x1( x.load(j) );
1289 xmm1 = xmm1 + A.load(i ,j) * x1;
1290 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1291 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1292 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1293 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1294 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1295 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1296 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Reduce each accumulator with sum() and add it to the matching element of y.
1299 y[i ] +=
sum( xmm1 );
1300 y[i+1UL] +=
sum( xmm2 );
1301 y[i+2UL] +=
sum( xmm3 );
1302 y[i+3UL] +=
sum( xmm4 );
1303 y[i+4UL] +=
sum( xmm5 );
1304 y[i+5UL] +=
sum( xmm6 );
1305 y[i+6UL] +=
sum( xmm7 );
1306 y[i+7UL] +=
sum( xmm8 );
// --- Blocks of 4 rows (same scheme; lower-matrix range ends at i+4 / i+3) ---
1309 for( ; (i+4UL) <= M; i+=4UL )
1311 const size_t jbegin( ( IsUpper<MT1>::value )
1312 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1314 const size_t jend( ( IsLower<MT1>::value )
1315 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1319 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1321 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
1322 const IntrinsicType x1( x.load(j) );
1323 xmm1 = xmm1 + A.load(i ,j) * x1;
1324 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1325 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1326 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1329 y[i ] +=
sum( xmm1 );
1330 y[i+1UL] +=
sum( xmm2 );
1331 y[i+2UL] +=
sum( xmm3 );
1332 y[i+3UL] +=
sum( xmm4 );
// --- Blocks of 3 rows (lower-matrix range ends at i+3 / i+2) ---
1335 for( ; (i+3UL) <= M; i+=3UL )
1337 const size_t jbegin( ( IsUpper<MT1>::value )
1338 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1340 const size_t jend( ( IsLower<MT1>::value )
1341 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1345 IntrinsicType xmm1, xmm2, xmm3;
1347 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
1348 const IntrinsicType x1( x.load(j) );
1349 xmm1 = xmm1 + A.load(i ,j) * x1;
1350 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1351 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1354 y[i ] +=
sum( xmm1 );
1355 y[i+1UL] +=
sum( xmm2 );
1356 y[i+2UL] +=
sum( xmm3 );
// --- Blocks of 2 rows (lower-matrix range ends at i+2 / i+1) ---
1359 for( ; (i+2UL) <= M; i+=2UL )
1361 const size_t jbegin( ( IsUpper<MT1>::value )
1362 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1364 const size_t jend( ( IsLower<MT1>::value )
1365 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1369 IntrinsicType xmm1, xmm2;
1371 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
1372 const IntrinsicType x1( x.load(j) );
1373 xmm1 = xmm1 + A.load(i ,j) * x1;
1374 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1377 y[i ] +=
sum( xmm1 );
1378 y[i+1UL] +=
sum( xmm2 );
// --- Final single row (its guard and the declaration of xmm1 are not
// visible in this excerpt; lower-matrix range ends at i+1 / i) ---
1383 const size_t jbegin( ( IsUpper<MT1>::value )
1384 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1386 const size_t jend( ( IsLower<MT1>::value )
1387 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1393 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
1394 xmm1 = xmm1 + A.load(i,j) * x.load(j);
1397 y[i] +=
sum( xmm1 );
// Fallback overload of the large addition-assignment kernel: participates in
// overload resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2> does
// NOT hold (DisableIf), and simply forwards to selectDefaultAddAssignKernel.
1417 template<
typename VT1
1420 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1421 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1423 selectDefaultAddAssignKernel( y, A, x );
1442 template<
typename VT1
1445 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1446 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1448 typedef IntrinsicTrait<ElementType> IT;
1450 const size_t M( A.rows() );
1451 const size_t N( A.columns() );
1455 for( ; (i+8UL) <= M; i+=8UL )
1457 const size_t jbegin( ( IsUpper<MT1>::value )
1458 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1460 const size_t jend( ( IsLower<MT1>::value )
1461 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1471 const IntrinsicType x1( x.load(j ) );
1472 const IntrinsicType x2( x.load(j1) );
1473 const IntrinsicType x3( x.load(j2) );
1474 const IntrinsicType x4( x.load(j3) );
1475 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1476 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1477 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1478 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1479 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1480 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1481 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1482 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1487 const IntrinsicType x1( x.load(j ) );
1488 const IntrinsicType x2( x.load(j1) );
1489 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1490 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1491 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1492 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1493 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1494 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1495 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1496 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1500 const IntrinsicType x1( x.load(j) );
1501 y[i ] +=
sum( A.load(i ,j) * x1 );
1502 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1503 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1504 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1505 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1506 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1507 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1508 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
1512 for( ; (i+4UL) <= M; i+=4UL )
1514 const size_t jbegin( ( IsUpper<MT1>::value )
1515 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1517 const size_t jend( ( IsLower<MT1>::value )
1518 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1528 const IntrinsicType x1( x.load(j ) );
1529 const IntrinsicType x2( x.load(j1) );
1530 const IntrinsicType x3( x.load(j2) );
1531 const IntrinsicType x4( x.load(j3) );
1532 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1533 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1534 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1535 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1540 const IntrinsicType x1( x.load(j ) );
1541 const IntrinsicType x2( x.load(j1) );
1542 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1543 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1544 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1545 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1549 const IntrinsicType x1( x.load(j) );
1550 y[i ] +=
sum( A.load(i ,j) * x1 );
1551 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1552 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1553 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1557 for( ; (i+2UL) <= M; i+=2UL )
1559 const size_t jbegin( ( IsUpper<MT1>::value )
1560 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1562 const size_t jend( ( IsLower<MT1>::value )
1563 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1573 const IntrinsicType x1( x.load(j ) );
1574 const IntrinsicType x2( x.load(j1) );
1575 const IntrinsicType x3( x.load(j2) );
1576 const IntrinsicType x4( x.load(j3) );
1577 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1578 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1583 const IntrinsicType x1( x.load(j ) );
1584 const IntrinsicType x2( x.load(j1) );
1585 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1586 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1590 const IntrinsicType x1( x.load(j) );
1591 y[i ] +=
sum( A.load(i ,j) * x1 );
1592 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1598 const size_t jbegin( ( IsUpper<MT1>::value )
1599 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1601 const size_t jend( ( IsLower<MT1>::value )
1602 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1612 const IntrinsicType x1( x.load(j ) );
1613 const IntrinsicType x2( x.load(j1) );
1614 const IntrinsicType x3( x.load(j2) );
1615 const IntrinsicType x4( x.load(j3) );
1616 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1621 const IntrinsicType x1( x.load(j ) );
1622 const IntrinsicType x2( x.load(j1) );
1623 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1627 const IntrinsicType x1( x.load(j) );
1628 y[i] +=
sum( A.load(i,j) * x1 );
// Non-BLAS overload of the BLAS addition-assignment dispatcher: enabled when
// UseDefaultKernel<VT1,MT1,VT2> holds, in which case the work is handed to
// selectLargeAddAssignKernel instead of a BLAS routine.
1649 template<
typename VT1
1652 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1653 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1655 selectLargeAddAssignKernel( y, A, x );
// BLAS addition-assignment overload for single-precision real operands,
// enabled via UseSinglePrecisionKernel<VT1,MT1,VT2>.
1675 template<
typename VT1
1678 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1679 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices: strmv operates on `tmp`, which is declared on a line
// not visible here - presumably a copy of x whose result is afterwards added
// to y; confirm. CblasLower/CblasUpper is picked according to IsLower<MT1>.
1681 if( IsTriangular<MT1>::value ) {
1683 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) sgemv is called with the scalars
// 1.0F and 1.0F - presumably alpha and beta, i.e. y = A*x + y; confirm
// against the wrapper's signature.
1687 sgemv( y, A, x, 1.0F, 1.0F );
// BLAS addition-assignment overload for double-precision real operands,
// enabled via UseDoublePrecisionKernel<VT1,MT1,VT2>.
1709 template<
typename VT1
1712 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1713 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices: dtrmv operates on `tmp`, which is declared on a line
// not visible here - presumably a copy of x whose result is afterwards added
// to y; confirm. CblasLower/CblasUpper is picked according to IsLower<MT1>.
1715 if( IsTriangular<MT1>::value ) {
1717 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) dgemv is called with the scalars
// 1.0 and 1.0 - presumably alpha and beta, i.e. y = A*x + y; confirm
// against the wrapper's signature.
1721 dgemv( y, A, x, 1.0, 1.0 );
// BLAS addition-assignment overload for single-precision complex operands,
// enabled via UseSinglePrecisionComplexKernel<VT1,MT1,VT2>.
1743 template<
typename VT1
1746 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1747 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices: ctrmv operates on `tmp`, which is declared on a line
// not visible here - presumably a copy of x whose result is afterwards added
// to y; confirm. CblasLower/CblasUpper is picked according to IsLower<MT1>.
1749 if( IsTriangular<MT1>::value ) {
1751 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) cgemv is called with the complex
// scalars (1,0) and (1,0) - presumably alpha and beta, i.e. y = A*x + y;
// confirm against the wrapper's signature.
1755 cgemv( y, A, x, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS addition-assignment overload for double-precision complex operands,
// enabled via UseDoublePrecisionComplexKernel<VT1,MT1,VT2>.
1777 template<
typename VT1
1780 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1781 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Triangular matrices: ztrmv operates on `tmp`, which is declared on a line
// not visible here - presumably a copy of x whose result is afterwards added
// to y; confirm. CblasLower/CblasUpper is picked according to IsLower<MT1>.
1783 if( IsTriangular<MT1>::value ) {
1785 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise (the else line is not visible) zgemv is called with the complex
// scalars (1,0) and (1,0) - presumably alpha and beta, i.e. y = A*x + y;
// confirm against the wrapper's signature.
1789 zgemv( y, A, x, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
1813 template<
typename VT1 >
1820 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1824 LT A(
serial( rhs.mat_ ) );
1825 RT x(
serial( rhs.vec_ ) );
1832 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Dispatcher for the subtraction assignment y -= A*x.
1848 template<
typename VT1
1851 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// The small kernel is chosen when the matrix is diagonal, when the matrix
// operand type MT is a computation that is not evaluated up front, or when a
// third condition holds that sits on a line not visible in this excerpt.
// Note that the second test inspects MT, not the kernel parameter MT1.
1853 if( ( IsDiagonal<MT1>::value ) ||
1854 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1856 selectSmallSubAssignKernel( y, A, x );
// Presumably the else branch (the else line itself is not visible).
1858 selectBlasSubAssignKernel( y, A, x );
// Default subtraction-assignment kernel: builds the product expression A * x
// and passes it to y.subAssign(), leaving the actual evaluation to that call.
1877 template<
typename VT1
1880 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1882 y.subAssign( A * x );
// Fallback overload of the small subtraction-assignment kernel: participates
// in overload resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2>
// does NOT hold (DisableIf), and simply forwards to selectDefaultSubAssignKernel.
1901 template<
typename VT1
1904 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1905 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1907 selectDefaultSubAssignKernel( y, A, x );
1926 template<
typename VT1
1929 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1930 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1932 typedef IntrinsicTrait<ElementType> IT;
1934 const size_t M( A.rows() );
1935 const size_t N( A.columns() );
1939 for( ; (i+8UL) <= M; i+=8UL )
1941 const size_t jbegin( ( IsUpper<MT1>::value )
1942 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1944 const size_t jend( ( IsLower<MT1>::value )
1945 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1949 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1951 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
1952 const IntrinsicType x1( x.load(j) );
1953 xmm1 = xmm1 + A.load(i ,j) * x1;
1954 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1955 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1956 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1957 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1958 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1959 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1960 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1963 y[i ] -=
sum( xmm1 );
1964 y[i+1UL] -=
sum( xmm2 );
1965 y[i+2UL] -=
sum( xmm3 );
1966 y[i+3UL] -=
sum( xmm4 );
1967 y[i+4UL] -=
sum( xmm5 );
1968 y[i+5UL] -=
sum( xmm6 );
1969 y[i+6UL] -=
sum( xmm7 );
1970 y[i+7UL] -=
sum( xmm8 );
1973 for( ; (i+4UL) <= M; i+=4UL )
1975 const size_t jbegin( ( IsUpper<MT1>::value )
1976 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1978 const size_t jend( ( IsLower<MT1>::value )
1979 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1983 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1985 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
1986 const IntrinsicType x1( x.load(j) );
1987 xmm1 = xmm1 + A.load(i ,j) * x1;
1988 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1989 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1990 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1993 y[i ] -=
sum( xmm1 );
1994 y[i+1UL] -=
sum( xmm2 );
1995 y[i+2UL] -=
sum( xmm3 );
1996 y[i+3UL] -=
sum( xmm4 );
1999 for( ; (i+3UL) <= M; i+=3UL )
2001 const size_t jbegin( ( IsUpper<MT1>::value )
2002 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2004 const size_t jend( ( IsLower<MT1>::value )
2005 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
2009 IntrinsicType xmm1, xmm2, xmm3;
2011 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
2012 const IntrinsicType x1( x.load(j) );
2013 xmm1 = xmm1 + A.load(i ,j) * x1;
2014 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2015 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2018 y[i ] -=
sum( xmm1 );
2019 y[i+1UL] -=
sum( xmm2 );
2020 y[i+2UL] -=
sum( xmm3 );
2023 for( ; (i+2UL) <= M; i+=2UL )
2025 const size_t jbegin( ( IsUpper<MT1>::value )
2026 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2028 const size_t jend( ( IsLower<MT1>::value )
2029 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2033 IntrinsicType xmm1, xmm2;
2035 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
2036 const IntrinsicType x1( x.load(j) );
2037 xmm1 = xmm1 + A.load(i ,j) * x1;
2038 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2041 y[i ] -=
sum( xmm1 );
2042 y[i+1UL] -=
sum( xmm2 );
2047 const size_t jbegin( ( IsUpper<MT1>::value )
2048 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2050 const size_t jend( ( IsLower<MT1>::value )
2051 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2057 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
2058 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2061 y[i] -=
sum( xmm1 );
// Fallback overload of the large subtraction-assignment kernel: participates
// in overload resolution only when UseVectorizedDefaultKernel<VT1,MT1,VT2>
// does NOT hold (DisableIf), and simply forwards to selectDefaultSubAssignKernel.
2081 template<
typename VT1
2084 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
2085 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2087 selectDefaultSubAssignKernel( y, A, x );
// Vectorized overload of the large subtraction-assignment kernel, available
// when UseVectorizedDefaultKernel<VT1,MT1,VT2> is satisfied (EnableIf).
// The statements below handle the rows of A in blocks of 8, 4 and 2 rows plus
// a single-row tail and subtract from each y[i+k] the horizontal sum of the
// products of intrinsic loads from row i+k of A and from x.
// NOTE(review): j1, j2 and j3 are presumably j + 1/2/3 * IT::size, and the
// y[...] updates presumably sit inside column loops over [jbegin,jend) —
// confirm against the declarations of i, j, j1, j2, j3.
2106 template<
typename VT1
2109 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
2110 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2112 typedef IntrinsicTrait<ElementType> IT;
2114 const size_t M( A.rows() );
2115 const size_t N( A.columns() );
// Blocks of eight rows.
2119 for( ; (i+8UL) <= M; i+=8UL )
// Column range of the block. For upper matrices the start column (i, or i+1
// if strictly upper) is masked with size_t(-IT::size), which rounds it down
// to a multiple of IT::size (assuming IT::size is a power of two). For lower
// matrices the range ends at i+8 (i+7 if strictly lower).
2121 const size_t jbegin( ( IsUpper<MT1>::value )
2122 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2124 const size_t jend( ( IsLower<MT1>::value )
2125 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// Four intrinsic vectors of x (columns j, j1, j2, j3) combined per update.
2135 const IntrinsicType x1( x.load(j ) );
2136 const IntrinsicType x2( x.load(j1) );
2137 const IntrinsicType x3( x.load(j2) );
2138 const IntrinsicType x4( x.load(j3) );
2139 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2140 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2141 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2142 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2143 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2144 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2145 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2146 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two intrinsic vectors of x (columns j, j1) combined per update.
2151 const IntrinsicType x1( x.load(j ) );
2152 const IntrinsicType x2( x.load(j1) );
2153 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2154 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2155 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2156 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2157 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2158 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2159 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2160 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// A single intrinsic vector of x (column j) per update.
2164 const IntrinsicType x1( x.load(j) );
2165 y[i ] -=
sum( A.load(i ,j) * x1 );
2166 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2167 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2168 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2169 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2170 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2171 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2172 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
// Blocks of four rows; same scheme, lower matrices end at i+4 (i+3 if strict).
2176 for( ; (i+4UL) <= M; i+=4UL )
2178 const size_t jbegin( ( IsUpper<MT1>::value )
2179 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2181 const size_t jend( ( IsLower<MT1>::value )
2182 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
2192 const IntrinsicType x1( x.load(j ) );
2193 const IntrinsicType x2( x.load(j1) );
2194 const IntrinsicType x3( x.load(j2) );
2195 const IntrinsicType x4( x.load(j3) );
2196 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2197 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2198 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2199 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2204 const IntrinsicType x1( x.load(j ) );
2205 const IntrinsicType x2( x.load(j1) );
2206 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2207 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2208 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2209 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2213 const IntrinsicType x1( x.load(j) );
2214 y[i ] -=
sum( A.load(i ,j) * x1 );
2215 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2216 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2217 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
// Blocks of two rows; lower matrices end at i+2 (i+1 if strict).
2221 for( ; (i+2UL) <= M; i+=2UL )
2223 const size_t jbegin( ( IsUpper<MT1>::value )
2224 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2226 const size_t jend( ( IsLower<MT1>::value )
2227 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2237 const IntrinsicType x1( x.load(j ) );
2238 const IntrinsicType x2( x.load(j1) );
2239 const IntrinsicType x3( x.load(j2) );
2240 const IntrinsicType x4( x.load(j3) );
2241 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2242 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2247 const IntrinsicType x1( x.load(j ) );
2248 const IntrinsicType x2( x.load(j1) );
2249 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2250 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2254 const IntrinsicType x1( x.load(j) );
2255 y[i ] -=
sum( A.load(i ,j) * x1 );
2256 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
// Single-row variant; lower matrices end at i+1 (i if strict).
// NOTE(review): presumably guarded by a check that one row is left — confirm.
2262 const size_t jbegin( ( IsUpper<MT1>::value )
2263 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2265 const size_t jend( ( IsLower<MT1>::value )
2266 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2276 const IntrinsicType x1( x.load(j ) );
2277 const IntrinsicType x2( x.load(j1) );
2278 const IntrinsicType x3( x.load(j2) );
2279 const IntrinsicType x4( x.load(j3) );
2280 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2285 const IntrinsicType x1( x.load(j ) );
2286 const IntrinsicType x2( x.load(j1) );
2287 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2291 const IntrinsicType x1( x.load(j) );
2292 y[i] -=
sum( A.load(i,j) * x1 );
// Default overload of the BLAS subtraction-assignment kernel: enabled when
// UseDefaultKernel<VT1,MT1,VT2> holds, in which case no BLAS routine is used
// and the call is forwarded to selectLargeSubAssignKernel().
2313 template<
typename VT1
2316 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
2317 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2319 selectLargeSubAssignKernel( y, A, x );
// Single precision BLAS overload of the subtraction-assignment kernel,
// enabled when UseSinglePrecisionKernel<VT1,MT1,VT2> holds.
// Triangular matrices go through strmv() on `tmp`, with the lower/upper flag
// chosen from IsLower<MT1>; the sgemv() call passes the factors -1.0F and 1.0F.
// NOTE(review): `tmp` is presumably a copy of x that is subtracted from y
// after strmv(), and sgemv() presumably sits in the non-triangular branch
// computing y = 1*y - 1*A*x — confirm against the wrapper signatures.
2339 template<
typename VT1
2342 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
2343 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2345 if( IsTriangular<MT1>::value ) {
2347 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2351 sgemv( y, A, x, -1.0F, 1.0F );
// Double precision BLAS overload of the subtraction-assignment kernel,
// enabled when UseDoublePrecisionKernel<VT1,MT1,VT2> holds.
// Triangular matrices go through dtrmv() on `tmp`, with the lower/upper flag
// chosen from IsLower<MT1>; the dgemv() call passes the factors -1.0 and 1.0.
// NOTE(review): `tmp` is presumably a copy of x that is subtracted from y
// after dtrmv(), and dgemv() presumably sits in the non-triangular branch
// computing y = 1*y - 1*A*x — confirm against the wrapper signatures.
2373 template<
typename VT1
2376 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
2377 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2379 if( IsTriangular<MT1>::value ) {
2381 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2385 dgemv( y, A, x, -1.0, 1.0 );
// Single precision complex BLAS overload of the subtraction-assignment
// kernel, enabled when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Triangular matrices go through ctrmv() on `tmp`; the cgemv() call passes
// the complex factors (-1,0) and (1,0).
// NOTE(review): `tmp` is presumably a copy of x that is subtracted from y
// after ctrmv(), and cgemv() presumably sits in the non-triangular branch
// computing y = y - A*x — confirm against the wrapper signatures.
2407 template<
typename VT1
2410 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2411 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2413 if( IsTriangular<MT1>::value ) {
2415 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2419 cgemv( y, A, x, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// Double precision complex BLAS overload of the subtraction-assignment
// kernel, enabled when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Triangular matrices go through ztrmv() on `tmp`; the zgemv() call passes
// the complex factors (-1,0) and (1,0).
// NOTE(review): `tmp` is presumably a copy of x that is subtracted from y
// after ztrmv(), and zgemv() presumably sits in the non-triangular branch
// computing y = y - A*x — confirm against the wrapper signatures.
2441 template<
typename VT1
2444 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2445 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2447 if( IsTriangular<MT1>::value ) {
2449 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2453 zgemv( y, A, x, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Evaluates the expression serially (serial( rhs )) into a temporary of type
// ResultType. NOTE(review): the enclosing function signature is not part of
// these statements — confirm which assignment operation this belongs to.
2477 template<
typename VT1 >
2488 const ResultType tmp(
serial( rhs ) );
// The following friend templates are all restricted by
// EnableIf< UseSMPAssign<VT1> >, i.e. they only participate when at least one
// operand has to be evaluated (see UseSMPAssign).
// This one distinguishes a matrix with zero rows from one with zero columns.
2513 template<
typename VT1 >
2514 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2521 if( rhs.mat_.rows() == 0UL ) {
2524 else if( rhs.mat_.columns() == 0UL ) {
// This one materializes the whole expression into a ResultType temporary.
2557 template<
typename VT1 >
2558 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2569 const ResultType tmp( rhs );
// This one treats zero rows and zero columns as one common special case.
2590 template<
typename VT1 >
2591 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2598 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Same empty-operand check as the previous function.
2634 template<
typename VT1 >
2635 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2642 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// This one again materializes the whole expression into a ResultType temporary.
2678 template<
typename VT1 >
2679 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2690 const ResultType tmp( rhs );
// Expression class for a scaled dense matrix/dense vector product: it models
// DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false > as a DenseVector
// and additionally derives privately from VecScalarMultExpr and Computation.
2729 template<
typename MT
2733 :
public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
2734 ,
private VecScalarMultExpr
2735 ,
private Computation
// Shorthand for the wrapped matrix/vector multiplication expression.
2739 typedef DMatDVecMultExpr<MT,VT> MVM;
// Compilation switch for the matrix operand. It is set when MT is a
// computation whose element type MET equals the vector element type VET and
// is BLAS compatible, or when MT requires an intermediate evaluation. The
// switch selects the type LT used for the matrix operand further below.
2751 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2752 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compilation switch for the vector operand. It is set when VT is a
// computation or when VT itself requires an intermediate evaluation, and it
// selects the type RT used for the vector operand further below.
// The second condition has to query the vector type VT: testing the matrix
// type MT here would tie the vector's evaluation to a property of the matrix.
2757 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Helper trait for the SMP assignment overloads: value is set whenever the
// matrix or the vector operand has to be evaluated. The template parameter T1
// does not enter the condition.
2765 template<
typename T1 >
2766 struct UseSMPAssign {
2767 enum { value = ( evaluateMatrix || evaluateVector ) };
// Compile-time test for the single precision BLAS kernel. T1/T2/T3 are used
// as target vector, matrix and vector types and T4 as the scalar type at the
// call sites in this class. The conditions listed here require mutable data
// access on T1, const data access on T2 and T3, a non-diagonal T2,
// vectorizable operands, float element types for all three operands and a
// non-complex T4.
// NOTE(review): the opening of the enum precedes the first condition and may
// carry a further leading condition — confirm against the full definition.
2776 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2777 struct UseSinglePrecisionKernel {
2779 HasMutableDataAccess<T1>::value &&
2780 HasConstDataAccess<T2>::value &&
2781 HasConstDataAccess<T3>::value &&
2782 !IsDiagonal<T2>::value &&
2783 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2784 IsFloat<typename T1::ElementType>::value &&
2785 IsFloat<typename T2::ElementType>::value &&
2786 IsFloat<typename T3::ElementType>::value &&
2787 !IsComplex<T4>::value };
// Compile-time test for the double precision BLAS kernel. The conditions
// listed here require mutable data access on T1, const data access on T2 and
// T3, a non-diagonal T2, vectorizable operands, double element types for all
// three operands and a non-complex scalar type T4.
// NOTE(review): the opening of the enum precedes the first condition and may
// carry a further leading condition — confirm against the full definition.
2796 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2797 struct UseDoublePrecisionKernel {
2799 HasMutableDataAccess<T1>::value &&
2800 HasConstDataAccess<T2>::value &&
2801 HasConstDataAccess<T3>::value &&
2802 !IsDiagonal<T2>::value &&
2803 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2804 IsDouble<typename T1::ElementType>::value &&
2805 IsDouble<typename T2::ElementType>::value &&
2806 IsDouble<typename T3::ElementType>::value &&
2807 !IsComplex<T4>::value };
// Compile-time test for the single precision complex BLAS kernel. The
// conditions listed here require mutable data access on T1, const data access
// on T2 and T3, a non-diagonal T2, vectorizable operands and complex<float>
// as the element type of all three operands. Unlike the real-valued tests
// this trait takes no scalar type parameter.
// NOTE(review): the opening of the enum precedes the first condition and may
// carry a further leading condition — confirm against the full definition.
2816 template<
typename T1,
typename T2,
typename T3 >
2817 struct UseSinglePrecisionComplexKernel {
2818 typedef complex<float> Type;
2820 HasMutableDataAccess<T1>::value &&
2821 HasConstDataAccess<T2>::value &&
2822 HasConstDataAccess<T3>::value &&
2823 !IsDiagonal<T2>::value &&
2824 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2825 IsSame<typename T1::ElementType,Type>::value &&
2826 IsSame<typename T2::ElementType,Type>::value &&
2827 IsSame<typename T3::ElementType,Type>::value };
// Compile-time test for the double precision complex BLAS kernel. The
// conditions listed here require mutable data access on T1, const data access
// on T2 and T3, a non-diagonal T2, vectorizable operands and complex<double>
// as the element type of all three operands. Unlike the real-valued tests
// this trait takes no scalar type parameter.
// NOTE(review): the opening of the enum precedes the first condition and may
// carry a further leading condition — confirm against the full definition.
2836 template<
typename T1,
typename T2,
typename T3 >
2837 struct UseDoublePrecisionComplexKernel {
2838 typedef complex<double> Type;
2840 HasMutableDataAccess<T1>::value &&
2841 HasConstDataAccess<T2>::value &&
2842 HasConstDataAccess<T3>::value &&
2843 !IsDiagonal<T2>::value &&
2844 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2845 IsSame<typename T1::ElementType,Type>::value &&
2846 IsSame<typename T2::ElementType,Type>::value &&
2847 IsSame<typename T3::ElementType,Type>::value };
// Compile-time test for the default (non-BLAS) kernel: value is set when
// BLAZE_BLAS_MODE is off, or when none of the four BLAS kernel tests (single,
// double, single complex, double complex) accepts the given types.
2855 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2856 struct UseDefaultKernel {
2857 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2858 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2859 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2860 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time test for the vectorized default kernels: T2 must not be
// diagonal, all three operands must be vectorizable, T1's element type must
// equal the element types of T2 and T3 as well as the scalar type T4, and
// IntrinsicTrait must report both addition and multiplication for it.
2869 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2870 struct UseVectorizedDefaultKernel {
2871 enum { value = !IsDiagonal<T2>::value &&
2872 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2873 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2874 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2875 IsSame<typename T1::ElementType,T4>::value &&
2876 IntrinsicTrait<typename T1::ElementType>::addition &&
2877 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression object.
2883 typedef DVecScalarMultExpr<MVM,ST,false>
This;
// Result type: MultTrait of the product's result type RES and the scalar type.
2884 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type that corresponds to ElementType.
2887 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Left-hand side operand of the scaling: the matrix/vector product.
2892 typedef const DMatDVecMultExpr<MT,VT>
LeftOperand;
// Type used for the matrix operand inside the kernels, chosen between
// const MRT and MCT by evaluateMatrix.
// NOTE(review): presumably SelectType picks the first alternative (the
// evaluated result type) when the switch is set — confirm.
2898 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
LT;
// Type used for the vector operand inside the kernels, chosen between
// const VRT and VCT by evaluateVector in the same way.
2901 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
RT;
// Compilation switch for vectorized evaluation of the expression: MT must not
// be diagonal, both operands must be vectorizable, the matrix element type
// MET must equal the vector element type VET and the scalar type ST, and
// IntrinsicTrait<MET> must report addition and multiplication.
2906 enum { vectorizable = !IsDiagonal<MT>::value &&
2907 MT::vectorizable && VT::vectorizable &&
2908 IsSame<MET,VET>::value &&
2909 IsSame<MET,ST>::value &&
2910 IntrinsicTrait<MET>::addition &&
2911 IntrinsicTrait<MET>::multiplication };
// Compilation switch for SMP assignment: set only if neither operand has to
// be evaluated and both operand types are themselves SMP assignable.
2914 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2915 !evaluateVector && VT::smpAssignable };
// Constructor taking the matrix/vector product expression and the scalar.
// NOTE(review): presumably stores them in vector_ and scalar_ — confirm.
2924 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
// Element access: returns element `index` of the wrapped product multiplied
// by the scalar.
2936 inline ReturnType
operator[](
size_t index )
const {
2938 return vector_[index] * scalar_;
// Returns the size of the vector, i.e. the size of the wrapped product.
2947 inline size_t size()
const {
2948 return vector_.size();
// Aliasing query: forwards the address `alias` to canAlias() of the wrapped
// product expression and returns its answer.
2978 template<
typename T >
2979 inline bool canAlias(
const T* alias )
const {
2980 return vector_.canAlias( alias );
// Aliasing query: forwards the address `alias` to isAliased() of the wrapped
// product expression and returns its answer.
2990 template<
typename T >
2991 inline bool isAliased(
const T* alias )
const {
2992 return vector_.isAliased( alias );
// Alignment query: forwarded to the wrapped product expression.
3002 return vector_.isAligned();
// Fetches the matrix operand of the wrapped product. The condition fragment
// that follows tests for a matrix computation that is not evaluated.
// NOTE(review): the rest of that condition is not part of these statements —
// confirm its remaining terms before relying on it.
3012 typename MVM::LeftOperand A( vector_.leftOperand() );
3014 ( IsComputation<MT>::value && !evaluateMatrix ) ||
// Operands of the scaling expression: the matrix/vector product and the
// scalar factor it is multiplied with.
3022 LeftOperand vector_;
3023 RightOperand scalar_;
// Assignment of the scaled matrix/vector product to a dense vector.
// Extracts the matrix (left) and vector (right) operands of the wrapped
// product, distinguishes the cases of zero rows and zero columns, and
// otherwise dispatches to selectAssignKernel() with the target ~lhs, the
// operands A and x and the scalar.
// NOTE(review): A and x are presumably `left`/`right` converted to LT/RT, and
// the handling inside the two empty-operand branches should be confirmed.
3038 template<
typename VT1 >
3039 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3045 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3046 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3048 if( left.rows() == 0UL ) {
3051 else if( left.columns() == 0UL ) {
3064 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel selection for the assignment y = A * x * scalar.
// The small kernel is listed after a condition that includes a diagonal
// matrix type and a matrix computation that is not evaluated; the BLAS kernel
// is the other alternative.
// NOTE(review): the condition has at least one more term (presumably a size
// threshold) and the BLAS call presumably sits in the else branch — confirm.
3079 template<
typename VT1
3083 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3085 if( ( IsDiagonal<MT1>::value ) ||
3086 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3088 selectSmallAssignKernel( y, A, x, scalar );
3090 selectBlasAssignKernel( y, A, x, scalar );
// Default assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is not satisfied (DisableIf):
// assigns the expression A * x * scalar to y via y.assign().
3108 template<
typename VT1
3112 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3113 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3115 y.assign( A * x * scalar );
// Fallback overload of the small assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is not satisfied (DisableIf);
// it forwards all arguments to selectDefaultAssignKernel().
3133 template<
typename VT1
3137 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3138 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3140 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized overload of the small assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is satisfied (EnableIf).
// Rows of A are handled in blocks of 8, 4, 3 and 2 rows plus a single-row
// tail. Per block one intrinsic accumulator per row collects the products of
// A.load(row,j) and x.load(j) over the column range [jbegin,jend) in steps of
// IT::size; each y element is then set to the accumulator's horizontal sum
// times scalar.
// NOTE(review): the declaration of the row index i precedes the first loop —
// confirm it starts at 0.
3158 template<
typename VT1
3162 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3163 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3165 typedef IntrinsicTrait<ElementType> IT;
3167 const size_t M( A.rows() );
3168 const size_t N( A.columns() );
// Blocks of eight rows.
3172 for( ; (i+8UL) <= M; i+=8UL )
// Column range of the block. For upper matrices the start column (i, or i+1
// if strictly upper) is masked with size_t(-IT::size), which rounds it down
// to a multiple of IT::size (assuming IT::size is a power of two). For lower
// matrices the range ends at i+8 (i+7 if strictly lower).
3174 const size_t jbegin( ( IsUpper<MT1>::value )
3175 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3177 const size_t jend( ( IsLower<MT1>::value )
3178 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// One accumulator per row of the block.
3182 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3184 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
// The x chunk is loaded once and reused for all eight rows.
3185 const IntrinsicType x1( x.load(j) );
3186 xmm1 = xmm1 + A.load(i ,j) * x1;
3187 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3188 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3189 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3190 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3191 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3192 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3193 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Horizontal reduction and scaling; plain assignment overwrites y.
3196 y[i ] =
sum( xmm1 ) * scalar;
3197 y[i+1UL] =
sum( xmm2 ) * scalar;
3198 y[i+2UL] =
sum( xmm3 ) * scalar;
3199 y[i+3UL] =
sum( xmm4 ) * scalar;
3200 y[i+4UL] =
sum( xmm5 ) * scalar;
3201 y[i+5UL] =
sum( xmm6 ) * scalar;
3202 y[i+6UL] =
sum( xmm7 ) * scalar;
3203 y[i+7UL] =
sum( xmm8 ) * scalar;
// Blocks of four rows; lower matrices end at i+4 (i+3 if strict).
3206 for( ; (i+4UL) <= M; i+=4UL )
3208 const size_t jbegin( ( IsUpper<MT1>::value )
3209 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3211 const size_t jend( ( IsLower<MT1>::value )
3212 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3216 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3218 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3219 const IntrinsicType x1( x.load(j) );
3220 xmm1 = xmm1 + A.load(i ,j) * x1;
3221 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3222 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3223 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3226 y[i ] =
sum( xmm1 ) * scalar;
3227 y[i+1UL] =
sum( xmm2 ) * scalar;
3228 y[i+2UL] =
sum( xmm3 ) * scalar;
3229 y[i+3UL] =
sum( xmm4 ) * scalar;
// Blocks of three rows; lower matrices end at i+3 (i+2 if strict).
3232 for( ; (i+3UL) <= M; i+=3UL )
3234 const size_t jbegin( ( IsUpper<MT1>::value )
3235 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3237 const size_t jend( ( IsLower<MT1>::value )
3238 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3242 IntrinsicType xmm1, xmm2, xmm3;
3244 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3245 const IntrinsicType x1( x.load(j) );
3246 xmm1 = xmm1 + A.load(i ,j) * x1;
3247 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3248 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3251 y[i ] =
sum( xmm1 ) * scalar;
3252 y[i+1UL] =
sum( xmm2 ) * scalar;
3253 y[i+2UL] =
sum( xmm3 ) * scalar;
// Blocks of two rows; lower matrices end at i+2 (i+1 if strict).
3256 for( ; (i+2UL) <= M; i+=2UL )
3258 const size_t jbegin( ( IsUpper<MT1>::value )
3259 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3261 const size_t jend( ( IsLower<MT1>::value )
3262 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3266 IntrinsicType xmm1, xmm2;
3268 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3269 const IntrinsicType x1( x.load(j) );
3270 xmm1 = xmm1 + A.load(i ,j) * x1;
3271 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3274 y[i ] =
sum( xmm1 ) * scalar;
3275 y[i+1UL] =
sum( xmm2 ) * scalar;
// Single-row variant; lower matrices end at i+1 (i if strict).
// NOTE(review): presumably guarded by a check that one row is left, with the
// accumulator xmm1 declared before the loop — confirm.
3280 const size_t jbegin( ( IsUpper<MT1>::value )
3281 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3283 const size_t jend( ( IsLower<MT1>::value )
3284 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3290 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3291 xmm1 = xmm1 + A.load(i,j) * x.load(j);
3294 y[i] =
sum( xmm1 ) * scalar;
// Fallback overload of the large assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is not satisfied (DisableIf);
// it forwards all arguments to selectDefaultAssignKernel().
3313 template<
typename VT1
3317 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3318 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3320 selectDefaultAssignKernel( y, A, x, scalar );
// Vectorized overload of the large assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is satisfied (EnableIf).
// Rows of A are handled in blocks of 8, 4 and 2 rows plus a single-row tail;
// each y[i+k] accumulates (+=) the horizontal sum of products of intrinsic
// loads from row i+k of A and from x, using four, two or one x vectors.
// NOTE(review): the accumulation statements below neither initialize y nor
// multiply by `scalar`. Since this is an assignment kernel, y is presumably
// reset before the loops and scaled after each row block — confirm.
// NOTE(review): j1, j2 and j3 are presumably j + 1/2/3 * IT::size, and the
// y[...] updates presumably sit inside column loops over [jbegin,jend) —
// confirm against the declarations of i, j, j1, j2, j3.
3338 template<
typename VT1
3342 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3343 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3345 typedef IntrinsicTrait<ElementType> IT;
3347 const size_t M( A.rows() );
3348 const size_t N( A.columns() );
// Blocks of eight rows.
3354 for( ; (i+8UL) <= M; i+=8UL )
// Column range of the block. For upper matrices the start column (i, or i+1
// if strictly upper) is masked with size_t(-IT::size), which rounds it down
// to a multiple of IT::size (assuming IT::size is a power of two). For lower
// matrices the range ends at i+8 (i+7 if strictly lower).
3356 const size_t jbegin( ( IsUpper<MT1>::value )
3357 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3359 const size_t jend( ( IsLower<MT1>::value )
3360 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// Four intrinsic vectors of x (columns j, j1, j2, j3) combined per update.
3370 const IntrinsicType x1( x.load(j ) );
3371 const IntrinsicType x2( x.load(j1) );
3372 const IntrinsicType x3( x.load(j2) );
3373 const IntrinsicType x4( x.load(j3) );
3374 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3375 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3376 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3377 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3378 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3379 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3380 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3381 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two intrinsic vectors of x (columns j, j1) combined per update.
3386 const IntrinsicType x1( x.load(j ) );
3387 const IntrinsicType x2( x.load(j1) );
3388 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3389 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3390 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3391 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3392 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3393 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3394 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3395 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// A single intrinsic vector of x (column j) per update.
3399 const IntrinsicType x1( x.load(j) );
3400 y[i ] +=
sum( A.load(i ,j) * x1 );
3401 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3402 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3403 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3404 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3405 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3406 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3407 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Blocks of four rows; lower matrices end at i+4 (i+3 if strict).
3420 for( ; (i+4UL) <= M; i+=4UL )
3422 const size_t jbegin( ( IsUpper<MT1>::value )
3423 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3425 const size_t jend( ( IsLower<MT1>::value )
3426 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3436 const IntrinsicType x1( x.load(j ) );
3437 const IntrinsicType x2( x.load(j1) );
3438 const IntrinsicType x3( x.load(j2) );
3439 const IntrinsicType x4( x.load(j3) );
3440 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3441 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3442 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3443 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3448 const IntrinsicType x1( x.load(j ) );
3449 const IntrinsicType x2( x.load(j1) );
3450 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3451 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3452 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3453 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3457 const IntrinsicType x1( x.load(j) );
3458 y[i ] +=
sum( A.load(i ,j) * x1 );
3459 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3460 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3461 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
// Blocks of two rows; lower matrices end at i+2 (i+1 if strict).
3470 for( ; (i+2UL) <= M; i+=2UL )
3472 const size_t jbegin( ( IsUpper<MT1>::value )
3473 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3475 const size_t jend( ( IsLower<MT1>::value )
3476 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3486 const IntrinsicType x1( x.load(j ) );
3487 const IntrinsicType x2( x.load(j1) );
3488 const IntrinsicType x3( x.load(j2) );
3489 const IntrinsicType x4( x.load(j3) );
3490 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3491 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3496 const IntrinsicType x1( x.load(j ) );
3497 const IntrinsicType x2( x.load(j1) );
3498 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3499 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3503 const IntrinsicType x1( x.load(j) );
3504 y[i ] +=
sum( A.load(i ,j) * x1 );
3505 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
// Single-row variant; lower matrices end at i+1 (i if strict).
// NOTE(review): presumably guarded by a check that one row is left — confirm.
3514 const size_t jbegin( ( IsUpper<MT1>::value )
3515 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3517 const size_t jend( ( IsLower<MT1>::value )
3518 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3528 const IntrinsicType x1( x.load(j ) );
3529 const IntrinsicType x2( x.load(j1) );
3530 const IntrinsicType x3( x.load(j2) );
3531 const IntrinsicType x4( x.load(j3) );
3532 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3537 const IntrinsicType x1( x.load(j ) );
3538 const IntrinsicType x2( x.load(j1) );
3539 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3543 const IntrinsicType x1( x.load(j) );
3544 y[i] +=
sum( A.load(i,j) * x1 );
// Default overload of the BLAS assignment kernel: enabled when
// UseDefaultKernel<VT1,MT1,VT2,ST2> holds, in which case no BLAS routine is
// used and the call is forwarded to selectLargeAssignKernel().
3566 template<
typename VT1
3570 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3571 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3573 selectLargeAssignKernel( y, A, x, scalar );
// Single precision BLAS overload of the assignment kernel, enabled when
// UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> holds.
// Triangular matrices go through strmv() applied directly to y, with the
// lower/upper flag chosen from IsLower<MT1>; the sgemv() call passes the
// factors `scalar` and 0.0F.
// NOTE(review): before strmv() y is presumably loaded with scalar * x, and
// sgemv() presumably sits in the non-triangular branch computing
// y = scalar*A*x + 0*y — confirm against the wrapper signatures.
3592 template<
typename VT1
3596 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3597 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3599 if( IsTriangular<MT1>::value ) {
3601 strmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3604 sgemv( y, A, x, scalar, 0.0F );
// Double precision BLAS overload of the assignment kernel, enabled when
// UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> holds.
// Triangular matrices go through dtrmv() applied directly to y, with the
// lower/upper flag chosen from IsLower<MT1>; the dgemv() call passes the
// factors `scalar` and 0.0.
// NOTE(review): before dtrmv() y is presumably loaded with scalar * x, and
// dgemv() presumably sits in the non-triangular branch computing
// y = scalar*A*x + 0*y — confirm against the wrapper signatures.
3625 template<
typename VT1
3629 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3630 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3632 if( IsTriangular<MT1>::value ) {
3634 dtrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3637 dgemv( y, A, x, scalar, 0.0 );
// Single precision complex BLAS overload of the assignment kernel, enabled
// when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar type is
// not part of that test).
// Triangular matrices go through ctrmv() applied directly to y; the cgemv()
// call passes complex<float>( scalar, 0 ) and a complex zero as factors.
// NOTE(review): before ctrmv() y is presumably loaded with scalar * x, and
// cgemv() presumably sits in the non-triangular branch — confirm.
// NOTE(review): complex<float>( scalar, 0.0F ) uses `scalar` as the real
// part only; confirm that a complex-valued ST2 cannot reach this overload.
3658 template<
typename VT1
3662 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3663 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3665 if( IsTriangular<MT1>::value ) {
3667 ctrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3670 cgemv( y, A, x, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// Double precision complex BLAS overload of the assignment kernel, enabled
// when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar type is
// not part of that test).
// Triangular matrices go through ztrmv() applied directly to y; the zgemv()
// call passes complex<double>( scalar, 0 ) and a complex zero as factors.
// NOTE(review): before ztrmv() y is presumably loaded with scalar * x, and
// zgemv() presumably sits in the non-triangular branch — confirm.
// NOTE(review): complex<double>( scalar, 0.0 ) uses `scalar` as the real
// part only; confirm that a complex-valued ST2 cannot reach this overload.
3691 template<
typename VT1
3695 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3696 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3698 if( IsTriangular<MT1>::value ) {
3700 ztrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3703 zgemv( y, A, x, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Assignment of the scaled matrix/vector product to a sparse vector.
// The expression is first evaluated serially (serial( rhs )) into a
// temporary of type ResultType.
// NOTE(review): the temporary is presumably assigned to lhs afterwards —
// confirm.
3721 template<
typename VT1 >
3722 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3732 const ResultType tmp(
serial( rhs ) );
// Addition assignment of the scaled matrix/vector product to a dense vector.
// Extracts the matrix (left) and vector (right) operands of the wrapped
// product, treats a matrix with zero rows or zero columns as a special case,
// and otherwise dispatches to selectAddAssignKernel() with the target ~lhs,
// the operands A and x and the scalar.
// NOTE(review): A and x are presumably `left`/`right` converted to LT/RT, and
// the empty-operand branch presumably returns without touching lhs — confirm.
3749 template<
typename VT1 >
3750 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3756 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3757 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3759 if( left.rows() == 0UL || left.columns() == 0UL ) {
3771 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel selection for the addition assignment y += A * x * scalar.
// The small kernel is listed after a condition that includes a diagonal
// matrix type and a matrix computation that is not evaluated; the BLAS kernel
// is the other alternative.
// NOTE(review): the condition has at least one more term (presumably a size
// threshold) and the BLAS call presumably sits in the else branch — confirm.
3786 template<
typename VT1
3790 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3792 if( ( IsDiagonal<MT1>::value ) ||
3793 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3795 selectSmallAddAssignKernel( y, A, x, scalar );
3797 selectBlasAddAssignKernel( y, A, x, scalar );
// Default addition-assignment kernel: adds the expression A * x * scalar to
// y via y.addAssign(). Unlike selectDefaultAssignKernel() it carries no
// DisableIf restriction.
3815 template<
typename VT1
3819 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3821 y.addAssign( A * x * scalar );
// Fallback overload of the small addition-assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is not satisfied (DisableIf);
// it forwards all arguments to selectDefaultAddAssignKernel().
3839 template<
typename VT1
3843 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3844 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3846 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized overload of the small addition-assignment kernel, available
// when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is satisfied (EnableIf).
// Rows of A are handled in blocks of 8, 4, 3 and 2 rows plus a single-row
// tail. Per block one intrinsic accumulator per row collects the products of
// A.load(row,j) and x.load(j) over the column range [jbegin,jend) in steps of
// IT::size; the accumulator's horizontal sum times scalar is then added to
// the matching y element.
// NOTE(review): the declaration of the row index i precedes the first loop —
// confirm it starts at 0.
3864 template<
typename VT1
3868 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3869 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3871 typedef IntrinsicTrait<ElementType> IT;
3873 const size_t M( A.rows() );
3874 const size_t N( A.columns() );
// Blocks of eight rows.
3878 for( ; (i+8UL) <= M; i+=8UL )
// Column range of the block. For upper matrices the start column (i, or i+1
// if strictly upper) is masked with size_t(-IT::size), which rounds it down
// to a multiple of IT::size (assuming IT::size is a power of two). For lower
// matrices the range ends at i+8 (i+7 if strictly lower).
3880 const size_t jbegin( ( IsUpper<MT1>::value )
3881 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3883 const size_t jend( ( IsLower<MT1>::value )
3884 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// One accumulator per row of the block.
3888 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3890 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
// The x chunk is loaded once and reused for all eight rows.
3891 const IntrinsicType x1( x.load(j) );
3892 xmm1 = xmm1 + A.load(i ,j) * x1;
3893 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3894 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3895 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3896 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3897 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3898 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3899 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Horizontal reduction, scaling and accumulation into y.
3902 y[i ] +=
sum( xmm1 ) * scalar;
3903 y[i+1UL] +=
sum( xmm2 ) * scalar;
3904 y[i+2UL] +=
sum( xmm3 ) * scalar;
3905 y[i+3UL] +=
sum( xmm4 ) * scalar;
3906 y[i+4UL] +=
sum( xmm5 ) * scalar;
3907 y[i+5UL] +=
sum( xmm6 ) * scalar;
3908 y[i+6UL] +=
sum( xmm7 ) * scalar;
3909 y[i+7UL] +=
sum( xmm8 ) * scalar;
// Blocks of four rows; lower matrices end at i+4 (i+3 if strict).
3912 for( ; (i+4UL) <= M; i+=4UL )
3914 const size_t jbegin( ( IsUpper<MT1>::value )
3915 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3917 const size_t jend( ( IsLower<MT1>::value )
3918 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3922 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3924 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3925 const IntrinsicType x1( x.load(j) );
3926 xmm1 = xmm1 + A.load(i ,j) * x1;
3927 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3928 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3929 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3932 y[i ] +=
sum( xmm1 ) * scalar;
3933 y[i+1UL] +=
sum( xmm2 ) * scalar;
3934 y[i+2UL] +=
sum( xmm3 ) * scalar;
3935 y[i+3UL] +=
sum( xmm4 ) * scalar;
// Blocks of three rows; lower matrices end at i+3 (i+2 if strict).
3938 for( ; (i+3UL) <= M; i+=3UL )
3940 const size_t jbegin( ( IsUpper<MT1>::value )
3941 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3943 const size_t jend( ( IsLower<MT1>::value )
3944 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3948 IntrinsicType xmm1, xmm2, xmm3;
3950 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3951 const IntrinsicType x1( x.load(j) );
3952 xmm1 = xmm1 + A.load(i ,j) * x1;
3953 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3954 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3957 y[i ] +=
sum( xmm1 ) * scalar;
3958 y[i+1UL] +=
sum( xmm2 ) * scalar;
3959 y[i+2UL] +=
sum( xmm3 ) * scalar;
// Blocks of two rows; lower matrices end at i+2 (i+1 if strict).
3962 for( ; (i+2UL) <= M; i+=2UL )
3964 const size_t jbegin( ( IsUpper<MT1>::value )
3965 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3967 const size_t jend( ( IsLower<MT1>::value )
3968 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3972 IntrinsicType xmm1, xmm2;
3974 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3975 const IntrinsicType x1( x.load(j) );
3976 xmm1 = xmm1 + A.load(i ,j) * x1;
3977 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3980 y[i ] +=
sum( xmm1 ) * scalar;
3981 y[i+1UL] +=
sum( xmm2 ) * scalar;
// Single-row variant; lower matrices end at i+1 (i if strict).
// NOTE(review): presumably guarded by a check that one row is left, with the
// accumulator xmm1 declared before the loop — confirm.
3986 const size_t jbegin( ( IsUpper<MT1>::value )
3987 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3989 const size_t jend( ( IsLower<MT1>::value )
3990 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3996 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
3997 xmm1 = xmm1 + A.load(i,j) * x.load(j);
4000 y[i] +=
sum( xmm1 ) * scalar;
// Fallback overload of the large addition-assignment kernel, available when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is not satisfied (DisableIf);
// it forwards all arguments to selectDefaultAddAssignKernel().
4019 template<
typename VT1
4023 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4024 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4026 selectDefaultAddAssignKernel( y, A, x, scalar );
4044 template<
typename VT1
4048 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4049 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4051 typedef IntrinsicTrait<ElementType> IT;
4053 const size_t M( A.rows() );
4054 const size_t N( A.columns() );
4058 for( ; (i+8UL) <= M; i+=8UL )
4060 const size_t jbegin( ( IsUpper<MT1>::value )
4061 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4063 const size_t jend( ( IsLower<MT1>::value )
4064 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4074 const IntrinsicType x1( x.load(j ) );
4075 const IntrinsicType x2( x.load(j1) );
4076 const IntrinsicType x3( x.load(j2) );
4077 const IntrinsicType x4( x.load(j3) );
4078 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4079 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4080 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4081 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4082 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4083 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4084 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4085 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4090 const IntrinsicType x1( x.load(j ) );
4091 const IntrinsicType x2( x.load(j1) );
4092 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4093 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4094 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4095 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4096 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4097 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4098 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4099 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4103 const IntrinsicType x1( x.load(j) );
4104 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4105 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4106 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4107 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4108 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4109 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4110 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4111 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
4115 for( ; (i+4UL) <= M; i+=4UL )
4117 const size_t jbegin( ( IsUpper<MT1>::value )
4118 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4120 const size_t jend( ( IsLower<MT1>::value )
4121 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4131 const IntrinsicType x1( x.load(j ) );
4132 const IntrinsicType x2( x.load(j1) );
4133 const IntrinsicType x3( x.load(j2) );
4134 const IntrinsicType x4( x.load(j3) );
4135 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4136 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4137 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4138 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4143 const IntrinsicType x1( x.load(j ) );
4144 const IntrinsicType x2( x.load(j1) );
4145 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4146 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4147 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4148 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4152 const IntrinsicType x1( x.load(j) );
4153 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4154 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4155 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4156 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4160 for( ; (i+2UL) <= M; i+=2UL )
4162 const size_t jbegin( ( IsUpper<MT1>::value )
4163 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4165 const size_t jend( ( IsLower<MT1>::value )
4166 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4176 const IntrinsicType x1( x.load(j ) );
4177 const IntrinsicType x2( x.load(j1) );
4178 const IntrinsicType x3( x.load(j2) );
4179 const IntrinsicType x4( x.load(j3) );
4180 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4181 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4186 const IntrinsicType x1( x.load(j ) );
4187 const IntrinsicType x2( x.load(j1) );
4188 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4189 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4193 const IntrinsicType x1( x.load(j) );
4194 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4195 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4201 const size_t jbegin( ( IsUpper<MT1>::value )
4202 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4204 const size_t jend( ( IsLower<MT1>::value )
4205 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4215 const IntrinsicType x1( x.load(j ) );
4216 const IntrinsicType x2( x.load(j1) );
4217 const IntrinsicType x3( x.load(j2) );
4218 const IntrinsicType x4( x.load(j3) );
4219 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4224 const IntrinsicType x1( x.load(j ) );
4225 const IntrinsicType x2( x.load(j1) );
4226 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4230 const IntrinsicType x1( x.load(j) );
4231 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4251 template<
typename VT1
4255 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4256 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4258 selectLargeAddAssignKernel( y, A, x, scalar );
4277 template<
typename VT1
4281 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
4282 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4284 if( IsTriangular<MT1>::value ) {
4286 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4290 sgemv( y, A, x, scalar, 1.0F );
4311 template<
typename VT1
4315 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
4316 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4318 if( IsTriangular<MT1>::value ) {
4320 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4324 dgemv( y, A, x, scalar, 1.0 );
4345 template<
typename VT1
4349 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
4350 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4352 if( IsTriangular<MT1>::value ) {
4354 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4358 cgemv( y, A, x, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
4379 template<
typename VT1
4383 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
4384 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4386 if( IsTriangular<MT1>::value ) {
4388 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4392 zgemv( y, A, x, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
4414 template<
typename VT1 >
4415 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4421 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
4422 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
4424 if( left.rows() == 0UL || left.columns() == 0UL ) {
4436 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
4451 template<
typename VT1
4455 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4457 if( ( IsDiagonal<MT1>::value ) ||
4458 ( IsComputation<MT>::value && !evaluateMatrix ) ||
4460 selectSmallSubAssignKernel( y, A, x, scalar );
4462 selectBlasSubAssignKernel( y, A, x, scalar );
4480 template<
typename VT1
4484 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4486 y.subAssign( A * x * scalar );
4504 template<
typename VT1
4508 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4509 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4511 selectDefaultSubAssignKernel( y, A, x, scalar );
4529 template<
typename VT1
4533 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4534 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4536 typedef IntrinsicTrait<ElementType> IT;
4538 const size_t M( A.rows() );
4539 const size_t N( A.columns() );
4543 for( ; (i+8UL) <= M; i+=8UL )
4545 const size_t jbegin( ( IsUpper<MT1>::value )
4546 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4548 const size_t jend( ( IsLower<MT1>::value )
4549 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4553 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4555 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
4556 const IntrinsicType x1( x.load(j) );
4557 xmm1 = xmm1 + A.load(i ,j) * x1;
4558 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4559 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4560 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4561 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
4562 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
4563 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
4564 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
4567 y[i ] -=
sum( xmm1 ) * scalar;
4568 y[i+1UL] -=
sum( xmm2 ) * scalar;
4569 y[i+2UL] -=
sum( xmm3 ) * scalar;
4570 y[i+3UL] -=
sum( xmm4 ) * scalar;
4571 y[i+4UL] -=
sum( xmm5 ) * scalar;
4572 y[i+5UL] -=
sum( xmm6 ) * scalar;
4573 y[i+6UL] -=
sum( xmm7 ) * scalar;
4574 y[i+7UL] -=
sum( xmm8 ) * scalar;
4577 for( ; (i+4UL) <= M; i+=4UL )
4579 const size_t jbegin( ( IsUpper<MT1>::value )
4580 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4582 const size_t jend( ( IsLower<MT1>::value )
4583 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4587 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4589 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
4590 const IntrinsicType x1( x.load(j) );
4591 xmm1 = xmm1 + A.load(i ,j) * x1;
4592 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4593 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4594 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4597 y[i ] -=
sum( xmm1 ) * scalar;
4598 y[i+1UL] -=
sum( xmm2 ) * scalar;
4599 y[i+2UL] -=
sum( xmm3 ) * scalar;
4600 y[i+3UL] -=
sum( xmm4 ) * scalar;
4603 for( ; (i+3UL) <= M; i+=3UL )
4605 const size_t jbegin( ( IsUpper<MT1>::value )
4606 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4608 const size_t jend( ( IsLower<MT1>::value )
4609 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4613 IntrinsicType xmm1, xmm2, xmm3;
4615 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
4616 const IntrinsicType x1( x.load(j) );
4617 xmm1 = xmm1 + A.load(i ,j) * x1;
4618 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4619 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4622 y[i ] -=
sum( xmm1 ) * scalar;
4623 y[i+1UL] -=
sum( xmm2 ) * scalar;
4624 y[i+2UL] -=
sum( xmm3 ) * scalar;
4627 for( ; (i+2UL) <= M; i+=2UL )
4629 const size_t jbegin( ( IsUpper<MT1>::value )
4630 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4632 const size_t jend( ( IsLower<MT1>::value )
4633 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4637 IntrinsicType xmm1, xmm2;
4639 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
4640 const IntrinsicType x1( x.load(j) );
4641 xmm1 = xmm1 + A.load(i ,j) * x1;
4642 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4645 y[i ] -=
sum( xmm1 ) * scalar;
4646 y[i+1UL] -=
sum( xmm2 ) * scalar;
4651 const size_t jbegin( ( IsUpper<MT1>::value )
4652 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4654 const size_t jend( ( IsLower<MT1>::value )
4655 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4661 for(
size_t j=jbegin; j<jend; j+=
IT::size ) {
4662 xmm1 = xmm1 + A.load(i,j) * x.load(j);
4665 y[i] -=
sum( xmm1 ) * scalar;
4684 template<
typename VT1
4688 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4689 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4691 selectDefaultSubAssignKernel( y, A, x, scalar );
4709 template<
typename VT1
4713 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4714 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4716 typedef IntrinsicTrait<ElementType> IT;
4718 const size_t M( A.rows() );
4719 const size_t N( A.columns() );
4723 for( ; (i+8UL) <= M; i+=8UL )
4725 const size_t jbegin( ( IsUpper<MT1>::value )
4726 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4728 const size_t jend( ( IsLower<MT1>::value )
4729 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
4739 const IntrinsicType x1( x.load(j ) );
4740 const IntrinsicType x2( x.load(j1) );
4741 const IntrinsicType x3( x.load(j2) );
4742 const IntrinsicType x4( x.load(j3) );
4743 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4744 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4745 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4746 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4747 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4748 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4749 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4750 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
4755 const IntrinsicType x1( x.load(j ) );
4756 const IntrinsicType x2( x.load(j1) );
4757 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4758 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4759 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4760 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4761 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4762 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4763 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4764 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
4768 const IntrinsicType x1( x.load(j) );
4769 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4770 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4771 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4772 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4773 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4774 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4775 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4776 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
4780 for( ; (i+4UL) <= M; i+=4UL )
4782 const size_t jbegin( ( IsUpper<MT1>::value )
4783 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4785 const size_t jend( ( IsLower<MT1>::value )
4786 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4796 const IntrinsicType x1( x.load(j ) );
4797 const IntrinsicType x2( x.load(j1) );
4798 const IntrinsicType x3( x.load(j2) );
4799 const IntrinsicType x4( x.load(j3) );
4800 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4801 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4802 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4803 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4808 const IntrinsicType x1( x.load(j ) );
4809 const IntrinsicType x2( x.load(j1) );
4810 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4811 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4812 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4813 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4817 const IntrinsicType x1( x.load(j) );
4818 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4819 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4820 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4821 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4825 for( ; (i+2UL) <= M; i+=2UL )
4827 const size_t jbegin( ( IsUpper<MT1>::value )
4828 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4830 const size_t jend( ( IsLower<MT1>::value )
4831 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4841 const IntrinsicType x1( x.load(j ) );
4842 const IntrinsicType x2( x.load(j1) );
4843 const IntrinsicType x3( x.load(j2) );
4844 const IntrinsicType x4( x.load(j3) );
4845 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4846 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4851 const IntrinsicType x1( x.load(j ) );
4852 const IntrinsicType x2( x.load(j1) );
4853 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4854 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4858 const IntrinsicType x1( x.load(j) );
4859 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4860 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4866 const size_t jbegin( ( IsUpper<MT1>::value )
4867 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4869 const size_t jend( ( IsLower<MT1>::value )
4870 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4880 const IntrinsicType x1( x.load(j ) );
4881 const IntrinsicType x2( x.load(j1) );
4882 const IntrinsicType x3( x.load(j2) );
4883 const IntrinsicType x4( x.load(j3) );
4884 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4889 const IntrinsicType x1( x.load(j ) );
4890 const IntrinsicType x2( x.load(j1) );
4891 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4895 const IntrinsicType x1( x.load(j) );
4896 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4916 template<
typename VT1
4920 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4921 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4923 selectLargeSubAssignKernel( y, A, x, scalar );
4942 template<
typename VT1
4946 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
4947 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4949 if( IsTriangular<MT1>::value ) {
4951 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4955 sgemv( y, A, x, -scalar, 1.0F );
4976 template<
typename VT1
4980 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
4981 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4983 if( IsTriangular<MT1>::value ) {
4985 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4989 dgemv( y, A, x, -scalar, 1.0 );
5010 template<
typename VT1
5014 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
5015 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5017 if( IsTriangular<MT1>::value ) {
5019 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
5023 cgemv( y, A, x, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
5044 template<
typename VT1
5048 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
5049 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
5051 if( IsTriangular<MT1>::value ) {
5053 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
5057 zgemv( y, A, x, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
5079 template<
typename VT1 >
5080 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5090 const ResultType tmp(
serial( rhs ) );
5113 template<
typename VT1 >
5114 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5115 smpAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5121 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5122 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5124 if( left.rows() == 0UL ) {
5127 else if( left.columns() == 0UL ) {
5158 template<
typename VT1 >
5159 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5160 smpAssign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5170 const ResultType tmp( rhs );
5189 template<
typename VT1 >
5190 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5191 smpAddAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5197 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5198 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5200 if( left.rows() == 0UL || left.columns() == 0UL ) {
5234 template<
typename VT1 >
5235 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5236 smpSubAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5242 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5243 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5245 if( left.rows() == 0UL || left.columns() == 0UL ) {
5279 template<
typename VT1 >
5280 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5281 smpMultAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5291 const ResultType tmp( rhs );
5353 template<
typename T1
5355 inline const typename DisableIf< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >::Type
5361 throw std::invalid_argument(
"Matrix and vector sizes do not match" );
5389 template<
typename T1
5392 inline const typename EnableIf< IsMatMatMultExpr<T1>,
typename MultExprTrait<T1,T2>::Type >::Type
5399 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
5414 template<
typename MT,
typename VT >
5432 template<
typename MT,
typename VT,
bool AF >
5437 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type
5438 ,
typename SubvectorExprTrait<const VT,AF>::Type >::Type Type;
BLAZE_ALWAYS_INLINE int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
BLAZE_ALWAYS_INLINE void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:879
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:284
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:386
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:8247
Header file for basic type definitions.
DMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:312
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:264
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: TransposeFlag.h:159
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:209
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:113
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:119
Header file for the IsSame and IsStrictlySame type traits.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:821
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:420
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:273
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2507
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:261
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
const size_t SMP_DMATDVECMULT_THRESHOLD
SMP row-major dense matrix/dense vector multiplication threshold. This threshold specifies when a row-...
Definition: Thresholds.h:322
Compile time check for double precision floating point types. This type trait tests whether or not the...
Definition: IsDouble.h:75
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:699
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Header file for the RequiresEvaluation type trait.
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:272
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
size_t size() const
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:366
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:124
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:121
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:263
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:290
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the IsMatMatMultExpr type trait class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the IsBlasCompatible type trait.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:123
Header file for the IsLower type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:376
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:430
Constraint on the data type.
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:166
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:287
Constraints on the storage order of matrix types.
Constraint on the data type.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:275
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2505
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:440
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
Header file for BLAS level 2 functions.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Base template for the MultTrait class.
Definition: MultTrait.h:150
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:441
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:398
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:120
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:278
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:281
Constraint on the data type.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:410
Header file for the HasMutableDataAccess type trait.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:274
Header file for all intrinsic functionality.
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:260
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDVecMultExpr.h:276
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2502
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix)
Returns the current number of columns of the matrix.
Definition: Matrix.h:332
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:277
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:326
Header file for the IsUpper type trait.
Header file for the MatVecMultExpr base class.
Compile time check for single precision floating point types. This type trait tests whether or not the...
Definition: IsFloat.h:75
const size_t DMATDVECMULT_THRESHOLD
Row-major dense matrix/dense vector multiplication threshold. This setting specifies the threshold bet...
Definition: Thresholds.h:57
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:122
EnableIf< IsDenseVector< VT1 > >::Type smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:189
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:849