35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
117 template<
typename MT
150 template<
typename T1 >
151 struct UseSMPAssign {
152 enum { value = ( evaluateMatrix || evaluateVector ) };
162 template<
typename T1,
typename T2,
typename T3 >
163 struct UseBlasKernel {
169 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
185 template<
typename T1,
typename T2,
typename T3 >
186 struct UseVectorizedDefaultKernel {
189 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
224 MT::vectorizable && VT::vectorizable &&
230 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
231 !evaluateVector && VT::smpAssignable };
259 mat_.columns() == 0UL )
263 return mat_(index,index) *
vec_[index];
270 :(
mat_.columns() ) );
273 const size_t jnum( jend - jbegin );
274 const size_t jpos( jbegin + ( ( jnum - 1UL ) &
size_t(-2) ) + 1UL );
276 ElementType res(
mat_(index,jbegin) *
vec_[jbegin] );
278 for(
size_t j=jbegin+1UL; j<jpos; j+=2UL ) {
282 res +=
mat_(index,jpos) *
vec_[jpos];
296 inline ReturnType
at(
size_t index )
const {
297 if( index >=
mat_.rows() ) {
300 return (*
this)[index];
340 template<
typename T >
342 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
352 template<
typename T >
354 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
364 return mat_.isAligned() &&
vec_.isAligned();
376 (
mat_.rows() *
mat_.columns() < DMATDVECMULT_THRESHOLD ) ) &&
377 (
size() > SMP_DMATDVECMULT_THRESHOLD );
400 template<
typename VT1 >
407 if( rhs.
mat_.rows() == 0UL ) {
410 else if( rhs.
mat_.columns() == 0UL ) {
423 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
439 template<
typename VT1
442 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
446 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
447 selectSmallAssignKernel( y, A, x );
449 selectBlasAssignKernel( y, A, x );
468 template<
typename VT1
471 static inline void selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
492 template<
typename VT1
495 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
496 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
498 selectDefaultAssignKernel( y, A, x );
517 template<
typename VT1
520 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
521 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
523 typedef IntrinsicTrait<ElementType> IT;
525 const size_t M( A.rows() );
526 const size_t N( A.columns() );
528 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
532 for( ; (i+8UL) <= M; i+=8UL )
534 const size_t jbegin( ( IsUpper<MT1>::value )
535 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
537 const size_t jend( ( IsLower<MT1>::value )
538 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
542 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
545 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
549 const IntrinsicType x1( x.load(j) );
550 xmm1 = xmm1 + A.load(i ,j) * x1;
551 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
552 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
553 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
554 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
555 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
556 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
557 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
561 y[i+1UL] =
sum( xmm2 );
562 y[i+2UL] =
sum( xmm3 );
563 y[i+3UL] =
sum( xmm4 );
564 y[i+4UL] =
sum( xmm5 );
565 y[i+5UL] =
sum( xmm6 );
566 y[i+6UL] =
sum( xmm7 );
567 y[i+7UL] =
sum( xmm8 );
569 for( ; remainder && j<jend; ++j ) {
570 y[i ] += A(i ,j) * x[j];
571 y[i+1UL] += A(i+1UL,j) * x[j];
572 y[i+2UL] += A(i+2UL,j) * x[j];
573 y[i+3UL] += A(i+3UL,j) * x[j];
574 y[i+4UL] += A(i+4UL,j) * x[j];
575 y[i+5UL] += A(i+5UL,j) * x[j];
576 y[i+6UL] += A(i+6UL,j) * x[j];
577 y[i+7UL] += A(i+7UL,j) * x[j];
581 for( ; (i+4UL) <= M; i+=4UL )
583 const size_t jbegin( ( IsUpper<MT1>::value )
584 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
586 const size_t jend( ( IsLower<MT1>::value )
587 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
591 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
594 IntrinsicType xmm1, xmm2, xmm3, xmm4;
598 const IntrinsicType x1( x.load(j) );
599 xmm1 = xmm1 + A.load(i ,j) * x1;
600 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
601 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
602 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
606 y[i+1UL] =
sum( xmm2 );
607 y[i+2UL] =
sum( xmm3 );
608 y[i+3UL] =
sum( xmm4 );
610 for( ; remainder && j<jend; ++j ) {
611 y[i ] += A(i ,j) * x[j];
612 y[i+1UL] += A(i+1UL,j) * x[j];
613 y[i+2UL] += A(i+2UL,j) * x[j];
614 y[i+3UL] += A(i+3UL,j) * x[j];
618 for( ; (i+3UL) <= M; i+=3UL )
620 const size_t jbegin( ( IsUpper<MT1>::value )
621 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
623 const size_t jend( ( IsLower<MT1>::value )
624 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
628 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
631 IntrinsicType xmm1, xmm2, xmm3;
635 const IntrinsicType x1( x.load(j) );
636 xmm1 = xmm1 + A.load(i ,j) * x1;
637 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
638 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
642 y[i+1UL] =
sum( xmm2 );
643 y[i+2UL] =
sum( xmm3 );
645 for( ; remainder && j<jend; ++j ) {
646 y[i ] += A(i ,j) * x[j];
647 y[i+1UL] += A(i+1UL,j) * x[j];
648 y[i+2UL] += A(i+2UL,j) * x[j];
652 for( ; (i+2UL) <= M; i+=2UL )
654 const size_t jbegin( ( IsUpper<MT1>::value )
655 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
657 const size_t jend( ( IsLower<MT1>::value )
658 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
662 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
665 IntrinsicType xmm1, xmm2;
669 const IntrinsicType x1( x.load(j) );
670 xmm1 = xmm1 + A.load(i ,j) * x1;
671 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
675 y[i+1UL] =
sum( xmm2 );
677 for( ; remainder && j<jend; ++j ) {
678 y[i ] += A(i ,j) * x[j];
679 y[i+1UL] += A(i+1UL,j) * x[j];
685 const size_t jbegin( ( IsUpper<MT1>::value )
686 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
688 const size_t jend( ( IsLower<MT1>::value )
689 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
693 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
700 xmm1 = xmm1 + A.load(i,j) * x.load(j);
705 for( ; remainder && j<jend; ++j ) {
706 y[i] += A(i,j) * x[j];
727 template<
typename VT1
730 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
731 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
733 selectDefaultAssignKernel( y, A, x );
752 template<
typename VT1
755 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
756 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
758 typedef IntrinsicTrait<ElementType> IT;
760 const size_t M( A.rows() );
761 const size_t N( A.columns() );
763 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
769 for( ; (i+8UL) <= M; i+=8UL )
771 const size_t jbegin( ( IsUpper<MT1>::value )
772 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
774 const size_t jend( ( IsLower<MT1>::value )
775 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
779 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
788 const IntrinsicType x1( x.load(j ) );
789 const IntrinsicType x2( x.load(j1) );
790 const IntrinsicType x3( x.load(j2) );
791 const IntrinsicType x4( x.load(j3) );
792 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
793 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
794 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
795 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
796 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
797 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
798 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
799 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
804 const IntrinsicType x1( x.load(j ) );
805 const IntrinsicType x2( x.load(j1) );
806 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
807 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
808 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
809 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
810 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
811 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
812 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
813 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
817 const IntrinsicType x1( x.load(j) );
818 y[i ] +=
sum( A.load(i ,j) * x1 );
819 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
820 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
821 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
822 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
823 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
824 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
825 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
828 for( ; remainder && j<jend; ++j ) {
829 y[i ] += A(i ,j) * x[j];
830 y[i+1UL] += A(i+1UL,j) * x[j];
831 y[i+2UL] += A(i+2UL,j) * x[j];
832 y[i+3UL] += A(i+3UL,j) * x[j];
833 y[i+4UL] += A(i+4UL,j) * x[j];
834 y[i+5UL] += A(i+5UL,j) * x[j];
835 y[i+6UL] += A(i+6UL,j) * x[j];
836 y[i+7UL] += A(i+7UL,j) * x[j];
840 for( ; (i+4UL) <= M; i+=4UL )
842 const size_t jbegin( ( IsUpper<MT1>::value )
843 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
845 const size_t jend( ( IsLower<MT1>::value )
846 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
850 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
859 const IntrinsicType x1( x.load(j ) );
860 const IntrinsicType x2( x.load(j1) );
861 const IntrinsicType x3( x.load(j2) );
862 const IntrinsicType x4( x.load(j3) );
863 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
864 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
865 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
866 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
871 const IntrinsicType x1( x.load(j ) );
872 const IntrinsicType x2( x.load(j1) );
873 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
874 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
875 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
876 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
880 const IntrinsicType x1( x.load(j) );
881 y[i ] +=
sum( A.load(i ,j) * x1 );
882 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
883 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
884 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
887 for( ; remainder && j<jend; ++j ) {
888 y[i ] += A(i ,j) * x[j];
889 y[i+1UL] += A(i+1UL,j) * x[j];
890 y[i+2UL] += A(i+2UL,j) * x[j];
891 y[i+3UL] += A(i+3UL,j) * x[j];
895 for( ; (i+2UL) <= M; i+=2UL )
897 const size_t jbegin( ( IsUpper<MT1>::value )
898 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
900 const size_t jend( ( IsLower<MT1>::value )
901 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
905 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
914 const IntrinsicType x1( x.load(j ) );
915 const IntrinsicType x2( x.load(j1) );
916 const IntrinsicType x3( x.load(j2) );
917 const IntrinsicType x4( x.load(j3) );
918 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
919 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
924 const IntrinsicType x1( x.load(j ) );
925 const IntrinsicType x2( x.load(j1) );
926 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
927 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
931 const IntrinsicType x1( x.load(j) );
932 y[i ] +=
sum( A.load(i ,j) * x1 );
933 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
936 for( ; remainder && j<jend; ++j ) {
937 y[i ] += A(i ,j) * x[j];
938 y[i+1UL] += A(i+1UL,j) * x[j];
944 const size_t jbegin( ( IsUpper<MT1>::value )
945 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
947 const size_t jend( ( IsLower<MT1>::value )
948 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
952 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
961 const IntrinsicType x1( x.load(j ) );
962 const IntrinsicType x2( x.load(j1) );
963 const IntrinsicType x3( x.load(j2) );
964 const IntrinsicType x4( x.load(j3) );
965 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
970 const IntrinsicType x1( x.load(j ) );
971 const IntrinsicType x2( x.load(j1) );
972 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
976 const IntrinsicType x1( x.load(j) );
977 y[i] +=
sum( A.load(i,j) * x1 );
980 for( ; remainder && j<jend; ++j ) {
981 y[i] += A(i,j) * x[j];
1002 template<
typename VT1
1005 static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1006 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1008 selectLargeAssignKernel( y, A, x );
1028 template<
typename VT1
1031 static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1032 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1036 if( IsTriangular<MT1>::value ) {
1038 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1041 gemv( y, A, x, ET(1), ET(0) );
1061 template<
typename VT1 >
1062 friend inline void assign( SparseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1072 const ResultType tmp(
serial( rhs ) );
1073 assign( ~lhs, tmp );
1091 template<
typename VT1 >
1092 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1098 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1102 LT A(
serial( rhs.mat_ ) );
1103 RT x(
serial( rhs.vec_ ) );
1110 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
1126 template<
typename VT1
1129 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1131 if( ( IsDiagonal<MT1>::value ) ||
1132 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1133 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1134 selectSmallAddAssignKernel( y, A, x );
1136 selectBlasAddAssignKernel( y, A, x );
1155 template<
typename VT1
1158 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1160 y.addAssign( A * x );
1179 template<
typename VT1
1182 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1183 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1185 selectDefaultAddAssignKernel( y, A, x );
1204 template<
typename VT1
1207 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1208 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1210 typedef IntrinsicTrait<ElementType> IT;
1212 const size_t M( A.rows() );
1213 const size_t N( A.columns() );
1215 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1219 for( ; (i+8UL) <= M; i+=8UL )
1221 const size_t jbegin( ( IsUpper<MT1>::value )
1222 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1224 const size_t jend( ( IsLower<MT1>::value )
1225 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1229 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1232 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1236 const IntrinsicType x1( x.load(j) );
1237 xmm1 = xmm1 + A.load(i ,j) * x1;
1238 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1239 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1240 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1241 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1242 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1243 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1244 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1247 y[i ] +=
sum( xmm1 );
1248 y[i+1UL] +=
sum( xmm2 );
1249 y[i+2UL] +=
sum( xmm3 );
1250 y[i+3UL] +=
sum( xmm4 );
1251 y[i+4UL] +=
sum( xmm5 );
1252 y[i+5UL] +=
sum( xmm6 );
1253 y[i+6UL] +=
sum( xmm7 );
1254 y[i+7UL] +=
sum( xmm8 );
1256 for( ; remainder && j<jend; ++j ) {
1257 y[i ] += A(i ,j) * x[j];
1258 y[i+1UL] += A(i+1UL,j) * x[j];
1259 y[i+2UL] += A(i+2UL,j) * x[j];
1260 y[i+3UL] += A(i+3UL,j) * x[j];
1261 y[i+4UL] += A(i+4UL,j) * x[j];
1262 y[i+5UL] += A(i+5UL,j) * x[j];
1263 y[i+6UL] += A(i+6UL,j) * x[j];
1264 y[i+7UL] += A(i+7UL,j) * x[j];
1268 for( ; (i+4UL) <= M; i+=4UL )
1270 const size_t jbegin( ( IsUpper<MT1>::value )
1271 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1273 const size_t jend( ( IsLower<MT1>::value )
1274 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1278 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1281 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1285 const IntrinsicType x1( x.load(j) );
1286 xmm1 = xmm1 + A.load(i ,j) * x1;
1287 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1288 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1289 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1292 y[i ] +=
sum( xmm1 );
1293 y[i+1UL] +=
sum( xmm2 );
1294 y[i+2UL] +=
sum( xmm3 );
1295 y[i+3UL] +=
sum( xmm4 );
1297 for( ; remainder && j<jend; ++j ) {
1298 y[i ] += A(i ,j) * x[j];
1299 y[i+1UL] += A(i+1UL,j) * x[j];
1300 y[i+2UL] += A(i+2UL,j) * x[j];
1301 y[i+3UL] += A(i+3UL,j) * x[j];
1305 for( ; (i+3UL) <= M; i+=3UL )
1307 const size_t jbegin( ( IsUpper<MT1>::value )
1308 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1310 const size_t jend( ( IsLower<MT1>::value )
1311 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1315 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1318 IntrinsicType xmm1, xmm2, xmm3;
1322 const IntrinsicType x1( x.load(j) );
1323 xmm1 = xmm1 + A.load(i ,j) * x1;
1324 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1325 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1328 y[i ] +=
sum( xmm1 );
1329 y[i+1UL] +=
sum( xmm2 );
1330 y[i+2UL] +=
sum( xmm3 );
1332 for( ; remainder && j<jend; ++j ) {
1333 y[i ] += A(i ,j) * x[j];
1334 y[i+1UL] += A(i+1UL,j) * x[j];
1335 y[i+2UL] += A(i+2UL,j) * x[j];
1339 for( ; (i+2UL) <= M; i+=2UL )
1341 const size_t jbegin( ( IsUpper<MT1>::value )
1342 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1344 const size_t jend( ( IsLower<MT1>::value )
1345 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1349 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1352 IntrinsicType xmm1, xmm2;
1356 const IntrinsicType x1( x.load(j) );
1357 xmm1 = xmm1 + A.load(i ,j) * x1;
1358 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1361 y[i ] +=
sum( xmm1 );
1362 y[i+1UL] +=
sum( xmm2 );
1364 for( ; remainder && j<jend; ++j ) {
1365 y[i ] += A(i ,j) * x[j];
1366 y[i+1UL] += A(i+1UL,j) * x[j];
1372 const size_t jbegin( ( IsUpper<MT1>::value )
1373 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1375 const size_t jend( ( IsLower<MT1>::value )
1376 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1380 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1387 xmm1 = xmm1 + A.load(i,j) * x.load(j);
1390 y[i] +=
sum( xmm1 );
1392 for( ; remainder && j<jend; ++j ) {
1393 y[i] += A(i,j) * x[j];
1414 template<
typename VT1
1417 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1418 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1420 selectDefaultAddAssignKernel( y, A, x );
1439 template<
typename VT1
1442 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1443 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1445 typedef IntrinsicTrait<ElementType> IT;
1447 const size_t M( A.rows() );
1448 const size_t N( A.columns() );
1450 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1454 for( ; (i+8UL) <= M; i+=8UL )
1456 const size_t jbegin( ( IsUpper<MT1>::value )
1457 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1459 const size_t jend( ( IsLower<MT1>::value )
1460 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1464 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1473 const IntrinsicType x1( x.load(j ) );
1474 const IntrinsicType x2( x.load(j1) );
1475 const IntrinsicType x3( x.load(j2) );
1476 const IntrinsicType x4( x.load(j3) );
1477 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1478 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1479 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1480 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1481 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
1482 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
1483 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
1484 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
1489 const IntrinsicType x1( x.load(j ) );
1490 const IntrinsicType x2( x.load(j1) );
1491 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1492 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1493 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1494 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1495 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
1496 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
1497 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
1498 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
1502 const IntrinsicType x1( x.load(j) );
1503 y[i ] +=
sum( A.load(i ,j) * x1 );
1504 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1505 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1506 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1507 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
1508 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
1509 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
1510 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
1513 for( ; remainder && j<jend; ++j ) {
1514 y[i ] += A(i ,j) * x[j];
1515 y[i+1UL] += A(i+1UL,j) * x[j];
1516 y[i+2UL] += A(i+2UL,j) * x[j];
1517 y[i+3UL] += A(i+3UL,j) * x[j];
1518 y[i+4UL] += A(i+4UL,j) * x[j];
1519 y[i+5UL] += A(i+5UL,j) * x[j];
1520 y[i+6UL] += A(i+6UL,j) * x[j];
1521 y[i+7UL] += A(i+7UL,j) * x[j];
1525 for( ; (i+4UL) <= M; i+=4UL )
1527 const size_t jbegin( ( IsUpper<MT1>::value )
1528 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1530 const size_t jend( ( IsLower<MT1>::value )
1531 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1535 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1544 const IntrinsicType x1( x.load(j ) );
1545 const IntrinsicType x2( x.load(j1) );
1546 const IntrinsicType x3( x.load(j2) );
1547 const IntrinsicType x4( x.load(j3) );
1548 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1549 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1550 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
1551 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
1556 const IntrinsicType x1( x.load(j ) );
1557 const IntrinsicType x2( x.load(j1) );
1558 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1559 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1560 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
1561 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
1565 const IntrinsicType x1( x.load(j) );
1566 y[i ] +=
sum( A.load(i ,j) * x1 );
1567 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1568 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
1569 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
1572 for( ; remainder && j<jend; ++j ) {
1573 y[i ] += A(i ,j) * x[j];
1574 y[i+1UL] += A(i+1UL,j) * x[j];
1575 y[i+2UL] += A(i+2UL,j) * x[j];
1576 y[i+3UL] += A(i+3UL,j) * x[j];
1580 for( ; (i+2UL) <= M; i+=2UL )
1582 const size_t jbegin( ( IsUpper<MT1>::value )
1583 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1585 const size_t jend( ( IsLower<MT1>::value )
1586 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
1590 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1599 const IntrinsicType x1( x.load(j ) );
1600 const IntrinsicType x2( x.load(j1) );
1601 const IntrinsicType x3( x.load(j2) );
1602 const IntrinsicType x4( x.load(j3) );
1603 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
1604 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
1609 const IntrinsicType x1( x.load(j ) );
1610 const IntrinsicType x2( x.load(j1) );
1611 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
1612 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
1616 const IntrinsicType x1( x.load(j) );
1617 y[i ] +=
sum( A.load(i ,j) * x1 );
1618 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
1621 for( ; remainder && j<jend; ++j ) {
1622 y[i ] += A(i ,j) * x[j];
1623 y[i+1UL] += A(i+1UL,j) * x[j];
1629 const size_t jbegin( ( IsUpper<MT1>::value )
1630 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1632 const size_t jend( ( IsLower<MT1>::value )
1633 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
1637 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1646 const IntrinsicType x1( x.load(j ) );
1647 const IntrinsicType x2( x.load(j1) );
1648 const IntrinsicType x3( x.load(j2) );
1649 const IntrinsicType x4( x.load(j3) );
1650 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
1655 const IntrinsicType x1( x.load(j ) );
1656 const IntrinsicType x2( x.load(j1) );
1657 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
1661 const IntrinsicType x1( x.load(j) );
1662 y[i] +=
sum( A.load(i,j) * x1 );
1665 for( ; remainder && j<jend; ++j ) {
1666 y[i] += A(i,j) * x[j];
1687 template<
typename VT1
1690 static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1691 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1693 selectLargeAddAssignKernel( y, A, x );
1713 template<
typename VT1
1716 static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
1717 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1721 if( IsTriangular<MT1>::value ) {
1723 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1724 addAssign( y, tmp );
1727 gemv( y, A, x, ET(1), ET(1) );
1751 template<
typename VT1 >
1752 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
1758 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1762 LT A(
serial( rhs.mat_ ) );
1763 RT x(
serial( rhs.vec_ ) );
1770 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1786 template<
typename VT1
1789 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1791 if( ( IsDiagonal<MT1>::value ) ||
1792 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1793 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
1794 selectSmallSubAssignKernel( y, A, x );
1796 selectBlasSubAssignKernel( y, A, x );
1815 template<
typename VT1
1818 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1820 y.subAssign( A * x );
1839 template<
typename VT1
1842 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1843 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1845 selectDefaultSubAssignKernel( y, A, x );
1864 template<
typename VT1
1867 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1868 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1870 typedef IntrinsicTrait<ElementType> IT;
1872 const size_t M( A.rows() );
1873 const size_t N( A.columns() );
1875 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
1879 for( ; (i+8UL) <= M; i+=8UL )
1881 const size_t jbegin( ( IsUpper<MT1>::value )
1882 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1884 const size_t jend( ( IsLower<MT1>::value )
1885 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
1889 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1892 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1896 const IntrinsicType x1( x.load(j) );
1897 xmm1 = xmm1 + A.load(i ,j) * x1;
1898 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1899 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1900 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1901 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1902 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1903 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1904 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1907 y[i ] -=
sum( xmm1 );
1908 y[i+1UL] -=
sum( xmm2 );
1909 y[i+2UL] -=
sum( xmm3 );
1910 y[i+3UL] -=
sum( xmm4 );
1911 y[i+4UL] -=
sum( xmm5 );
1912 y[i+5UL] -=
sum( xmm6 );
1913 y[i+6UL] -=
sum( xmm7 );
1914 y[i+7UL] -=
sum( xmm8 );
1916 for( ; remainder && j<jend; ++j ) {
1917 y[i ] -= A(i ,j) * x[j];
1918 y[i+1UL] -= A(i+1UL,j) * x[j];
1919 y[i+2UL] -= A(i+2UL,j) * x[j];
1920 y[i+3UL] -= A(i+3UL,j) * x[j];
1921 y[i+4UL] -= A(i+4UL,j) * x[j];
1922 y[i+5UL] -= A(i+5UL,j) * x[j];
1923 y[i+6UL] -= A(i+6UL,j) * x[j];
1924 y[i+7UL] -= A(i+7UL,j) * x[j];
1928 for( ; (i+4UL) <= M; i+=4UL )
1930 const size_t jbegin( ( IsUpper<MT1>::value )
1931 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1933 const size_t jend( ( IsLower<MT1>::value )
1934 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
1938 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1941 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1945 const IntrinsicType x1( x.load(j) );
1946 xmm1 = xmm1 + A.load(i ,j) * x1;
1947 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1948 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1949 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1952 y[i ] -=
sum( xmm1 );
1953 y[i+1UL] -=
sum( xmm2 );
1954 y[i+2UL] -=
sum( xmm3 );
1955 y[i+3UL] -=
sum( xmm4 );
1957 for( ; remainder && j<jend; ++j ) {
1958 y[i ] -= A(i ,j) * x[j];
1959 y[i+1UL] -= A(i+1UL,j) * x[j];
1960 y[i+2UL] -= A(i+2UL,j) * x[j];
1961 y[i+3UL] -= A(i+3UL,j) * x[j];
1965 for( ; (i+3UL) <= M; i+=3UL )
1967 const size_t jbegin( ( IsUpper<MT1>::value )
1968 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
1970 const size_t jend( ( IsLower<MT1>::value )
1971 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
1975 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
1978 IntrinsicType xmm1, xmm2, xmm3;
1982 const IntrinsicType x1( x.load(j) );
1983 xmm1 = xmm1 + A.load(i ,j) * x1;
1984 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1985 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1988 y[i ] -=
sum( xmm1 );
1989 y[i+1UL] -=
sum( xmm2 );
1990 y[i+2UL] -=
sum( xmm3 );
1992 for( ; remainder && j<jend; ++j ) {
1993 y[i ] -= A(i ,j) * x[j];
1994 y[i+1UL] -= A(i+1UL,j) * x[j];
1995 y[i+2UL] -= A(i+2UL,j) * x[j];
1999 for( ; (i+2UL) <= M; i+=2UL )
2001 const size_t jbegin( ( IsUpper<MT1>::value )
2002 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2004 const size_t jend( ( IsLower<MT1>::value )
2005 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2009 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
2012 IntrinsicType xmm1, xmm2;
2016 const IntrinsicType x1( x.load(j) );
2017 xmm1 = xmm1 + A.load(i ,j) * x1;
2018 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2021 y[i ] -=
sum( xmm1 );
2022 y[i+1UL] -=
sum( xmm2 );
2024 for( ; remainder && j<jend; ++j ) {
2025 y[i ] -= A(i ,j) * x[j];
2026 y[i+1UL] -= A(i+1UL,j) * x[j];
2032 const size_t jbegin( ( IsUpper<MT1>::value )
2033 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2035 const size_t jend( ( IsLower<MT1>::value )
2036 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2040 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
2047 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2050 y[i] -=
sum( xmm1 );
2052 for( ; remainder && j<jend; ++j ) {
2053 y[i] -= A(i,j) * x[j];
// Large-kernel subtraction assignment, fallback overload.
// Constrained by DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >; it only
// forwards y, A and x to selectDefaultSubAssignKernel().
// NOTE(review): the remaining template parameters (presumably MT1 and VT2) and
// the function braces are not visible in this excerpt.
2074 template<
typename VT1
2077 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
2078 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2080 selectDefaultSubAssignKernel( y, A, x );
// Large-kernel subtraction assignment, vectorized overload.
// Constrained by EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >. Subtracts
// the product of A and x from y (y -= A * x) using intrinsic loads (A.load(),
// x.load()) that are reduced with sum(). The rows of A are processed in blocks
// of 8, 4 and 2 rows, followed by single-row handling.
// For every row block:
//  - jbegin: first column; for upper matrices the start index is masked with
//    size_t(-IT::size) (rounds down to a multiple of IT::size, assuming IT::size
//    is a power of two);
//  - jend:   column bound; for lower matrices it is limited by the row block;
//  - jpos:   jend masked the same way when a scalar tail loop is required.
// NOTE(review): the column loop headers, the declarations of i, j, j1, j2, j3 and
// the non-triangular branches of jbegin/jend are not visible in this excerpt.
2099 template<
typename VT1
2102 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
2103 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2105 typedef IntrinsicTrait<ElementType> IT;
2107 const size_t M( A.rows() );
2108 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector type are padded.
2110 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// Row blocks of eight rows (i .. i+7).
2114 for( ; (i+8UL) <= M; i+=8UL )
2116 const size_t jbegin( ( IsUpper<MT1>::value )
2117 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2119 const size_t jend( ( IsLower<MT1>::value )
2120 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
2124 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Four intrinsic vectors of x (at j, j1, j2, j3) per step.
2133 const IntrinsicType x1( x.load(j ) );
2134 const IntrinsicType x2( x.load(j1) );
2135 const IntrinsicType x3( x.load(j2) );
2136 const IntrinsicType x4( x.load(j3) );
2137 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2138 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2139 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2140 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2141 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
2142 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
2143 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
2144 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two intrinsic vectors of x (at j, j1) per step.
2149 const IntrinsicType x1( x.load(j ) );
2150 const IntrinsicType x2( x.load(j1) );
2151 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2152 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2153 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2154 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2155 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
2156 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
2157 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
2158 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One intrinsic vector of x (at j) per step.
2162 const IntrinsicType x1( x.load(j) );
2163 y[i ] -=
sum( A.load(i ,j) * x1 );
2164 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2165 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2166 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
2167 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 );
2168 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 );
2169 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 );
2170 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail: element-wise handling of the columns from j up to jend.
2173 for( ; remainder && j<jend; ++j ) {
2174 y[i ] -= A(i ,j) * x[j];
2175 y[i+1UL] -= A(i+1UL,j) * x[j];
2176 y[i+2UL] -= A(i+2UL,j) * x[j];
2177 y[i+3UL] -= A(i+3UL,j) * x[j];
2178 y[i+4UL] -= A(i+4UL,j) * x[j];
2179 y[i+5UL] -= A(i+5UL,j) * x[j];
2180 y[i+6UL] -= A(i+6UL,j) * x[j];
2181 y[i+7UL] -= A(i+7UL,j) * x[j];
// Row blocks of four rows (i .. i+3).
2185 for( ; (i+4UL) <= M; i+=4UL )
2187 const size_t jbegin( ( IsUpper<MT1>::value )
2188 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2190 const size_t jend( ( IsLower<MT1>::value )
2191 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
2195 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
2204 const IntrinsicType x1( x.load(j ) );
2205 const IntrinsicType x2( x.load(j1) );
2206 const IntrinsicType x3( x.load(j2) );
2207 const IntrinsicType x4( x.load(j3) );
2208 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2209 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2210 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
2211 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
2216 const IntrinsicType x1( x.load(j ) );
2217 const IntrinsicType x2( x.load(j1) );
2218 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2219 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2220 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
2221 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
2225 const IntrinsicType x1( x.load(j) );
2226 y[i ] -=
sum( A.load(i ,j) * x1 );
2227 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
2228 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 );
2229 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 );
// Scalar tail for the four-row block.
2232 for( ; remainder && j<jend; ++j ) {
2233 y[i ] -= A(i ,j) * x[j];
2234 y[i+1UL] -= A(i+1UL,j) * x[j];
2235 y[i+2UL] -= A(i+2UL,j) * x[j];
2236 y[i+3UL] -= A(i+3UL,j) * x[j];
// Row blocks of two rows (i, i+1).
2240 for( ; (i+2UL) <= M; i+=2UL )
2242 const size_t jbegin( ( IsUpper<MT1>::value )
2243 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2245 const size_t jend( ( IsLower<MT1>::value )
2246 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
2250 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
2259 const IntrinsicType x1( x.load(j ) );
2260 const IntrinsicType x2( x.load(j1) );
2261 const IntrinsicType x3( x.load(j2) );
2262 const IntrinsicType x4( x.load(j3) );
2263 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
2264 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
2269 const IntrinsicType x1( x.load(j ) );
2270 const IntrinsicType x2( x.load(j1) );
2271 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
2272 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
2276 const IntrinsicType x1( x.load(j) );
2277 y[i ] -=
sum( A.load(i ,j) * x1 );
2278 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 );
// Scalar tail for the two-row block.
2281 for( ; remainder && j<jend; ++j ) {
2282 y[i ] -= A(i ,j) * x[j];
2283 y[i+1UL] -= A(i+1UL,j) * x[j];
// Single-row handling (the enclosing condition/loop header is not visible in this excerpt).
2289 const size_t jbegin( ( IsUpper<MT1>::value )
2290 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
2292 const size_t jend( ( IsLower<MT1>::value )
2293 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
2297 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
2306 const IntrinsicType x1( x.load(j ) );
2307 const IntrinsicType x2( x.load(j1) );
2308 const IntrinsicType x3( x.load(j2) );
2309 const IntrinsicType x4( x.load(j3) );
2310 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
2315 const IntrinsicType x1( x.load(j ) );
2316 const IntrinsicType x2( x.load(j1) );
2317 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
2321 const IntrinsicType x1( x.load(j) );
2322 y[i] -=
sum( A.load(i,j) * x1 );
// Scalar tail for the single row.
2325 for( ; remainder && j<jend; ++j ) {
2326 y[i] -= A(i,j) * x[j];
// BLAS-level subtraction assignment, fallback overload.
// Constrained by DisableIf< UseBlasKernel<VT1,MT1,VT2> >; it forwards y, A and x
// to selectLargeSubAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
2347 template<
typename VT1
2350 static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
2351 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2353 selectLargeSubAssignKernel( y, A, x );
// BLAS-level subtraction assignment, BLAS overload.
// Constrained by EnableIf< UseBlasKernel<VT1,MT1,VT2> >.
//  - Triangular matrix types: trmv() is applied to a temporary `tmp` with the
//    matching CblasLower/CblasUpper flag, and the result is subtracted from y via
//    subAssign(). NOTE(review): the declaration of `tmp` is not visible here;
//    presumably it is initialised from x - confirm.
//  - Otherwise: gemv() is called with the factors ET(-1) and ET(1); presumably
//    these are the scaling factors for the product and for y, i.e. y = y - A*x.
// NOTE(review): the `ET` typedef and the else branch line are not visible in this excerpt.
2373 template<
typename VT1
2376 static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2> >::Type
2377 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
2381 if( IsTriangular<MT1>::value ) {
2383 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2384 subAssign( y, tmp );
2387 gemv( y, A, x, ET(-1), ET(1) );
// Multiplication assignment of a dense matrix / dense vector product to a dense
// (non-transposed) vector.
// The product expression is first evaluated serially into a temporary of type
// ResultType; the temporary is then passed to multAssign() together with the
// unwrapped left-hand side (~lhs).
// \param lhs The target dense vector.
// \param rhs The matrix/vector multiplication expression.
2411 template<
typename VT1 >
2412 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DMatDVecMultExpr& rhs )
2422 const ResultType tmp(
serial( rhs ) );
2423 multAssign( ~lhs, tmp );
// Friend function that is only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt. The visible body treats a matrix operand with zero rows and one with
// zero columns as two separate special cases.
2447 template<
typename VT1 >
2448 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2455 if( rhs.mat_.rows() == 0UL ) {
2458 else if( rhs.mat_.columns() == 0UL ) {
// Friend function that is only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt. The visible body evaluates the expression `rhs` into a temporary of
// type ResultType (without a serial() wrapper).
2491 template<
typename VT1 >
2492 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2503 const ResultType tmp( rhs );
// Friend function that is only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt. The visible body has a single special case for an empty matrix
// operand (zero rows or zero columns).
2524 template<
typename VT1 >
2525 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2532 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Friend function that is only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt. The visible body has a single special case for an empty matrix
// operand (zero rows or zero columns).
2568 template<
typename VT1 >
2569 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2576 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Friend function that is only enabled when UseSMPAssign<VT1> holds.
// NOTE(review): the function name and parameter list are not visible in this
// excerpt. The visible body evaluates the expression `rhs` into a temporary of
// type ResultType (without a serial() wrapper).
2612 template<
typename VT1 >
2613 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2624 const ResultType tmp( rhs );
// Head of the DVecScalarMultExpr variant for a scaled dense matrix / dense vector
// product, i.e. an expression of the form (A * x) * scalar.
// It derives publicly from DenseVector (passing its own type) and privately from
// the tag classes VecScalarMultExpr and Computation.
// NOTE(review): the remaining template parameters (presumably VT and ST) and the
// class-head line itself are not visible in this excerpt.
2663 template<
typename MT
2667 :
public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
2668 ,
private VecScalarMultExpr
2669 ,
private Computation
// Shorthand for the wrapped matrix/vector multiplication expression type.
2673 typedef DMatDVecMultExpr<MT,VT> MVM;
// Compilation switch for the left-hand side dense matrix operand. It is set when
// either
//  - MT is a computation, the element types MET and VET are identical and MET is
//    BLAS compatible, or
//  - MT requires an intermediate evaluation.
// The switch feeds the LT typedef, UseSMPAssign and smpAssignable.
2685 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2686 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compilation switch for the right-hand side dense vector operand. It is set when
// the vector type VT is a computation or requires an intermediate evaluation.
// The evaluation requirement must be queried for the vector type VT (not for the
// matrix type MT, which is already covered by evaluateMatrix); the switch selects
// between `const VRT` and `VCT` for the RT typedef.
2691 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Helper structure for the selection of the SMP assignment strategy.
// `value` is set as soon as either operand needs to be evaluated (evaluateMatrix
// or evaluateVector); the template parameter T1 is not used by the visible
// expression.
2699 template<
typename T1 >
2700 struct UseSMPAssign {
2701 enum { value = ( evaluateMatrix || evaluateVector ) };
// Helper structure deciding whether the BLAS kernel may be used for the target
// vector type T1, matrix type T2, vector type T3 and scalar type T4.
// The visible conditions require:
//  - mutable data access for T1 and constant data access for T2 and T3,
//  - a non-diagonal matrix type T2,
//  - all three types to be vectorizable,
//  - BLAS compatible and identical element types for T1, T2 and T3,
//  - and exclude the combination of a built-in element type with a complex
//    scalar type T4.
// NOTE(review): the first line of the enum (its name and any leading condition)
// is not visible in this excerpt.
2709 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2710 struct UseBlasKernel {
2712 HasMutableDataAccess<T1>::value &&
2713 HasConstDataAccess<T2>::value &&
2714 HasConstDataAccess<T3>::value &&
2715 !IsDiagonal<T2>::value &&
2716 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2717 IsBlasCompatible<typename T1::ElementType>::value &&
2718 IsBlasCompatible<typename T2::ElementType>::value &&
2719 IsBlasCompatible<typename T3::ElementType>::value &&
2720 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
2721 IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
2722 !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
// Helper structure deciding whether the vectorized default kernels may be used
// for the target vector type T1, matrix type T2, vector type T3 and scalar type T4.
// The visible conditions require:
//  - a non-diagonal matrix type T2,
//  - all three types to be vectorizable,
//  - identical element types for T1, T2 and T3, which must also equal T4,
//  - intrinsic addition and multiplication support for that element type.
// NOTE(review): the first line of the enum (its name and any leading condition)
// is not visible in this excerpt.
2731 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2732 struct UseVectorizedDefaultKernel {
2734 !IsDiagonal<T2>::value &&
2735 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2736 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2737 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2738 IsSame<typename T1::ElementType,T4>::value &&
2739 IntrinsicTrait<typename T1::ElementType>::addition &&
2740 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this DVecScalarMultExpr instance.
2746 typedef DVecScalarMultExpr<MVM,ST,false>
This;
// Result type: multiplication trait of RES and the scalar type ST.
2747 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
2750 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Left-hand side operand of the scaling: the matrix/vector product expression.
2755 typedef const DMatDVecMultExpr<MT,VT>
LeftOperand;
// Matrix operand type chosen via evaluateMatrix between `const MRT` and `MCT`
// (presumably `const MRT` when the switch is set - confirm against SelectType).
2761 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
LT;
// Vector operand type chosen via evaluateVector between `const VRT` and `VCT`
// (presumably `const VRT` when the switch is set - confirm against SelectType).
2764 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
RT;
// Compilation switch for the expression template evaluation strategy.
// The expression is vectorizable when the matrix type is not diagonal, both
// operand types are vectorizable, the element types MET and VET are identical,
// MET equals the scalar type ST, and intrinsic addition and multiplication are
// available for MET.
2769 enum { vectorizable = !IsDiagonal<MT>::value &&
2770 MT::vectorizable && VT::vectorizable &&
2771 IsSame<MET,VET>::value &&
2772 IsSame<MET,ST>::value &&
2773 IntrinsicTrait<MET>::addition &&
2774 IntrinsicTrait<MET>::multiplication };
// Compilation switch for the expression template assignment strategy.
// The expression is SMP assignable only when neither operand has to be evaluated
// (evaluateMatrix and evaluateVector are both unset) and both operand types are
// SMP assignable themselves.
2777 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2778 !evaluateVector && VT::smpAssignable };
// Constructor for the scaled matrix/vector product expression.
// \param vector The matrix/vector multiplication expression (left-hand side of the scaling).
// \param scalar The scalar factor (right-hand side of the scaling).
// NOTE(review): the initialiser list and body are not visible in this excerpt;
// presumably they store the arguments in vector_ and scalar_.
2787 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
// Subscript operator for direct access to the vector elements.
// \param index Access index.
// \return The element of the wrapped product at `index`, multiplied by the scalar.
// No bounds check is visible in this excerpt; at() is the checked accessor.
2799 inline ReturnType
operator[](
size_t index )
const {
2801 return vector_[index] * scalar_;
// Checked access to the vector elements.
// \param index Access index.
// \return The result of the subscript operator for `index`.
// The index is compared against vector_.size() first.
// NOTE(review): the statement executed for an out-of-range index is not visible
// in this excerpt (presumably an exception is thrown - confirm).
2812 inline ReturnType
at(
size_t index )
const {
2813 if( index >= vector_.size() ) {
2816 return (*
this)[index];
// Returns the current size of the vector, i.e. the size of the wrapped
// matrix/vector product expression.
2825 inline size_t size()
const {
2826 return vector_.size();
// Returns whether the expression can alias with the given address.
// The query is delegated to the wrapped matrix/vector product expression.
// \param alias The alias to be checked.
2856 template<
typename T >
2857 inline bool canAlias(
const T* alias )
const {
2858 return vector_.canAlias( alias );
// Returns whether the expression is aliased with the given address.
// The query is delegated to the wrapped matrix/vector product expression.
// \param alias The alias to be checked.
2868 template<
typename T >
2869 inline bool isAliased(
const T* alias )
const {
2870 return vector_.isAliased( alias );
// Alignment query: forwards to the wrapped matrix/vector product expression.
// NOTE(review): the enclosing function header is not visible in this excerpt
// (presumably isAligned() - confirm).
2880 return vector_.isAligned();
// SMP assignment query.
// NOTE(review): the enclosing function header and the first part of the
// condition are not visible in this excerpt (presumably canSMPAssign() - confirm).
// The visible part fetches the matrix operand A of the wrapped product and
// requires that (a) the matrix is an unevaluated computation or has fewer than
// DMATDVECMULT_THRESHOLD elements (further alternatives may precede these), and
// (b) the vector size exceeds SMP_DMATDVECMULT_THRESHOLD.
2890 typename MVM::LeftOperand A( vector_.leftOperand() );
2892 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2893 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) ) &&
2894 (
size() > SMP_DMATDVECMULT_THRESHOLD );
// Left-hand side operand of the scaling: the matrix/vector product expression.
2900 LeftOperand vector_;
// Right-hand side operand of the scaling: the scalar factor.
2901 RightOperand scalar_;
// Assignment of a scaled dense matrix / dense vector product to a dense
// (non-transposed) vector, i.e. y = s * A * x.
// \param lhs The target dense vector.
// \param rhs The scaled multiplication expression to be assigned.
// The matrix and vector operands of the wrapped product are extracted first.
// A matrix with zero rows and one with zero columns are handled as separate
// special cases before the kernel dispatch.
// NOTE(review): the bodies of the special cases and the construction of A and x
// (presumably from `left` and `right`) are not visible in this excerpt.
2916 template<
typename VT1 >
2917 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2923 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2924 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2926 if( left.rows() == 0UL ) {
2929 else if( left.columns() == 0UL ) {
2942 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel selection for the assignment y = s * A * x.
// The small kernel is chosen when the matrix type is diagonal, when the matrix
// operand is a computation that is not evaluated, or when the matrix has fewer
// than DMATDVECMULT_THRESHOLD elements. The BLAS kernel dispatch follows.
// NOTE(review): the remaining template parameters and the `else` line between
// the two calls are not visible in this excerpt.
// \param y      The target vector.
// \param A      The matrix operand.
// \param x      The vector operand.
// \param scalar The scaling factor.
2957 template<
typename VT1
2961 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2963 if( ( IsDiagonal<MT1>::value ) ||
2964 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2965 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
2966 selectSmallAssignKernel( y, A, x, scalar );
2968 selectBlasAssignKernel( y, A, x, scalar );
// Default assignment kernel for y = s * A * x.
// Constrained by DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >; it
// assigns the expression `A * x * scalar` to y via y.assign().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
2986 template<
typename VT1
2990 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2991 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2993 y.assign( A * x * scalar );
// Small-kernel assignment for y = s * A * x, fallback overload.
// Constrained by DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >; it
// forwards all arguments to selectDefaultAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
3011 template<
typename VT1
3015 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3016 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3018 selectDefaultAssignKernel( y, A, x, scalar );
// Small-kernel assignment for y = s * A * x, vectorized overload.
// Constrained by EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >.
// The rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by
// single-row handling. For each row an intrinsic accumulator (xmm*) collects
// A.load(row,j) * x.load(j); the accumulator is then reduced with sum(), scaled
// by `scalar` and stored in y. A scalar tail loop adds the remaining columns,
// each product scaled by `scalar`.
// For every row block:
//  - jbegin: first column; for upper matrices the start index is masked with
//    size_t(-IT::size) (rounds down to a multiple of IT::size, assuming IT::size
//    is a power of two);
//  - jend:   column bound; for lower matrices it is limited by the row block;
//  - jpos:   jend masked the same way when a scalar tail loop is required.
// NOTE(review): the column loop headers, the declarations of i and j and the
// non-triangular branches of jbegin/jend are not visible in this excerpt.
3036 template<
typename VT1
3040 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3041 selectSmallAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3043 typedef IntrinsicTrait<ElementType> IT;
3045 const size_t M( A.rows() );
3046 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector type are padded.
3048 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// Row blocks of eight rows (i .. i+7).
3052 for( ; (i+8UL) <= M; i+=8UL )
3054 const size_t jbegin( ( IsUpper<MT1>::value )
3055 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3057 const size_t jend( ( IsLower<MT1>::value )
3058 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3062 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// One accumulator per row of the block.
3065 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3069 const IntrinsicType x1( x.load(j) );
3070 xmm1 = xmm1 + A.load(i ,j) * x1;
3071 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3072 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3073 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3074 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3075 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3076 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3077 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Reduce, scale and store (plain assignment: y is overwritten).
3080 y[i ] =
sum( xmm1 ) * scalar;
3081 y[i+1UL] =
sum( xmm2 ) * scalar;
3082 y[i+2UL] =
sum( xmm3 ) * scalar;
3083 y[i+3UL] =
sum( xmm4 ) * scalar;
3084 y[i+4UL] =
sum( xmm5 ) * scalar;
3085 y[i+5UL] =
sum( xmm6 ) * scalar;
3086 y[i+6UL] =
sum( xmm7 ) * scalar;
3087 y[i+7UL] =
sum( xmm8 ) * scalar;
// Scalar tail: element-wise handling of the columns from j up to jend.
3089 for( ; remainder && j<jend; ++j ) {
3090 y[i ] += A(i ,j) * x[j] * scalar;
3091 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3092 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3093 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3094 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3095 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3096 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3097 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// Row blocks of four rows (i .. i+3).
3101 for( ; (i+4UL) <= M; i+=4UL )
3103 const size_t jbegin( ( IsUpper<MT1>::value )
3104 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3106 const size_t jend( ( IsLower<MT1>::value )
3107 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3111 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3114 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3118 const IntrinsicType x1( x.load(j) );
3119 xmm1 = xmm1 + A.load(i ,j) * x1;
3120 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3121 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3122 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3125 y[i ] =
sum( xmm1 ) * scalar;
3126 y[i+1UL] =
sum( xmm2 ) * scalar;
3127 y[i+2UL] =
sum( xmm3 ) * scalar;
3128 y[i+3UL] =
sum( xmm4 ) * scalar;
// Scalar tail for the four-row block.
3130 for( ; remainder && j<jend; ++j ) {
3131 y[i ] += A(i ,j) * x[j] * scalar;
3132 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3133 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3134 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// Row blocks of three rows (i .. i+2).
3138 for( ; (i+3UL) <= M; i+=3UL )
3140 const size_t jbegin( ( IsUpper<MT1>::value )
3141 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3143 const size_t jend( ( IsLower<MT1>::value )
3144 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3148 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3151 IntrinsicType xmm1, xmm2, xmm3;
3155 const IntrinsicType x1( x.load(j) );
3156 xmm1 = xmm1 + A.load(i ,j) * x1;
3157 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3158 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3161 y[i ] =
sum( xmm1 ) * scalar;
3162 y[i+1UL] =
sum( xmm2 ) * scalar;
3163 y[i+2UL] =
sum( xmm3 ) * scalar;
// Scalar tail for the three-row block.
3165 for( ; remainder && j<jend; ++j ) {
3166 y[i ] += A(i ,j) * x[j] * scalar;
3167 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3168 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// Row blocks of two rows (i, i+1).
3172 for( ; (i+2UL) <= M; i+=2UL )
3174 const size_t jbegin( ( IsUpper<MT1>::value )
3175 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3177 const size_t jend( ( IsLower<MT1>::value )
3178 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3182 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3185 IntrinsicType xmm1, xmm2;
3189 const IntrinsicType x1( x.load(j) );
3190 xmm1 = xmm1 + A.load(i ,j) * x1;
3191 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3194 y[i ] =
sum( xmm1 ) * scalar;
3195 y[i+1UL] =
sum( xmm2 ) * scalar;
// Scalar tail for the two-row block.
3197 for( ; remainder && j<jend; ++j ) {
3198 y[i ] += A(i ,j) * x[j] * scalar;
3199 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// Single-row handling (the enclosing condition/loop header and the declaration
// of xmm1 are not visible in this excerpt).
3205 const size_t jbegin( ( IsUpper<MT1>::value )
3206 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3208 const size_t jend( ( IsLower<MT1>::value )
3209 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3213 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3220 xmm1 = xmm1 + A.load(i,j) * x.load(j);
3223 y[i] =
sum( xmm1 ) * scalar;
// Scalar tail for the single row.
3225 for( ; remainder && j<jend; ++j ) {
3226 y[i] += A(i,j) * x[j] * scalar;
// Large-kernel assignment for y = s * A * x, fallback overload.
// Constrained by DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >; it
// forwards all arguments to selectDefaultAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
3246 template<
typename VT1
3250 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3251 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3253 selectDefaultAssignKernel( y, A, x, scalar );
// Large-kernel assignment for y = s * A * x, vectorized overload.
// Constrained by EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >.
// The rows of A are processed in blocks of 8, 4 and 2 rows, followed by
// single-row handling. Within a row block the columns are consumed with four,
// two and one intrinsic vectors of x per step; the per-row products are reduced
// with sum() and accumulated into y. A scalar tail loop adds the remaining columns.
// NOTE(review): the visible statements only accumulate (`+=`) and never multiply
// by `scalar`; presumably y is reset beforehand and each row block is scaled by
// `scalar` in lines that are not part of this excerpt - confirm against the full file.
// NOTE(review): the column loop headers, the declarations of i, j, j1, j2, j3 and
// the non-triangular branches of jbegin/jend are not visible either.
3271 template<
typename VT1
3275 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3276 selectLargeAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3278 typedef IntrinsicTrait<ElementType> IT;
3280 const size_t M( A.rows() );
3281 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector type are padded.
3283 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// Row blocks of eight rows (i .. i+7).
3289 for( ; (i+8UL) <= M; i+=8UL )
3291 const size_t jbegin( ( IsUpper<MT1>::value )
3292 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3294 const size_t jend( ( IsLower<MT1>::value )
3295 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3299 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Four intrinsic vectors of x (at j, j1, j2, j3) per step.
3308 const IntrinsicType x1( x.load(j ) );
3309 const IntrinsicType x2( x.load(j1) );
3310 const IntrinsicType x3( x.load(j2) );
3311 const IntrinsicType x4( x.load(j3) );
3312 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3313 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3314 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3315 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3316 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 );
3317 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 );
3318 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 );
3319 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 );
// Two intrinsic vectors of x (at j, j1) per step.
3324 const IntrinsicType x1( x.load(j ) );
3325 const IntrinsicType x2( x.load(j1) );
3326 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3327 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3328 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3329 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3330 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 );
3331 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 );
3332 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 );
3333 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 );
// One intrinsic vector of x (at j) per step.
3337 const IntrinsicType x1( x.load(j) );
3338 y[i ] +=
sum( A.load(i ,j) * x1 );
3339 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3340 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3341 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
3342 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 );
3343 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 );
3344 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 );
3345 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 );
// Scalar tail: element-wise handling of the columns from j up to jend.
3348 for( ; remainder && j<jend; ++j ) {
3349 y[i ] += A(i ,j) * x[j];
3350 y[i+1UL] += A(i+1UL,j) * x[j];
3351 y[i+2UL] += A(i+2UL,j) * x[j];
3352 y[i+3UL] += A(i+3UL,j) * x[j];
3353 y[i+4UL] += A(i+4UL,j) * x[j];
3354 y[i+5UL] += A(i+5UL,j) * x[j];
3355 y[i+6UL] += A(i+6UL,j) * x[j];
3356 y[i+7UL] += A(i+7UL,j) * x[j];
// Row blocks of four rows (i .. i+3).
3369 for( ; (i+4UL) <= M; i+=4UL )
3371 const size_t jbegin( ( IsUpper<MT1>::value )
3372 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3374 const size_t jend( ( IsLower<MT1>::value )
3375 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3379 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3388 const IntrinsicType x1( x.load(j ) );
3389 const IntrinsicType x2( x.load(j1) );
3390 const IntrinsicType x3( x.load(j2) );
3391 const IntrinsicType x4( x.load(j3) );
3392 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3393 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3394 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 );
3395 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 );
3400 const IntrinsicType x1( x.load(j ) );
3401 const IntrinsicType x2( x.load(j1) );
3402 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3403 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3404 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 );
3405 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 );
3409 const IntrinsicType x1( x.load(j) );
3410 y[i ] +=
sum( A.load(i ,j) * x1 );
3411 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
3412 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 );
3413 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 );
// Scalar tail for the four-row block.
3416 for( ; remainder && j<jend; ++j ) {
3417 y[i ] += A(i ,j) * x[j];
3418 y[i+1UL] += A(i+1UL,j) * x[j];
3419 y[i+2UL] += A(i+2UL,j) * x[j];
3420 y[i+3UL] += A(i+3UL,j) * x[j];
// Row blocks of two rows (i, i+1).
3429 for( ; (i+2UL) <= M; i+=2UL )
3431 const size_t jbegin( ( IsUpper<MT1>::value )
3432 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3434 const size_t jend( ( IsLower<MT1>::value )
3435 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3439 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3448 const IntrinsicType x1( x.load(j ) );
3449 const IntrinsicType x2( x.load(j1) );
3450 const IntrinsicType x3( x.load(j2) );
3451 const IntrinsicType x4( x.load(j3) );
3452 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 );
3453 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 );
3458 const IntrinsicType x1( x.load(j ) );
3459 const IntrinsicType x2( x.load(j1) );
3460 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 );
3461 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 );
3465 const IntrinsicType x1( x.load(j) );
3466 y[i ] +=
sum( A.load(i ,j) * x1 );
3467 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 );
// Scalar tail for the two-row block.
3470 for( ; remainder && j<jend; ++j ) {
3471 y[i ] += A(i ,j) * x[j];
3472 y[i+1UL] += A(i+1UL,j) * x[j];
// Single-row handling (the enclosing condition/loop header is not visible in this excerpt).
3481 const size_t jbegin( ( IsUpper<MT1>::value )
3482 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3484 const size_t jend( ( IsLower<MT1>::value )
3485 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3489 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3498 const IntrinsicType x1( x.load(j ) );
3499 const IntrinsicType x2( x.load(j1) );
3500 const IntrinsicType x3( x.load(j2) );
3501 const IntrinsicType x4( x.load(j3) );
3502 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 );
3507 const IntrinsicType x1( x.load(j ) );
3508 const IntrinsicType x2( x.load(j1) );
3509 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 );
3513 const IntrinsicType x1( x.load(j) );
3514 y[i] +=
sum( A.load(i,j) * x1 );
// Scalar tail for the single row.
3517 for( ; remainder && j<jend; ++j ) {
3518 y[i] += A(i,j) * x[j];
// BLAS-level assignment for y = s * A * x, fallback overload.
// Constrained by DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >; it forwards all
// arguments to selectLargeAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
3540 template<
typename VT1
3544 static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
3545 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3547 selectLargeAssignKernel( y, A, x, scalar );
// BLAS-level assignment for y = s * A * x, BLAS overload.
// Constrained by EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >.
//  - Triangular matrix types: y is first assigned the scaled vector `scalar * x`,
//    then trmv() is applied to y with the matching CblasLower/CblasUpper flag.
//  - Otherwise: gemv() is called with the factors ET(scalar) and ET(0);
//    presumably these are the scaling factors for the product and for y, i.e.
//    y = scalar * A * x.
// NOTE(review): the `ET` typedef and the else branch line are not visible in this excerpt.
3566 template<
typename VT1
3570 static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
3571 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3575 if( IsTriangular<MT1>::value ) {
3576 assign( y, scalar * x );
3577 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
3580 gemv( y, A, x, ET(scalar), ET(0) );
// Assignment of a scaled dense matrix / dense vector product to a sparse
// (non-transposed) vector.
// The expression is first evaluated serially into a temporary of type
// ResultType; the temporary is then assigned to the unwrapped left-hand side (~lhs).
// \param lhs The target sparse vector.
// \param rhs The scaled multiplication expression to be assigned.
3598 template<
typename VT1 >
3599 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3609 const ResultType tmp(
serial( rhs ) );
3610 assign( ~lhs, tmp );
// Addition assignment of a scaled dense matrix / dense vector product to a dense
// (non-transposed) vector, i.e. y += s * A * x.
// \param lhs The target dense vector.
// \param rhs The scaled multiplication expression to be added.
// The matrix and vector operands of the wrapped product are extracted first; an
// empty matrix (zero rows or zero columns) is handled as a special case before
// the kernel dispatch.
// NOTE(review): the body of the special case and the construction of A and x
// (presumably from `left` and `right`) are not visible in this excerpt.
3626 template<
typename VT1 >
3627 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3633 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3634 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3636 if( left.rows() == 0UL || left.columns() == 0UL ) {
3648 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel selection for the addition assignment y += s * A * x.
// The small kernel is chosen when the matrix type is diagonal, when the matrix
// operand is a computation that is not evaluated, or when the matrix has fewer
// than DMATDVECMULT_THRESHOLD elements. The BLAS kernel dispatch follows.
// NOTE(review): the remaining template parameters and the `else` line between
// the two calls are not visible in this excerpt.
// \param y      The target vector.
// \param A      The matrix operand.
// \param x      The vector operand.
// \param scalar The scaling factor.
3663 template<
typename VT1
3667 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3669 if( ( IsDiagonal<MT1>::value ) ||
3670 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3671 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
3672 selectSmallAddAssignKernel( y, A, x, scalar );
3674 selectBlasAddAssignKernel( y, A, x, scalar );
// Default addition assignment kernel for y += s * A * x.
// It adds the expression `A * x * scalar` to y via y.addAssign(). Unlike the
// other kernels, the visible declaration carries no EnableIf/DisableIf constraint.
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
3692 template<
typename VT1
3696 static inline void selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3698 y.addAssign( A * x * scalar );
// Small-kernel addition assignment for y += s * A * x, fallback overload.
// Constrained by DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >; it
// forwards all arguments to selectDefaultAddAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
3716 template<
typename VT1
3720 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3721 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3723 selectDefaultAddAssignKernel( y, A, x, scalar );
// Small-kernel addition assignment for y += s * A * x, vectorized overload.
// Constrained by EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >.
// The rows of A are processed in blocks of 8, 4, 3 and 2 rows, followed by
// single-row handling. For each row an intrinsic accumulator (xmm*) collects
// A.load(row,j) * x.load(j); the accumulator is then reduced with sum(), scaled
// by `scalar` and added to y. A scalar tail loop adds the remaining columns,
// each product scaled by `scalar`.
// For every row block:
//  - jbegin: first column; for upper matrices the start index is masked with
//    size_t(-IT::size) (rounds down to a multiple of IT::size, assuming IT::size
//    is a power of two);
//  - jend:   column bound; for lower matrices it is limited by the row block;
//  - jpos:   jend masked the same way when a scalar tail loop is required.
// NOTE(review): the column loop headers, the declarations of i and j and the
// non-triangular branches of jbegin/jend are not visible in this excerpt.
3741 template<
typename VT1
3745 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3746 selectSmallAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3748 typedef IntrinsicTrait<ElementType> IT;
3750 const size_t M( A.rows() );
3751 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector type are padded.
3753 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// Row blocks of eight rows (i .. i+7).
3757 for( ; (i+8UL) <= M; i+=8UL )
3759 const size_t jbegin( ( IsUpper<MT1>::value )
3760 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3762 const size_t jend( ( IsLower<MT1>::value )
3763 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
3767 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// One accumulator per row of the block.
3770 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3774 const IntrinsicType x1( x.load(j) );
3775 xmm1 = xmm1 + A.load(i ,j) * x1;
3776 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3777 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3778 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3779 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
3780 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
3781 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
3782 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Reduce, scale and accumulate into y (y keeps its previous contents).
3785 y[i ] +=
sum( xmm1 ) * scalar;
3786 y[i+1UL] +=
sum( xmm2 ) * scalar;
3787 y[i+2UL] +=
sum( xmm3 ) * scalar;
3788 y[i+3UL] +=
sum( xmm4 ) * scalar;
3789 y[i+4UL] +=
sum( xmm5 ) * scalar;
3790 y[i+5UL] +=
sum( xmm6 ) * scalar;
3791 y[i+6UL] +=
sum( xmm7 ) * scalar;
3792 y[i+7UL] +=
sum( xmm8 ) * scalar;
// Scalar tail: element-wise handling of the columns from j up to jend.
3794 for( ; remainder && j<jend; ++j ) {
3795 y[i ] += A(i ,j) * x[j] * scalar;
3796 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3797 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3798 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
3799 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
3800 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
3801 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
3802 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// Row blocks of four rows (i .. i+3).
3806 for( ; (i+4UL) <= M; i+=4UL )
3808 const size_t jbegin( ( IsUpper<MT1>::value )
3809 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3811 const size_t jend( ( IsLower<MT1>::value )
3812 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
3816 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3819 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3823 const IntrinsicType x1( x.load(j) );
3824 xmm1 = xmm1 + A.load(i ,j) * x1;
3825 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3826 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3827 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3830 y[i ] +=
sum( xmm1 ) * scalar;
3831 y[i+1UL] +=
sum( xmm2 ) * scalar;
3832 y[i+2UL] +=
sum( xmm3 ) * scalar;
3833 y[i+3UL] +=
sum( xmm4 ) * scalar;
// Scalar tail for the four-row block.
3835 for( ; remainder && j<jend; ++j ) {
3836 y[i ] += A(i ,j) * x[j] * scalar;
3837 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3838 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
3839 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// Row blocks of three rows (i .. i+2).
3843 for( ; (i+3UL) <= M; i+=3UL )
3845 const size_t jbegin( ( IsUpper<MT1>::value )
3846 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3848 const size_t jend( ( IsLower<MT1>::value )
3849 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
3853 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3856 IntrinsicType xmm1, xmm2, xmm3;
3860 const IntrinsicType x1( x.load(j) );
3861 xmm1 = xmm1 + A.load(i ,j) * x1;
3862 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3863 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3866 y[i ] +=
sum( xmm1 ) * scalar;
3867 y[i+1UL] +=
sum( xmm2 ) * scalar;
3868 y[i+2UL] +=
sum( xmm3 ) * scalar;
// Scalar tail for the three-row block.
3870 for( ; remainder && j<jend; ++j ) {
3871 y[i ] += A(i ,j) * x[j] * scalar;
3872 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
3873 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
// Row blocks of two rows (i, i+1).
3877 for( ; (i+2UL) <= M; i+=2UL )
3879 const size_t jbegin( ( IsUpper<MT1>::value )
3880 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3882 const size_t jend( ( IsLower<MT1>::value )
3883 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
3887 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3890 IntrinsicType xmm1, xmm2;
3894 const IntrinsicType x1( x.load(j) );
3895 xmm1 = xmm1 + A.load(i ,j) * x1;
3896 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3899 y[i ] +=
sum( xmm1 ) * scalar;
3900 y[i+1UL] +=
sum( xmm2 ) * scalar;
// Scalar tail for the two-row block.
3902 for( ; remainder && j<jend; ++j ) {
3903 y[i ] += A(i ,j) * x[j] * scalar;
3904 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// Single-row handling (the enclosing condition/loop header and the declaration
// of xmm1 are not visible in this excerpt).
3910 const size_t jbegin( ( IsUpper<MT1>::value )
3911 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
3913 const size_t jend( ( IsLower<MT1>::value )
3914 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
3918 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
3925 xmm1 = xmm1 + A.load(i,j) * x.load(j);
3928 y[i] +=
sum( xmm1 ) * scalar;
// Scalar tail for the single row.
3930 for( ; remainder && j<jend; ++j ) {
3931 y[i] += A(i,j) * x[j] * scalar;
// Large-kernel addition assignment for y += s * A * x, fallback overload.
// Constrained by DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >; it
// forwards all arguments to selectDefaultAddAssignKernel().
// NOTE(review): the remaining template parameters and the function braces are
// not visible in this excerpt.
3951 template<
typename VT1
3955 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3956 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3958 selectDefaultAddAssignKernel( y, A, x, scalar );
// Vectorized overload of the "large" addition-assignment kernel: every visible update
// has the form y[i] += sum( A.load(i,j) * x.load(j) + ... ) * scalar, i.e. the scaled
// row-times-vector product is added to y. SFINAE (EnableIf) selects it when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true.
//
// Visible structure:
//  - rows are handled in blocks of 8, 4 and 2, followed by single rows;
//  - per row block the column range is bounded by jbegin/jend, which are narrowed for
//    upper/lower triangular matrices (the non-triangular branches are not shown);
//  - columns are consumed four, two and one SIMD vector(s) at a time via the indices
//    j, j1, j2, j3 (their definitions and the loop headers are not shown);
//  - a scalar tail loop runs up to jend when either operand is not padded.
// NOTE(review): lines are missing from this listing (gaps in the embedded numbers),
// including the declarations of i and j, the column loop headers and all braces.
3976 template<
typename VT1
3980 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3981 selectLargeAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3983 typedef IntrinsicTrait<ElementType> IT;
3985 const size_t M( A.rows() );
3986 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector are padded.
3988 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// --- Blocks of eight rows ---
3992 for( ; (i+8UL) <= M; i+=8UL )
// Upper matrices: start at the diagonal of row i (one past it if strictly upper), masked
// with size_t(-IT::size) - a round-down to a multiple of IT::size, assuming IT::size is a
// power of two (TODO confirm).
3994 const size_t jbegin( ( IsUpper<MT1>::value )
3995 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
// Lower matrices: stop right after the diagonal of the last row in the block (one column
// earlier if strictly lower).
3997 const size_t jend( ( IsLower<MT1>::value )
3998 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// End of the SIMD part: jend masked to a multiple of IT::size when a tail loop exists.
4002 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Four SIMD vectors of x per step.
4011 const IntrinsicType x1( x.load(j ) );
4012 const IntrinsicType x2( x.load(j1) );
4013 const IntrinsicType x3( x.load(j2) );
4014 const IntrinsicType x4( x.load(j3) );
4015 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4016 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4017 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4018 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4019 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4020 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4021 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4022 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD vectors of x per step.
4027 const IntrinsicType x1( x.load(j ) );
4028 const IntrinsicType x2( x.load(j1) );
4029 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4030 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4031 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4032 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4033 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4034 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4035 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4036 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD vector of x per step.
4040 const IntrinsicType x1( x.load(j) );
4041 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4042 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4043 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4044 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4045 y[i+4UL] +=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4046 y[i+5UL] +=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4047 y[i+6UL] +=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4048 y[i+7UL] +=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar tail for the columns between the SIMD part and jend (unpadded operands only).
4051 for( ; remainder && j<jend; ++j ) {
4052 y[i ] += A(i ,j) * x[j] * scalar;
4053 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4054 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4055 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
4056 y[i+4UL] += A(i+4UL,j) * x[j] * scalar;
4057 y[i+5UL] += A(i+5UL,j) * x[j] * scalar;
4058 y[i+6UL] += A(i+6UL,j) * x[j] * scalar;
4059 y[i+7UL] += A(i+7UL,j) * x[j] * scalar;
// --- Blocks of four rows (same scheme as above) ---
4063 for( ; (i+4UL) <= M; i+=4UL )
4065 const size_t jbegin( ( IsUpper<MT1>::value )
4066 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4068 const size_t jend( ( IsLower<MT1>::value )
4069 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4073 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4082 const IntrinsicType x1( x.load(j ) );
4083 const IntrinsicType x2( x.load(j1) );
4084 const IntrinsicType x3( x.load(j2) );
4085 const IntrinsicType x4( x.load(j3) );
4086 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4087 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4088 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4089 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4094 const IntrinsicType x1( x.load(j ) );
4095 const IntrinsicType x2( x.load(j1) );
4096 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4097 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4098 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4099 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4103 const IntrinsicType x1( x.load(j) );
4104 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4105 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4106 y[i+2UL] +=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4107 y[i+3UL] +=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4110 for( ; remainder && j<jend; ++j ) {
4111 y[i ] += A(i ,j) * x[j] * scalar;
4112 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
4113 y[i+2UL] += A(i+2UL,j) * x[j] * scalar;
4114 y[i+3UL] += A(i+3UL,j) * x[j] * scalar;
// --- Blocks of two rows (same scheme as above) ---
4118 for( ; (i+2UL) <= M; i+=2UL )
4120 const size_t jbegin( ( IsUpper<MT1>::value )
4121 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4123 const size_t jend( ( IsLower<MT1>::value )
4124 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4128 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4137 const IntrinsicType x1( x.load(j ) );
4138 const IntrinsicType x2( x.load(j1) );
4139 const IntrinsicType x3( x.load(j2) );
4140 const IntrinsicType x4( x.load(j3) );
4141 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4142 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4147 const IntrinsicType x1( x.load(j ) );
4148 const IntrinsicType x2( x.load(j1) );
4149 y[i ] +=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4150 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4154 const IntrinsicType x1( x.load(j) );
4155 y[i ] +=
sum( A.load(i ,j) * x1 ) * scalar;
4156 y[i+1UL] +=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4159 for( ; remainder && j<jend; ++j ) {
4160 y[i ] += A(i ,j) * x[j] * scalar;
4161 y[i+1UL] += A(i+1UL,j) * x[j] * scalar;
// --- Single row (the enclosing control statement is on a line not shown) ---
4167 const size_t jbegin( ( IsUpper<MT1>::value )
4168 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4170 const size_t jend( ( IsLower<MT1>::value )
4171 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4175 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4184 const IntrinsicType x1( x.load(j ) );
4185 const IntrinsicType x2( x.load(j1) );
4186 const IntrinsicType x3( x.load(j2) );
4187 const IntrinsicType x4( x.load(j3) );
4188 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4193 const IntrinsicType x1( x.load(j ) );
4194 const IntrinsicType x2( x.load(j1) );
4195 y[i] +=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4199 const IntrinsicType x1( x.load(j) );
4200 y[i] +=
sum( A.load(i,j) * x1 ) * scalar;
4203 for( ; remainder && j<jend; ++j ) {
4204 y[i] += A(i,j) * x[j] * scalar;
// Fallback overload of the BLAS-based addition-assignment kernel. SFINAE (DisableIf)
// selects it when UseBlasKernel<VT1,MT1,VT2,ST2> is false; it forwards the call
// unchanged to selectLargeAddAssignKernel( y, A, x, scalar ).
// NOTE(review): remaining template parameters and braces are on lines missing from
// this listing.
4224 template<
typename VT1
4228 static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4229 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4231 selectLargeAddAssignKernel( y, A, x, scalar );
// BLAS-based addition-assignment kernel, selected (EnableIf) when
// UseBlasKernel<VT1,MT1,VT2,ST2> is true.
//  - Triangular MT1: trmv() is applied to a temporary 'tmp' with the lower/upper flag
//    chosen from IsLower<MT1>, and the result is added to y via addAssign().
//    NOTE(review): 'tmp' is declared on a line missing from this listing - presumably it
//    holds the scaled copy of x; confirm against the full source.
//  - Otherwise: gemv() is called with the factors ET(scalar) and ET(1) - presumably
//    y = scalar*A*x + 1*y; the typedef ET and the 'else' line are not shown.
4250 template<
typename VT1
4254 static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4255 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4259 if( IsTriangular<MT1>::value ) {
4261 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4262 addAssign( y, tmp );
4265 gemv( y, A, x, ET(scalar), ET(1) );
// Subtraction assignment of a scaled dense matrix/dense vector product to a dense
// column vector (lhs -= rhs).
// The visible part fetches both operands of the wrapped multiplication (rhs.vector_),
// tests for an empty matrix (zero rows or zero columns) and finally dispatches to
// selectSubAssignKernel() with the scaling factor rhs.scalar_.
// NOTE(review): the statement executed for the empty case and the definitions of A and x
// are on lines missing from this listing.
4287 template<
typename VT1 >
4288 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4294 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
4295 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
4297 if( left.rows() == 0UL || left.columns() == 0UL ) {
4309 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for the subtraction assignment of the scaled product.
// The "small" kernel is chosen when any of the following holds:
//  - MT1 is a diagonal matrix,
//  - the matrix operand MT is a computation that is not evaluated up front,
//  - the matrix has fewer than DMATDVECMULT_THRESHOLD elements (rows * columns).
// Otherwise the BLAS kernel selection is used (the 'else' line is not shown in this
// listing).
4324 template<
typename VT1
4328 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4330 if( ( IsDiagonal<MT1>::value ) ||
4331 ( IsComputation<MT>::value && !evaluateMatrix ) ||
4332 ( A.rows() * A.columns() < DMATDVECMULT_THRESHOLD ) )
4333 selectSmallSubAssignKernel( y, A, x, scalar );
4335 selectBlasSubAssignKernel( y, A, x, scalar );
// Default (non-specialized) subtraction-assignment kernel: hands the expression
// A * x * scalar to y.subAssign() and lets the target vector evaluate it.
// NOTE(review): remaining template parameters and braces are on lines missing from
// this listing.
4353 template<
typename VT1
4357 static inline void selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4359 y.subAssign( A * x * scalar );
// Non-vectorized overload of the "small" subtraction-assignment kernel. SFINAE
// (DisableIf) selects it when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is false;
// it forwards unchanged to selectDefaultSubAssignKernel( y, A, x, scalar ).
// NOTE(review): remaining template parameters and braces are on lines missing from
// this listing.
4377 template<
typename VT1
4381 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4382 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4384 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of the "small" subtraction-assignment kernel, selected (EnableIf)
// when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true.
//
// Visible structure:
//  - rows are handled in blocks of 8, 4, 3 and 2, followed by single rows;
//  - each row of a block owns one SIMD accumulator (xmm1..xmm8) that collects
//    A.load(row,j) * x.load(j); afterwards y[row] -= sum( xmmN ) * scalar;
//  - the column range is bounded by jbegin/jend, narrowed for upper/lower triangular
//    matrices (the non-triangular branches are not shown);
//  - a scalar tail loop runs up to jend when either operand is not padded.
// NOTE(review): lines are missing from this listing (gaps in the embedded numbers),
// including the declarations of i and j, the SIMD column loop headers and all braces.
4402 template<
typename VT1
4406 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4407 selectSmallSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4409 typedef IntrinsicTrait<ElementType> IT;
4411 const size_t M( A.rows() );
4412 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector are padded.
4414 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// --- Blocks of eight rows ---
4418 for( ; (i+8UL) <= M; i+=8UL )
// Upper matrices: start at the diagonal of row i (one past it if strictly upper), masked
// with size_t(-IT::size) - a round-down to a multiple of IT::size, assuming IT::size is a
// power of two (TODO confirm).
4420 const size_t jbegin( ( IsUpper<MT1>::value )
4421 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
// Lower matrices: stop right after the diagonal of the last row in the block (one column
// earlier if strictly lower).
4423 const size_t jend( ( IsLower<MT1>::value )
4424 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// End of the SIMD part: jend masked to a multiple of IT::size when a tail loop exists.
4428 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4431 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// One SIMD vector of x is loaded per step and reused for all eight rows.
4435 const IntrinsicType x1( x.load(j) );
4436 xmm1 = xmm1 + A.load(i ,j) * x1;
4437 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4438 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4439 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4440 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
4441 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
4442 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
4443 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Horizontal reduction of each accumulator, scaled and subtracted from the target.
4446 y[i ] -=
sum( xmm1 ) * scalar;
4447 y[i+1UL] -=
sum( xmm2 ) * scalar;
4448 y[i+2UL] -=
sum( xmm3 ) * scalar;
4449 y[i+3UL] -=
sum( xmm4 ) * scalar;
4450 y[i+4UL] -=
sum( xmm5 ) * scalar;
4451 y[i+5UL] -=
sum( xmm6 ) * scalar;
4452 y[i+6UL] -=
sum( xmm7 ) * scalar;
4453 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Scalar tail for the columns between the SIMD part and jend (unpadded operands only).
4455 for( ; remainder && j<jend; ++j ) {
4456 y[i ] -= A(i ,j) * x[j] * scalar;
4457 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4458 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4459 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4460 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4461 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4462 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4463 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// --- Blocks of four rows (same scheme as above) ---
4467 for( ; (i+4UL) <= M; i+=4UL )
4469 const size_t jbegin( ( IsUpper<MT1>::value )
4470 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4472 const size_t jend( ( IsLower<MT1>::value )
4473 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4477 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4480 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4484 const IntrinsicType x1( x.load(j) );
4485 xmm1 = xmm1 + A.load(i ,j) * x1;
4486 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4487 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4488 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
4491 y[i ] -=
sum( xmm1 ) * scalar;
4492 y[i+1UL] -=
sum( xmm2 ) * scalar;
4493 y[i+2UL] -=
sum( xmm3 ) * scalar;
4494 y[i+3UL] -=
sum( xmm4 ) * scalar;
4496 for( ; remainder && j<jend; ++j ) {
4497 y[i ] -= A(i ,j) * x[j] * scalar;
4498 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4499 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4500 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// --- Blocks of three rows (same scheme as above) ---
4504 for( ; (i+3UL) <= M; i+=3UL )
4506 const size_t jbegin( ( IsUpper<MT1>::value )
4507 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4509 const size_t jend( ( IsLower<MT1>::value )
4510 ?( IsStrictlyLower<MT1>::value ? i+2UL : i+3UL )
4514 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4517 IntrinsicType xmm1, xmm2, xmm3;
4521 const IntrinsicType x1( x.load(j) );
4522 xmm1 = xmm1 + A.load(i ,j) * x1;
4523 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4524 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
4527 y[i ] -=
sum( xmm1 ) * scalar;
4528 y[i+1UL] -=
sum( xmm2 ) * scalar;
4529 y[i+2UL] -=
sum( xmm3 ) * scalar;
4531 for( ; remainder && j<jend; ++j ) {
4532 y[i ] -= A(i ,j) * x[j] * scalar;
4533 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4534 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
// --- Blocks of two rows (same scheme as above) ---
4538 for( ; (i+2UL) <= M; i+=2UL )
4540 const size_t jbegin( ( IsUpper<MT1>::value )
4541 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4543 const size_t jend( ( IsLower<MT1>::value )
4544 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4548 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4551 IntrinsicType xmm1, xmm2;
4555 const IntrinsicType x1( x.load(j) );
4556 xmm1 = xmm1 + A.load(i ,j) * x1;
4557 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
4560 y[i ] -=
sum( xmm1 ) * scalar;
4561 y[i+1UL] -=
sum( xmm2 ) * scalar;
4563 for( ; remainder && j<jend; ++j ) {
4564 y[i ] -= A(i ,j) * x[j] * scalar;
4565 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// --- Single row (the enclosing control statement and the declaration of xmm1 are on
// lines not shown) ---
4571 const size_t jbegin( ( IsUpper<MT1>::value )
4572 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4574 const size_t jend( ( IsLower<MT1>::value )
4575 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4579 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4586 xmm1 = xmm1 + A.load(i,j) * x.load(j);
4589 y[i] -=
sum( xmm1 ) * scalar;
4591 for( ; remainder && j<jend; ++j ) {
4592 y[i] -= A(i,j) * x[j] * scalar;
// Non-vectorized overload of the "large" subtraction-assignment kernel. SFINAE
// (DisableIf) selects it when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is false;
// it forwards unchanged to selectDefaultSubAssignKernel( y, A, x, scalar ).
// NOTE(review): remaining template parameters and braces are on lines missing from
// this listing.
4612 template<
typename VT1
4616 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4617 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4619 selectDefaultSubAssignKernel( y, A, x, scalar );
// Vectorized overload of the "large" subtraction-assignment kernel: every visible update
// has the form y[i] -= sum( A.load(i,j) * x.load(j) + ... ) * scalar, i.e. the scaled
// row-times-vector product is subtracted from y. SFINAE (EnableIf) selects it when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true.
//
// Visible structure:
//  - rows are handled in blocks of 8, 4 and 2, followed by single rows;
//  - per row block the column range is bounded by jbegin/jend, which are narrowed for
//    upper/lower triangular matrices (the non-triangular branches are not shown);
//  - columns are consumed four, two and one SIMD vector(s) at a time via the indices
//    j, j1, j2, j3 (their definitions and the loop headers are not shown);
//  - a scalar tail loop runs up to jend when either operand is not padded.
// NOTE(review): lines are missing from this listing (gaps in the embedded numbers),
// including the declarations of i and j, the column loop headers and all braces.
4637 template<
typename VT1
4641 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
4642 selectLargeSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4644 typedef IntrinsicTrait<ElementType> IT;
4646 const size_t M( A.rows() );
4647 const size_t N( A.columns() );
// A scalar tail loop is needed unless both the matrix and the vector are padded.
4649 const bool remainder( !IsPadded<MT1>::value || !IsPadded<VT2>::value );
// --- Blocks of eight rows ---
4653 for( ; (i+8UL) <= M; i+=8UL )
// Upper matrices: start at the diagonal of row i (one past it if strictly upper), masked
// with size_t(-IT::size) - a round-down to a multiple of IT::size, assuming IT::size is a
// power of two (TODO confirm).
4655 const size_t jbegin( ( IsUpper<MT1>::value )
4656 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
// Lower matrices: stop right after the diagonal of the last row in the block (one column
// earlier if strictly lower).
4658 const size_t jend( ( IsLower<MT1>::value )
4659 ?( IsStrictlyLower<MT1>::value ? i+7UL : i+8UL )
// End of the SIMD part: jend masked to a multiple of IT::size when a tail loop exists.
4663 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
// Four SIMD vectors of x per step.
4672 const IntrinsicType x1( x.load(j ) );
4673 const IntrinsicType x2( x.load(j1) );
4674 const IntrinsicType x3( x.load(j2) );
4675 const IntrinsicType x4( x.load(j3) );
4676 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4677 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4678 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4679 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4680 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 + A.load(i+4UL,j2) * x3 + A.load(i+4UL,j3) * x4 ) * scalar;
4681 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 + A.load(i+5UL,j2) * x3 + A.load(i+5UL,j3) * x4 ) * scalar;
4682 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 + A.load(i+6UL,j2) * x3 + A.load(i+6UL,j3) * x4 ) * scalar;
4683 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 + A.load(i+7UL,j2) * x3 + A.load(i+7UL,j3) * x4 ) * scalar;
// Two SIMD vectors of x per step.
4688 const IntrinsicType x1( x.load(j ) );
4689 const IntrinsicType x2( x.load(j1) );
4690 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4691 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4692 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4693 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4694 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 + A.load(i+4UL,j1) * x2 ) * scalar;
4695 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 + A.load(i+5UL,j1) * x2 ) * scalar;
4696 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 + A.load(i+6UL,j1) * x2 ) * scalar;
4697 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 + A.load(i+7UL,j1) * x2 ) * scalar;
// One SIMD vector of x per step.
4701 const IntrinsicType x1( x.load(j) );
4702 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4703 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4704 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4705 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4706 y[i+4UL] -=
sum( A.load(i+4UL,j) * x1 ) * scalar;
4707 y[i+5UL] -=
sum( A.load(i+5UL,j) * x1 ) * scalar;
4708 y[i+6UL] -=
sum( A.load(i+6UL,j) * x1 ) * scalar;
4709 y[i+7UL] -=
sum( A.load(i+7UL,j) * x1 ) * scalar;
// Scalar tail for the columns between the SIMD part and jend (unpadded operands only).
4712 for( ; remainder && j<jend; ++j ) {
4713 y[i ] -= A(i ,j) * x[j] * scalar;
4714 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4715 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4716 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
4717 y[i+4UL] -= A(i+4UL,j) * x[j] * scalar;
4718 y[i+5UL] -= A(i+5UL,j) * x[j] * scalar;
4719 y[i+6UL] -= A(i+6UL,j) * x[j] * scalar;
4720 y[i+7UL] -= A(i+7UL,j) * x[j] * scalar;
// --- Blocks of four rows (same scheme as above) ---
4724 for( ; (i+4UL) <= M; i+=4UL )
4726 const size_t jbegin( ( IsUpper<MT1>::value )
4727 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4729 const size_t jend( ( IsLower<MT1>::value )
4730 ?( IsStrictlyLower<MT1>::value ? i+3UL : i+4UL )
4734 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4743 const IntrinsicType x1( x.load(j ) );
4744 const IntrinsicType x2( x.load(j1) );
4745 const IntrinsicType x3( x.load(j2) );
4746 const IntrinsicType x4( x.load(j3) );
4747 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4748 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4749 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 + A.load(i+2UL,j2) * x3 + A.load(i+2UL,j3) * x4 ) * scalar;
4750 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 + A.load(i+3UL,j2) * x3 + A.load(i+3UL,j3) * x4 ) * scalar;
4755 const IntrinsicType x1( x.load(j ) );
4756 const IntrinsicType x2( x.load(j1) );
4757 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4758 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4759 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 + A.load(i+2UL,j1) * x2 ) * scalar;
4760 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 + A.load(i+3UL,j1) * x2 ) * scalar;
4764 const IntrinsicType x1( x.load(j) );
4765 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4766 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4767 y[i+2UL] -=
sum( A.load(i+2UL,j) * x1 ) * scalar;
4768 y[i+3UL] -=
sum( A.load(i+3UL,j) * x1 ) * scalar;
4771 for( ; remainder && j<jend; ++j ) {
4772 y[i ] -= A(i ,j) * x[j] * scalar;
4773 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
4774 y[i+2UL] -= A(i+2UL,j) * x[j] * scalar;
4775 y[i+3UL] -= A(i+3UL,j) * x[j] * scalar;
// --- Blocks of two rows (same scheme as above) ---
4779 for( ; (i+2UL) <= M; i+=2UL )
4781 const size_t jbegin( ( IsUpper<MT1>::value )
4782 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4784 const size_t jend( ( IsLower<MT1>::value )
4785 ?( IsStrictlyLower<MT1>::value ? i+1UL : i+2UL )
4789 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4798 const IntrinsicType x1( x.load(j ) );
4799 const IntrinsicType x2( x.load(j1) );
4800 const IntrinsicType x3( x.load(j2) );
4801 const IntrinsicType x4( x.load(j3) );
4802 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 + A.load(i ,j2) * x3 + A.load(i ,j3) * x4 ) * scalar;
4803 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 + A.load(i+1UL,j2) * x3 + A.load(i+1UL,j3) * x4 ) * scalar;
4808 const IntrinsicType x1( x.load(j ) );
4809 const IntrinsicType x2( x.load(j1) );
4810 y[i ] -=
sum( A.load(i ,j) * x1 + A.load(i ,j1) * x2 ) * scalar;
4811 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 + A.load(i+1UL,j1) * x2 ) * scalar;
4815 const IntrinsicType x1( x.load(j) );
4816 y[i ] -=
sum( A.load(i ,j) * x1 ) * scalar;
4817 y[i+1UL] -=
sum( A.load(i+1UL,j) * x1 ) * scalar;
4820 for( ; remainder && j<jend; ++j ) {
4821 y[i ] -= A(i ,j) * x[j] * scalar;
4822 y[i+1UL] -= A(i+1UL,j) * x[j] * scalar;
// --- Single row (the enclosing control statement is on a line not shown) ---
4828 const size_t jbegin( ( IsUpper<MT1>::value )
4829 ?( ( IsStrictlyUpper<MT1>::value ? i+1UL : i ) &
size_t(-
IT::size) )
4831 const size_t jend( ( IsLower<MT1>::value )
4832 ?( IsStrictlyLower<MT1>::value ? i : i+1UL )
4836 const size_t jpos( remainder ? ( jend &
size_t(-
IT::size) ) : jend );
4845 const IntrinsicType x1( x.load(j ) );
4846 const IntrinsicType x2( x.load(j1) );
4847 const IntrinsicType x3( x.load(j2) );
4848 const IntrinsicType x4( x.load(j3) );
4849 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 + A.load(i,j2) * x3 + A.load(i,j3) * x4 ) * scalar;
4854 const IntrinsicType x1( x.load(j ) );
4855 const IntrinsicType x2( x.load(j1) );
4856 y[i] -=
sum( A.load(i,j) * x1 + A.load(i,j1) * x2 ) * scalar;
4860 const IntrinsicType x1( x.load(j) );
4861 y[i] -=
sum( A.load(i,j) * x1 ) * scalar;
4864 for( ; remainder && j<jend; ++j ) {
4865 y[i] -= A(i,j) * x[j] * scalar;
// Fallback overload of the BLAS-based subtraction-assignment kernel. SFINAE (DisableIf)
// selects it when UseBlasKernel<VT1,MT1,VT2,ST2> is false; it forwards the call
// unchanged to selectLargeSubAssignKernel( y, A, x, scalar ).
// NOTE(review): remaining template parameters and braces are on lines missing from
// this listing.
4885 template<
typename VT1
4889 static inline typename DisableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4890 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4892 selectLargeSubAssignKernel( y, A, x, scalar );
// BLAS-based subtraction-assignment kernel, selected (EnableIf) when
// UseBlasKernel<VT1,MT1,VT2,ST2> is true.
//  - Triangular MT1: trmv() is applied to a temporary 'tmp' with the lower/upper flag
//    chosen from IsLower<MT1>, and the result is subtracted from y via subAssign().
//    NOTE(review): 'tmp' is declared on a line missing from this listing - presumably it
//    holds the scaled copy of x; confirm against the full source.
//  - Otherwise: gemv() is called with the negated factor ET(-scalar) and ET(1) -
//    presumably y = -scalar*A*x + 1*y; the typedef ET and the 'else' line are not shown.
4911 template<
typename VT1
4915 static inline typename EnableIf< UseBlasKernel<VT1,MT1,VT2,ST2> >::Type
4916 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
4920 if( IsTriangular<MT1>::value ) {
4922 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4923 subAssign( y, tmp );
4926 gemv( y, A, x, ET(-scalar), ET(1) );
// Multiplication assignment of a scaled dense matrix/dense vector product to a dense
// column vector (lhs *= rhs).
// The expression is first evaluated serially into a temporary of type ResultType; the
// temporary is then multiplication-assigned to the target vector.
// NOTE(review): further statements of the body may be on lines missing from this listing.
4948 template<
typename VT1 >
4949 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4959 const ResultType tmp(
serial( rhs ) );
4960 multAssign( ~lhs, tmp );
// SMP assignment of a scaled dense matrix/dense vector product to a dense column vector;
// only available when UseSMPAssign<VT1> holds (EnableIf).
// The visible part fetches both operands of the wrapped multiplication (rhs.vector_) and
// distinguishes a matrix without rows from a matrix without columns.
// NOTE(review): the statements executed in those branches and the actual assignment are
// on lines missing from this listing.
4982 template<
typename VT1 >
4983 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
4984 smpAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
4990 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
4991 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
4993 if( left.rows() == 0UL ) {
4996 else if( left.columns() == 0UL ) {
// SMP assignment of a scaled dense matrix/dense vector product to a sparse column vector;
// only available when UseSMPAssign<VT1> holds (EnableIf).
// The visible part evaluates the expression into a temporary of type ResultType.
// NOTE(review): presumably the temporary is then SMP-assigned to lhs - that statement is
// on a line missing from this listing; confirm against the full source.
5027 template<
typename VT1 >
5028 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5029 smpAssign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5039 const ResultType tmp( rhs );
// SMP addition assignment of a scaled dense matrix/dense vector product to a dense
// column vector; only available when UseSMPAssign<VT1> holds (EnableIf).
// The visible part fetches both operands of the wrapped multiplication (rhs.vector_) and
// tests for an empty matrix (zero rows or zero columns).
// NOTE(review): the statement executed for the empty case and the actual addition
// assignment are on lines missing from this listing.
5058 template<
typename VT1 >
5059 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5060 smpAddAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5066 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5067 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5069 if( left.rows() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled dense matrix/dense vector product to a dense
// column vector; only available when UseSMPAssign<VT1> holds (EnableIf).
// The visible part fetches both operands of the wrapped multiplication (rhs.vector_) and
// tests for an empty matrix (zero rows or zero columns).
// NOTE(review): the statement executed for the empty case and the actual subtraction
// assignment are on lines missing from this listing.
5103 template<
typename VT1 >
5104 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5105 smpSubAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5111 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
5112 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
5114 if( left.rows() == 0UL || left.columns() == 0UL ) {
// SMP multiplication assignment of a scaled dense matrix/dense vector product to a dense
// column vector; only available when UseSMPAssign<VT1> holds (EnableIf).
// The visible part evaluates the expression into a temporary of type ResultType.
// NOTE(review): presumably the temporary is then SMP-multiplication-assigned to lhs - that
// statement is on a line missing from this listing; confirm against the full source.
5148 template<
typename VT1 >
5149 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5150 smpMultAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
5160 const ResultType tmp( rhs );
// Return type of a function template that yields a DMatDVecMultExpr<T1,T2>; SFINAE
// (DisableIf) removes it when T1 is a matrix/matrix multiplication expression.
// NOTE(review): the function name, its parameters and its body are on lines missing from
// this listing - presumably this is the dense matrix/dense vector operator*; confirm.
5222 template<
typename T1
5224 inline const typename DisableIf< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >::Type
// Restructuring overload, enabled (EnableIf) only when T1 is a matrix/matrix
// multiplication expression: instead of evaluating ( A * B ) * v, the product is
// re-associated to A * ( B * v ), with A and B taken from the left and right operand of
// the matrix expression. The return type is determined by MultExprTrait<T1,T2>.
// NOTE(review): the function name and parameter list are on lines missing from this
// listing - presumably operator*( mat, vec ); confirm against the full source.
5259 template<
typename T1
5262 inline const typename EnableIf< IsMatMatMultExpr<T1>,
typename MultExprTrait<T1,T2>::Type >::Type
5269 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
5284 template<
typename MT,
typename VT >
// Type trait specialization deriving from IsTrue<...>: the value is true only if both
// the matrix type MT and the vector type VT are aligned (And< IsAligned<MT>, IsAligned<VT> >).
// NOTE(review): the 'struct' line naming the specialized trait is missing from this
// listing - presumably IsAligned< DMatDVecMultExpr<MT,VT> >; confirm.
5301 template<
typename MT,
typename VT >
5303 :
public IsTrue< And< IsAligned<MT>, IsAligned<VT> >::value >
// Expression trait specialization: the nested type 'Type' is the multiplication
// expression type of a submatrix of (const) MT and a subvector of (const) VT, both taken
// with the alignment flag AF.
// NOTE(review): the 'struct' line naming the specialized trait is missing from this
// listing - presumably SubvectorExprTrait< DMatDVecMultExpr<MT,VT>, AF >; confirm.
5319 template<
typename MT,
typename VT,
bool AF >
5324 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type
5325 ,
typename SubvectorExprTrait<const VT,AF>::Type >::Type Type;
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
Compile time check for low-level access to constant data. This type trait tests whether the given data...
Definition: HasConstDataAccess.h:75
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:212
Header file for the Rows type trait.
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:329
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices (\f$ A=B*C \f$).
Definition: DMatDMatMultExpr.h:7820
Header file for basic type definitions.
DMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:240
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: ColumnVector.h:79
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:119
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:125
Header file for the IsSame and IsStrictlySame type traits.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:363
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:201
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Constraints on the storage order of matrix types.
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:200
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
size_t size() const
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:309
Header file for the IsComplexDouble type trait.
Constraint on the transpose flag of vector types.
Compile time check for low-level access to mutable data. This type trait tests whether the given data ...
Definition: HasMutableDataAccess.h:75
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:130
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:127
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:261
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:218
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
Header file for the IsBlasCompatible type trait.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:129
Header file for the IsLower type trait.
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:319
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:373
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_MATVECMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid matrix/vector ...
Definition: MatVecMultExpr.h:166
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:215
Constraint on the data type.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:203
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:383
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
#define BLAZE_CONSTRAINT_MUST_NOT_BE_SYMMETRIC_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is a symmetric matrix type, a compilation error is created.
Definition: Symmetric.h:116
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: RowMajorMatrix.h:79
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Base template for the MultTrait class.
Definition: MultTrait.h:138
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: DMatDVecMultExpr.h:296
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:384
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:341
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:126
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:206
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:209
Constraint on the data type.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:353
Header file for the HasMutableDataAccess type trait.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:202
BLAZE_ALWAYS_INLINE int16_t sum(const simd_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Header file for all intrinsic functionality.
Header file for BLAS general matrix/vector multiplication functions (gemv)
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDVecMultExpr.h:204
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
BLAZE_ALWAYS_INLINE size_t columns(const Matrix< MT, SO > &matrix)
Returns the current number of columns of the matrix.
Definition: Matrix.h:324
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:205
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Constraint on the data type.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:254
Header file for the IsUpper type trait.
Header file for exception macros.
Header file for the MatVecMultExpr base class.
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:128
EnableIf< IsDenseVector< VT1 > >::Type smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:189
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.