35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
116 template<
typename VT
118 class TDVecTDMatMultExpr :
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
119 ,
private TVecMatMultExpr
120 ,
private Computation
// Compile-time helper exposing a single constant, `value`.
// `value` is the logical OR of evaluateVector and evaluateMatrix (both are
// defined outside this excerpt); T1 does not take part in the visible expression.
// NOTE(review): going by the name, this presumably gates the SMP assignment
// path; confirm against the call sites, which are not visible here.
149 template<
typename T1 >
150 struct UseSMPAssign {
151 enum { value = ( evaluateVector || evaluateMatrix ) };
// Compile-time predicate over three types. Within this file it is instantiated
// as UseBlasKernel<VT1,VT2,MT1>, i.e. with the types of the kernel arguments
// y (VT1), x (VT2) and A (MT1).
// NOTE(review): the line that opens the expression ending in `};` below is not
// visible in this excerpt, so further leading conditions may exist.
161 template<
typename T1,
typename T2,
typename T3 >
162 struct UseBlasKernel {
// Data access requirements: mutable for T1, const for T2 and T3.
164 HasMutableDataAccess<T1>::value &&
165 HasConstDataAccess<T2>::value &&
166 HasConstDataAccess<T3>::value &&
// T3 must not be a diagonal type.
167 !IsDiagonal<T3>::value &&
// All three types must report `vectorizable`.
168 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Each of the three element types must satisfy IsBlasCompatible ...
169 IsBlasCompatible<typename T1::ElementType>::value &&
170 IsBlasCompatible<typename T2::ElementType>::value &&
171 IsBlasCompatible<typename T3::ElementType>::value &&
// ... and T2's and T3's element types must both be identical to T1's.
172 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
173 IsSame< typename T1::ElementType, typename T3::ElementType >::value };
// Compile-time predicate over three types. Within this file it is used as
// UseVectorizedDefaultKernel<VT1,VT2,MT1> inside EnableIf/DisableIf to pick
// between two overloads of the small/large kernel functions.
// NOTE(review): the line that opens the expression ending in `};` below is not
// visible in this excerpt, so further leading conditions may exist.
184 template<
typename T1,
typename T2,
typename T3 >
185 struct UseVectorizedDefaultKernel {
// T3 must not be a diagonal type.
187 !IsDiagonal<T3>::value &&
// All three types must report `vectorizable`.
188 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// T2's and T3's element types must both be identical to T1's.
189 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
190 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
// IntrinsicTrait of that element type must flag both addition and multiplication.
191 IntrinsicTrait<typename T1::ElementType>::addition &&
192 IntrinsicTrait<typename T1::ElementType>::multiplication };
223 VT::vectorizable && MT::vectorizable &&
// smpAssignable holds only if neither operand is flagged for evaluation
// (evaluateVector and evaluateMatrix both false) and both operand types,
// VT and MT, report smpAssignable themselves.
229 enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
230 !evaluateMatrix && MT::smpAssignable };
262 return vec_[index] *
mat_(index,index);
272 const size_t inum( iend - ibegin );
273 const size_t ipos( ibegin + ( ( inum - 1UL ) &
size_t(-2) ) + 1UL );
275 ElementType res(
vec_[ibegin] *
mat_(ibegin,index) );
277 for(
size_t i=ibegin+1UL; i<ipos; i+=2UL ) {
281 res +=
vec_[ipos] *
mat_(ipos,index);
// Element access that first compares `index` against mat_.columns().
// NOTE(review): the statement executed inside the `index >= mat_.columns()`
// branch is not visible in this excerpt; confirm how an out-of-range index is
// reported.
295 inline ReturnType
at(
size_t index )
const {
296 if( index >=
mat_.columns() ) {
// The visible return forwards to operator[] with the same index.
299 return (*
this)[index];
309 return mat_.columns();
339 template<
typename T >
341 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
351 template<
typename T >
353 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
363 return vec_.isAligned() &&
mat_.isAligned();
375 (
mat_.rows() *
mat_.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
376 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
399 template<
typename VT1 >
406 if( rhs.mat_.rows() == 0UL ) {
410 else if( rhs.mat_.columns() == 0UL ) {
414 LT x(
serial( rhs.vec_ ) );
415 RT A(
serial( rhs.mat_ ) );
422 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Picks which kernel performs the assignment for target y, vector x and
// matrix A: either selectSmallAssignKernel or selectBlasAssignKernel.
// NOTE(review): the start of the if-condition and the `else` keyword are not
// visible in this excerpt; only the last operand of the condition is shown.
438 template<
typename VT1
441 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Visible operand of the condition: A.rows() * A.columns() is below
// TDVECTDMATMULT_THRESHOLD.
445 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
446 selectSmallAssignKernel( y, x, A );
// Presumably the alternative branch: hand over to the BLAS kernel selection.
448 selectBlasAssignKernel( y, x, A );
467 template<
typename VT1
470 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Overload of selectSmallAssignKernel whose return type is wrapped in
// DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; the file also contains
// an EnableIf-guarded overload with the same name and parameters.
// This overload only forwards its arguments to selectDefaultAssignKernel.
491 template<
typename VT1
494 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
495 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
497 selectDefaultAssignKernel( y, x, A );
516 template<
typename VT1
519 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
520 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
522 typedef IntrinsicTrait<ElementType> IT;
524 const size_t M( A.rows() );
525 const size_t N( A.columns() );
527 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
531 for( ; (j+8UL) <= N; j+=8UL )
533 const size_t ibegin( ( IsLower<MT1>::value )
534 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
536 const size_t iend( ( IsUpper<MT1>::value )
537 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
541 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
544 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
548 const IntrinsicType x1( x.load(i) );
549 xmm1 = xmm1 + x1 * A.load(i,j );
550 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
551 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
552 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
553 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
554 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
555 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
556 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
560 y[j+1UL] =
sum( xmm2 );
561 y[j+2UL] =
sum( xmm3 );
562 y[j+3UL] =
sum( xmm4 );
563 y[j+4UL] =
sum( xmm5 );
564 y[j+5UL] =
sum( xmm6 );
565 y[j+6UL] =
sum( xmm7 );
566 y[j+7UL] =
sum( xmm8 );
568 for( ; remainder && i<iend; ++i ) {
569 y[j ] += x[i] * A(i,j );
570 y[j+1UL] += x[i] * A(i,j+1UL);
571 y[j+2UL] += x[i] * A(i,j+2UL);
572 y[j+3UL] += x[i] * A(i,j+3UL);
573 y[j+4UL] += x[i] * A(i,j+4UL);
574 y[j+5UL] += x[i] * A(i,j+5UL);
575 y[j+6UL] += x[i] * A(i,j+6UL);
576 y[j+7UL] += x[i] * A(i,j+7UL);
580 for( ; (j+4UL) <= N; j+=4UL )
582 const size_t ibegin( ( IsLower<MT1>::value )
583 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
585 const size_t iend( ( IsUpper<MT1>::value )
586 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
590 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
593 IntrinsicType xmm1, xmm2, xmm3, xmm4;
597 const IntrinsicType x1( x.load(i) );
598 xmm1 = xmm1 + x1 * A.load(i,j );
599 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
600 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
601 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
605 y[j+1UL] =
sum( xmm2 );
606 y[j+2UL] =
sum( xmm3 );
607 y[j+3UL] =
sum( xmm4 );
609 for( ; remainder && i<iend; ++i ) {
610 y[j ] += x[i] * A(i,j );
611 y[j+1UL] += x[i] * A(i,j+1UL);
612 y[j+2UL] += x[i] * A(i,j+2UL);
613 y[j+3UL] += x[i] * A(i,j+3UL);
617 for( ; (j+3UL) <= N; j+=3UL )
619 const size_t ibegin( ( IsLower<MT1>::value )
620 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
622 const size_t iend( ( IsUpper<MT1>::value )
623 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
627 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
630 IntrinsicType xmm1, xmm2, xmm3;
634 const IntrinsicType x1( x.load(i) );
635 xmm1 = xmm1 + x1 * A.load(i,j );
636 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
637 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
641 y[j+1UL] =
sum( xmm2 );
642 y[j+2UL] =
sum( xmm3 );
644 for( ; remainder && i<iend; ++i ) {
645 y[j ] += x[i] * A(i,j );
646 y[j+1UL] += x[i] * A(i,j+1UL);
647 y[j+2UL] += x[i] * A(i,j+2UL);
651 for( ; (j+2UL) <= N; j+=2UL )
653 const size_t ibegin( ( IsLower<MT1>::value )
654 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
656 const size_t iend( ( IsUpper<MT1>::value )
657 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
661 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
664 IntrinsicType xmm1, xmm2;
668 const IntrinsicType x1( x.load(i) );
669 xmm1 = xmm1 + x1 * A.load(i,j );
670 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
674 y[j+1UL] =
sum( xmm2 );
676 for( ; remainder && i<iend; ++i ) {
677 y[j ] += x[i] * A(i,j );
678 y[j+1UL] += x[i] * A(i,j+1UL);
684 const size_t ibegin( ( IsLower<MT1>::value )
685 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
687 const size_t iend( ( IsUpper<MT1>::value )
688 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
692 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
699 xmm1 = xmm1 + x.load(i) * A.load(i,j);
704 for( ; remainder && i<iend; ++i ) {
705 y[j] += x[i] * A(i,j);
// Overload of selectLargeAssignKernel whose return type is wrapped in
// DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; the file also contains
// an EnableIf-guarded overload with the same name and parameters.
// This overload only forwards its arguments to selectDefaultAssignKernel.
726 template<
typename VT1
729 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
730 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
732 selectDefaultAssignKernel( y, x, A );
751 template<
typename VT1
754 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
755 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
757 typedef IntrinsicTrait<ElementType> IT;
759 const size_t M( A.rows() );
760 const size_t N( A.columns() );
762 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
768 for( ; (j+8UL) <= N; j+=8UL )
770 const size_t ibegin( ( IsLower<MT1>::value )
771 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
773 const size_t iend( ( IsUpper<MT1>::value )
774 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
778 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
787 const IntrinsicType x1( x.load(i ) );
788 const IntrinsicType x2( x.load(i1) );
789 const IntrinsicType x3( x.load(i2) );
790 const IntrinsicType x4( x.load(i3) );
791 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
792 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
793 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
794 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
795 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
796 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
797 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
798 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
803 const IntrinsicType x1( x.load(i ) );
804 const IntrinsicType x2( x.load(i1) );
805 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
806 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
807 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
808 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
809 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
810 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
811 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
812 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
816 const IntrinsicType x1( x.load(i) );
817 y[j ] +=
sum( x1 * A.load(i,j ) );
818 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
819 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
820 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
821 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
822 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
823 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
824 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
827 for( ; remainder && i<iend; ++i ) {
828 y[j ] += x[i] * A(i,j );
829 y[j+1UL] += x[i] * A(i,j+1UL);
830 y[j+2UL] += x[i] * A(i,j+2UL);
831 y[j+3UL] += x[i] * A(i,j+3UL);
832 y[j+4UL] += x[i] * A(i,j+4UL);
833 y[j+5UL] += x[i] * A(i,j+5UL);
834 y[j+6UL] += x[i] * A(i,j+6UL);
835 y[j+7UL] += x[i] * A(i,j+7UL);
839 for( ; (j+4UL) <= N; j+=4UL )
841 const size_t ibegin( ( IsLower<MT1>::value )
842 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
844 const size_t iend( ( IsUpper<MT1>::value )
845 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
849 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
858 const IntrinsicType x1( x.load(i ) );
859 const IntrinsicType x2( x.load(i1) );
860 const IntrinsicType x3( x.load(i2) );
861 const IntrinsicType x4( x.load(i3) );
862 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
863 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
864 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
865 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
870 const IntrinsicType x1( x.load(i ) );
871 const IntrinsicType x2( x.load(i1) );
872 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
873 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
874 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
875 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
879 const IntrinsicType x1( x.load(i) );
880 y[j ] +=
sum( x1 * A.load(i,j ) );
881 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
882 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
883 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
886 for( ; remainder && i<iend; ++i ) {
887 y[j ] += x[i] * A(i,j );
888 y[j+1UL] += x[i] * A(i,j+1UL);
889 y[j+2UL] += x[i] * A(i,j+2UL);
890 y[j+3UL] += x[i] * A(i,j+3UL);
894 for( ; (j+2UL) <= N; j+=2UL )
896 const size_t ibegin( ( IsLower<MT1>::value )
897 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
899 const size_t iend( ( IsUpper<MT1>::value )
900 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
904 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
913 const IntrinsicType x1( x.load(i ) );
914 const IntrinsicType x2( x.load(i1) );
915 const IntrinsicType x3( x.load(i2) );
916 const IntrinsicType x4( x.load(i3) );
917 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
918 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
923 const IntrinsicType x1( x.load(i ) );
924 const IntrinsicType x2( x.load(i1) );
925 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
926 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
930 const IntrinsicType x1( x.load(i) );
931 y[j ] +=
sum( x1 * A.load(i,j ) );
932 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
935 for( ; remainder && i<iend; ++i ) {
936 y[j ] += x[i] * A(i,j );
937 y[j+1UL] += x[i] * A(i,j+1UL);
943 const size_t ibegin( ( IsLower<MT1>::value )
944 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
946 const size_t iend( ( IsUpper<MT1>::value )
947 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
951 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
960 const IntrinsicType x1( x.load(i ) );
961 const IntrinsicType x2( x.load(i1) );
962 const IntrinsicType x3( x.load(i2) );
963 const IntrinsicType x4( x.load(i3) );
964 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
969 const IntrinsicType x1( x.load(i ) );
970 const IntrinsicType x2( x.load(i1) );
971 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
975 const IntrinsicType x1( x.load(i) );
976 y[j] +=
sum( x1 * A.load(i,j) );
979 for( ; remainder && i<iend; ++i ) {
980 y[j] += x[i] * A(i,j);
// Overload of selectBlasAssignKernel whose return type is wrapped in
// DisableIf< UseBlasKernel<VT1,VT2,MT1> >; the file also contains an
// EnableIf-guarded overload with the same name and parameters.
// This overload makes no BLAS call itself: it forwards its arguments to
// selectLargeAssignKernel.
1001 template<
typename VT1
1004 static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1005 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1007 selectLargeAssignKernel( y, x, A );
// Overload of selectBlasAssignKernel whose return type is wrapped in
// EnableIf< UseBlasKernel<VT1,VT2,MT1> >. It calls trmv or gemv depending on
// whether MT1 is a triangular type.
1027 template<
typename VT1
1030 static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1031 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1035 if( IsTriangular<MT1>::value ) {
// trmv is given only y and A; the third argument is CblasLower when
// IsLower<MT1> holds and CblasUpper otherwise.
// NOTE(review): x does not appear in this call, so y is presumably loaded
// with x on a line not visible in this excerpt; confirm.
1037 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Presumably the non-triangular branch (the `else` is not visible here):
// gemv with the scalar arguments ET(1) and ET(0). ET is declared outside this
// excerpt. NOTE(review): the trailing ET(0) presumably means y is overwritten
// rather than accumulated into; verify against the gemv wrapper.
1040 gemv( y, x, A, ET(1), ET(0) );
// Assignment of the expression `rhs` to a sparse vector target `lhs`.
// The expression is first evaluated into a temporary of type ResultType and
// that temporary is then assigned to the target.
1060 template<
typename VT1 >
1061 friend inline void assign( SparseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// rhs is wrapped in serial() when constructing the temporary.
1071 const ResultType tmp(
serial( rhs ) );
// ~lhs is applied to the SparseVector reference before delegating to assign().
1072 assign( ~lhs, tmp );
// Addition assignment of the expression `rhs` to a dense vector target `lhs`.
// The operands of rhs are bound to local objects and the work is delegated to
// selectAddAssignKernel.
1090 template<
typename VT1 >
1091 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// Guard for a matrix operand with zero rows or zero columns.
// NOTE(review): the body of this branch is not visible in this excerpt.
1097 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Vector and matrix operands, each wrapped in serial(), are bound to objects
// of type LT and RT (both types are declared outside this excerpt).
1101 LT x(
serial( rhs.vec_ ) );
1102 RT A(
serial( rhs.mat_ ) );
// Dispatch to the kernel selection with the target and both operands.
1109 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Picks which kernel performs the addition assignment for target y, vector x
// and matrix A.
1125 template<
typename VT1
1128 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// The small kernel is selected when any of these holds:
//  - MT1 is a diagonal type,
//  - MT is a computation and evaluateMatrix is false,
//  - A.rows() * A.columns() is below TDVECTDMATMULT_THRESHOLD.
1130 if( ( IsDiagonal<MT1>::value ) ||
1131 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1132 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1133 selectSmallAddAssignKernel( y, x, A );
// NOTE(review): the `else` keyword is not visible in this excerpt; the BLAS
// kernel selection below is presumably the alternative branch.
1135 selectBlasAddAssignKernel( y, x, A );
// Default addition-assignment kernel: builds the product expression x * A and
// passes it to the addAssign member of the target y.
1154 template<
typename VT1
1157 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1159 y.addAssign( x * A );
// Overload of selectSmallAddAssignKernel whose return type is wrapped in
// DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; the file also contains
// an EnableIf-guarded overload with the same name and parameters.
// This overload only forwards its arguments to selectDefaultAddAssignKernel.
1178 template<
typename VT1
1181 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1182 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1184 selectDefaultAddAssignKernel( y, x, A );
1204 template<
typename VT1
1207 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1208 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1210 typedef IntrinsicTrait<ElementType> IT;
1212 const size_t M( A.rows() );
1213 const size_t N( A.columns() );
1215 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1219 for( ; (j+8UL) <= N; j+=8UL )
1221 const size_t ibegin( ( IsLower<MT1>::value )
1222 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1224 const size_t iend( ( IsUpper<MT1>::value )
1225 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1229 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1232 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1236 const IntrinsicType x1( x.load(i) );
1237 xmm1 = xmm1 + x1 * A.load(i,j );
1238 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1239 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1240 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1241 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1242 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1243 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1244 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1247 y[j ] +=
sum( xmm1 );
1248 y[j+1UL] +=
sum( xmm2 );
1249 y[j+2UL] +=
sum( xmm3 );
1250 y[j+3UL] +=
sum( xmm4 );
1251 y[j+4UL] +=
sum( xmm5 );
1252 y[j+5UL] +=
sum( xmm6 );
1253 y[j+6UL] +=
sum( xmm7 );
1254 y[j+7UL] +=
sum( xmm8 );
1256 for( ; remainder && i<iend; ++i ) {
1257 y[j ] += x[i] * A(i,j );
1258 y[j+1UL] += x[i] * A(i,j+1UL);
1259 y[j+2UL] += x[i] * A(i,j+2UL);
1260 y[j+3UL] += x[i] * A(i,j+3UL);
1261 y[j+4UL] += x[i] * A(i,j+4UL);
1262 y[j+5UL] += x[i] * A(i,j+5UL);
1263 y[j+6UL] += x[i] * A(i,j+6UL);
1264 y[j+7UL] += x[i] * A(i,j+7UL);
1268 for( ; (j+4UL) <= N; j+=4UL )
1270 const size_t ibegin( ( IsLower<MT1>::value )
1271 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1273 const size_t iend( ( IsUpper<MT1>::value )
1274 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1278 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1281 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1285 const IntrinsicType x1( x.load(i) );
1286 xmm1 = xmm1 + x1 * A.load(i,j );
1287 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1288 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1289 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1292 y[j ] +=
sum( xmm1 );
1293 y[j+1UL] +=
sum( xmm2 );
1294 y[j+2UL] +=
sum( xmm3 );
1295 y[j+3UL] +=
sum( xmm4 );
1297 for( ; remainder && i<iend; ++i ) {
1298 y[j ] += x[i] * A(i,j );
1299 y[j+1UL] += x[i] * A(i,j+1UL);
1300 y[j+2UL] += x[i] * A(i,j+2UL);
1301 y[j+3UL] += x[i] * A(i,j+3UL);
1305 for( ; (j+3UL) <= N; j+=3UL )
1307 const size_t ibegin( ( IsLower<MT1>::value )
1308 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1310 const size_t iend( ( IsUpper<MT1>::value )
1311 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1315 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1318 IntrinsicType xmm1, xmm2, xmm3;
1322 const IntrinsicType x1( x.load(i) );
1323 xmm1 = xmm1 + x1 * A.load(i,j );
1324 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1325 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1328 y[j ] +=
sum( xmm1 );
1329 y[j+1UL] +=
sum( xmm2 );
1330 y[j+2UL] +=
sum( xmm3 );
1332 for( ; remainder && i<iend; ++i ) {
1333 y[j ] += x[i] * A(i,j );
1334 y[j+1UL] += x[i] * A(i,j+1UL);
1335 y[j+2UL] += x[i] * A(i,j+2UL);
1339 for( ; (j+2UL) <= N; j+=2UL )
1341 const size_t ibegin( ( IsLower<MT1>::value )
1342 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1344 const size_t iend( ( IsUpper<MT1>::value )
1345 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1349 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1352 IntrinsicType xmm1, xmm2;
1356 const IntrinsicType x1( x.load(i) );
1357 xmm1 = xmm1 + x1 * A.load(i,j );
1358 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1361 y[j ] +=
sum( xmm1 );
1362 y[j+1UL] +=
sum( xmm2 );
1364 for( ; remainder && i<iend; ++i ) {
1365 y[j ] += x[i] * A(i,j );
1366 y[j+1UL] += x[i] * A(i,j+1UL);
1372 const size_t ibegin( ( IsLower<MT1>::value )
1373 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1375 const size_t iend( ( IsUpper<MT1>::value )
1376 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1380 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1387 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1390 y[j] +=
sum( xmm1 );
1392 for( ; remainder && i<iend; ++i ) {
1393 y[j] += x[i] * A(i,j);
// Overload of selectLargeAddAssignKernel whose return type is wrapped in
// DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; the file also contains
// an EnableIf-guarded overload with the same name and parameters.
// This overload only forwards its arguments to selectDefaultAddAssignKernel.
1414 template<
typename VT1
1417 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1418 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1420 selectDefaultAddAssignKernel( y, x, A );
1440 template<
typename VT1
1443 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1444 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1446 typedef IntrinsicTrait<ElementType> IT;
1448 const size_t M( A.rows() );
1449 const size_t N( A.columns() );
1451 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1455 for( ; (j+8UL) <= N; j+=8UL )
1457 const size_t ibegin( ( IsLower<MT1>::value )
1458 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1460 const size_t iend( ( IsUpper<MT1>::value )
1461 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1465 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1474 const IntrinsicType x1( x.load(i ) );
1475 const IntrinsicType x2( x.load(i1) );
1476 const IntrinsicType x3( x.load(i2) );
1477 const IntrinsicType x4( x.load(i3) );
1478 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1479 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1480 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1481 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1482 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1483 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1484 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1485 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
1490 const IntrinsicType x1( x.load(i ) );
1491 const IntrinsicType x2( x.load(i1) );
1492 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1493 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1494 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1495 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1496 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1497 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1498 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1499 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
1503 const IntrinsicType x1( x.load(i) );
1504 y[j ] +=
sum( x1 * A.load(i,j ) );
1505 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1506 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1507 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1508 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
1509 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
1510 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
1511 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
1514 for( ; remainder && i<iend; ++i ) {
1515 y[j ] += x[i] * A(i,j );
1516 y[j+1UL] += x[i] * A(i,j+1UL);
1517 y[j+2UL] += x[i] * A(i,j+2UL);
1518 y[j+3UL] += x[i] * A(i,j+3UL);
1519 y[j+4UL] += x[i] * A(i,j+4UL);
1520 y[j+5UL] += x[i] * A(i,j+5UL);
1521 y[j+6UL] += x[i] * A(i,j+6UL);
1522 y[j+7UL] += x[i] * A(i,j+7UL);
1526 for( ; (j+4UL) <= N; j+=4UL )
1528 const size_t ibegin( ( IsLower<MT1>::value )
1529 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1531 const size_t iend( ( IsUpper<MT1>::value )
1532 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1536 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1545 const IntrinsicType x1( x.load(i ) );
1546 const IntrinsicType x2( x.load(i1) );
1547 const IntrinsicType x3( x.load(i2) );
1548 const IntrinsicType x4( x.load(i3) );
1549 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1550 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1551 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1552 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1557 const IntrinsicType x1( x.load(i ) );
1558 const IntrinsicType x2( x.load(i1) );
1559 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1560 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1561 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1562 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1566 const IntrinsicType x1( x.load(i) );
1567 y[j ] +=
sum( x1 * A.load(i,j ) );
1568 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1569 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1570 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1573 for( ; remainder && i<iend; ++i ) {
1574 y[j ] += x[i] * A(i,j );
1575 y[j+1UL] += x[i] * A(i,j+1UL);
1576 y[j+2UL] += x[i] * A(i,j+2UL);
1577 y[j+3UL] += x[i] * A(i,j+3UL);
1581 for( ; (j+2UL) <= N; j+=2UL )
1583 const size_t ibegin( ( IsLower<MT1>::value )
1584 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1586 const size_t iend( ( IsUpper<MT1>::value )
1587 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1591 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1600 const IntrinsicType x1( x.load(i ) );
1601 const IntrinsicType x2( x.load(i1) );
1602 const IntrinsicType x3( x.load(i2) );
1603 const IntrinsicType x4( x.load(i3) );
1604 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1605 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1610 const IntrinsicType x1( x.load(i ) );
1611 const IntrinsicType x2( x.load(i1) );
1612 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1613 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1617 const IntrinsicType x1( x.load(i) );
1618 y[j ] +=
sum( x1 * A.load(i,j ) );
1619 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1622 for( ; remainder && i<iend; ++i ) {
1623 y[j ] += x[i] * A(i,j );
1624 y[j+1UL] += x[i] * A(i,j+1UL);
1630 const size_t ibegin( ( IsLower<MT1>::value )
1631 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1633 const size_t iend( ( IsUpper<MT1>::value )
1634 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1638 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1647 const IntrinsicType x1( x.load(i ) );
1648 const IntrinsicType x2( x.load(i1) );
1649 const IntrinsicType x3( x.load(i2) );
1650 const IntrinsicType x4( x.load(i3) );
1651 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
1656 const IntrinsicType x1( x.load(i ) );
1657 const IntrinsicType x2( x.load(i1) );
1658 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
1662 const IntrinsicType x1( x.load(i) );
1663 y[j] +=
sum( x1 * A.load(i,j) );
1666 for( ; remainder && i<iend; ++i ) {
1667 y[j] += x[i] * A(i,j);
// Overload of selectBlasAddAssignKernel whose return type is wrapped in
// DisableIf< UseBlasKernel<VT1,VT2,MT1> >; the file also contains an
// EnableIf-guarded overload with the same name and parameters.
// This overload makes no BLAS call itself: it forwards its arguments to
// selectLargeAddAssignKernel.
1688 template<
typename VT1
1691 static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1692 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1694 selectLargeAddAssignKernel( y, x, A );
// Overload of selectBlasAddAssignKernel whose return type is wrapped in
// EnableIf< UseBlasKernel<VT1,VT2,MT1> >. It calls trmv or gemv depending on
// whether MT1 is a triangular type.
1714 template<
typename VT1
1717 static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
1718 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1722 if( IsTriangular<MT1>::value ) {
// trmv operates on `tmp` rather than on y; the third argument is CblasLower
// when IsLower<MT1> holds and CblasUpper otherwise. The result is then added
// to y via addAssign.
// NOTE(review): the declaration and initialisation of `tmp` are not visible in
// this excerpt; presumably it is a temporary holding x. Confirm.
1724 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1725 addAssign( y, tmp );
// Presumably the non-triangular branch (the `else` is not visible here):
// gemv with both scalar arguments set to ET(1). ET is declared outside this
// excerpt. NOTE(review): the trailing ET(1) presumably keeps the existing
// content of y so that the product is accumulated; verify against the gemv
// wrapper.
1728 gemv( y, x, A, ET(1), ET(1) );
// Subtraction assignment of the expression `rhs` to a dense vector target
// `lhs`. The operands of rhs are bound to local objects and the work is
// delegated to selectSubAssignKernel.
1752 template<
typename VT1 >
1753 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
// Guard for a matrix operand with zero rows or zero columns.
// NOTE(review): the body of this branch is not visible in this excerpt.
1759 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Vector and matrix operands, each wrapped in serial(), are bound to objects
// of type LT and RT (both types are declared outside this excerpt).
1763 LT x(
serial( rhs.vec_ ) );
1764 RT A(
serial( rhs.mat_ ) );
// Dispatch to the kernel selection with the target and both operands.
1771 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Picks which kernel performs the subtraction assignment for target y,
// vector x and matrix A.
1787 template<
typename VT1
1790 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// The small kernel is selected when any of these holds:
//  - MT1 is a diagonal type,
//  - MT is a computation and evaluateMatrix is false,
//  - A.rows() * A.columns() is below TDVECTDMATMULT_THRESHOLD.
1792 if( ( IsDiagonal<MT1>::value ) ||
1793 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1794 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
1795 selectSmallSubAssignKernel( y, x, A );
// NOTE(review): the `else` keyword is not visible in this excerpt; the BLAS
// kernel selection below is presumably the alternative branch.
1797 selectBlasSubAssignKernel( y, x, A );
// Default subtraction-assignment kernel: builds the product expression x * A
// and passes it to the subAssign member of the target y.
1816 template<
typename VT1
1819 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1821 y.subAssign( x * A );
// Overload of selectSmallSubAssignKernel whose return type is wrapped in
// DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; the file also contains
// an EnableIf-guarded overload with the same name and parameters.
// This overload only forwards its arguments to selectDefaultSubAssignKernel.
1840 template<
typename VT1
1843 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1844 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1846 selectDefaultSubAssignKernel( y, x, A );
1866 template<
typename VT1
1869 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1870 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1872 typedef IntrinsicTrait<ElementType> IT;
1874 const size_t M( A.rows() );
1875 const size_t N( A.columns() );
1877 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
1881 for( ; (j+8UL) <= N; j+=8UL )
1883 const size_t ibegin( ( IsLower<MT1>::value )
1884 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1886 const size_t iend( ( IsUpper<MT1>::value )
1887 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
1891 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1894 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1898 const IntrinsicType x1( x.load(i) );
1899 xmm1 = xmm1 + x1 * A.load(i,j );
1900 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1901 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1902 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1903 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1904 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1905 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1906 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
1909 y[j ] -=
sum( xmm1 );
1910 y[j+1UL] -=
sum( xmm2 );
1911 y[j+2UL] -=
sum( xmm3 );
1912 y[j+3UL] -=
sum( xmm4 );
1913 y[j+4UL] -=
sum( xmm5 );
1914 y[j+5UL] -=
sum( xmm6 );
1915 y[j+6UL] -=
sum( xmm7 );
1916 y[j+7UL] -=
sum( xmm8 );
1918 for( ; remainder && i<iend; ++i ) {
1919 y[j ] -= x[i] * A(i,j );
1920 y[j+1UL] -= x[i] * A(i,j+1UL);
1921 y[j+2UL] -= x[i] * A(i,j+2UL);
1922 y[j+3UL] -= x[i] * A(i,j+3UL);
1923 y[j+4UL] -= x[i] * A(i,j+4UL);
1924 y[j+5UL] -= x[i] * A(i,j+5UL);
1925 y[j+6UL] -= x[i] * A(i,j+6UL);
1926 y[j+7UL] -= x[i] * A(i,j+7UL);
1930 for( ; (j+4UL) <= N; j+=4UL )
1932 const size_t ibegin( ( IsLower<MT1>::value )
1933 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1935 const size_t iend( ( IsUpper<MT1>::value )
1936 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1940 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1943 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1947 const IntrinsicType x1( x.load(i) );
1948 xmm1 = xmm1 + x1 * A.load(i,j );
1949 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1950 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1951 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1954 y[j ] -=
sum( xmm1 );
1955 y[j+1UL] -=
sum( xmm2 );
1956 y[j+2UL] -=
sum( xmm3 );
1957 y[j+3UL] -=
sum( xmm4 );
1959 for( ; remainder && i<iend; ++i ) {
1960 y[j ] -= x[i] * A(i,j );
1961 y[j+1UL] -= x[i] * A(i,j+1UL);
1962 y[j+2UL] -= x[i] * A(i,j+2UL);
1963 y[j+3UL] -= x[i] * A(i,j+3UL);
1967 for( ; (j+3UL) <= N; j+=3UL )
1969 const size_t ibegin( ( IsLower<MT1>::value )
1970 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1972 const size_t iend( ( IsUpper<MT1>::value )
1973 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1977 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
1980 IntrinsicType xmm1, xmm2, xmm3;
1984 const IntrinsicType x1( x.load(i) );
1985 xmm1 = xmm1 + x1 * A.load(i,j );
1986 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1987 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1990 y[j ] -=
sum( xmm1 );
1991 y[j+1UL] -=
sum( xmm2 );
1992 y[j+2UL] -=
sum( xmm3 );
1994 for( ; remainder && i<iend; ++i ) {
1995 y[j ] -= x[i] * A(i,j );
1996 y[j+1UL] -= x[i] * A(i,j+1UL);
1997 y[j+2UL] -= x[i] * A(i,j+2UL);
2001 for( ; (j+2UL) <= N; j+=2UL )
2003 const size_t ibegin( ( IsLower<MT1>::value )
2004 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2006 const size_t iend( ( IsUpper<MT1>::value )
2007 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2011 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
2014 IntrinsicType xmm1, xmm2;
2018 const IntrinsicType x1( x.load(i) );
2019 xmm1 = xmm1 + x1 * A.load(i,j );
2020 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2023 y[j ] -=
sum( xmm1 );
2024 y[j+1UL] -=
sum( xmm2 );
2026 for( ; remainder && i<iend; ++i ) {
2027 y[j ] -= x[i] * A(i,j );
2028 y[j+1UL] -= x[i] * A(i,j+1UL);
2034 const size_t ibegin( ( IsLower<MT1>::value )
2035 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2037 const size_t iend( ( IsUpper<MT1>::value )
2038 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2042 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
2049 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2052 y[j] -=
sum( xmm1 );
2054 for( ; remainder && i<iend; ++i ) {
2055 y[j] -= x[i] * A(i,j);
// Overload of selectLargeSubAssignKernel whose return type is wrapped in
// DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >; the file also contains
// an EnableIf-guarded overload with the same name and parameters.
// This overload only forwards its arguments to selectDefaultSubAssignKernel.
2076 template<
typename VT1
2079 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
2080 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2082 selectDefaultSubAssignKernel( y, x, A );
// Vectorized large subtraction-assignment kernel (y -= x * A), enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds. Columns of A are processed in blocks of
// 8, 4, 2 and finally single columns; within a block, rows are consumed with intrinsic
// loads in groups of four, two and one vectors, followed by a scalar remainder loop.
// NOTE(review): the declarations of i, j, i1, i2, i3 and the headers of the vectorized
// row loops are not present in this extract; comments below describe only what is visible.
2102 template<
typename VT1
2105 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
2106 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2108 typedef IntrinsicTrait<ElementType> IT;
2110 const size_t M( A.rows() );
2111 const size_t N( A.columns() );
// A scalar remainder loop is needed whenever the vector or the matrix operand is not padded.
2113 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
2117 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the first row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), i.e. rounded down to a multiple of IT::size (assuming IT::size is a
// power of two). The alternative branch of the conditional is not visible here.
2119 const size_t ibegin( ( IsLower<MT1>::value )
2120 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the row range ends at the last column of the block (one less if
// strictly upper). The alternative branch of the conditional is not visible here.
2122 const size_t iend( ( IsUpper<MT1>::value )
2123 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend masked down to the intrinsic width if a remainder
// loop is required, otherwise iend itself.
2127 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step (i1, i2, i3 presumably i+IT::size, i+2*IT::size,
// i+3*IT::size -- confirm against the missing lines).
2136 const IntrinsicType x1( x.load(i ) );
2137 const IntrinsicType x2( x.load(i1) );
2138 const IntrinsicType x3( x.load(i2) );
2139 const IntrinsicType x4( x.load(i3) );
2140 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2141 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2142 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2143 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2144 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2145 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2146 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2147 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two intrinsic vectors of x per step.
2152 const IntrinsicType x1( x.load(i ) );
2153 const IntrinsicType x2( x.load(i1) );
2154 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2155 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2156 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2157 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2158 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2159 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2160 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2161 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One intrinsic vector of x per step.
2165 const IntrinsicType x1( x.load(i) );
2166 y[j ] -=
sum( x1 * A.load(i,j ) );
2167 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2168 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2169 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2170 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) );
2171 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) );
2172 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) );
2173 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) );
// Scalar remainder: element-wise processing of the rows up to iend (only if required).
2176 for( ; remainder && i<iend; ++i ) {
2177 y[j ] -= x[i] * A(i,j );
2178 y[j+1UL] -= x[i] * A(i,j+1UL);
2179 y[j+2UL] -= x[i] * A(i,j+2UL);
2180 y[j+3UL] -= x[i] * A(i,j+3UL);
2181 y[j+4UL] -= x[i] * A(i,j+4UL);
2182 y[j+5UL] -= x[i] * A(i,j+5UL);
2183 y[j+6UL] -= x[i] * A(i,j+6UL);
2184 y[j+7UL] -= x[i] * A(i,j+7UL);
// --- Blocks of four columns (same scheme as above) ---
2188 for( ; (j+4UL) <= N; j+=4UL )
2190 const size_t ibegin( ( IsLower<MT1>::value )
2191 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2193 const size_t iend( ( IsUpper<MT1>::value )
2194 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
2198 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
2207 const IntrinsicType x1( x.load(i ) );
2208 const IntrinsicType x2( x.load(i1) );
2209 const IntrinsicType x3( x.load(i2) );
2210 const IntrinsicType x4( x.load(i3) );
2211 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2212 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2213 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2214 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
// Two intrinsic vectors of x per step.
2219 const IntrinsicType x1( x.load(i ) );
2220 const IntrinsicType x2( x.load(i1) );
2221 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2222 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2223 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2224 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
// One intrinsic vector of x per step.
2228 const IntrinsicType x1( x.load(i) );
2229 y[j ] -=
sum( x1 * A.load(i,j ) );
2230 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2231 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2232 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
// Scalar remainder.
2235 for( ; remainder && i<iend; ++i ) {
2236 y[j ] -= x[i] * A(i,j );
2237 y[j+1UL] -= x[i] * A(i,j+1UL);
2238 y[j+2UL] -= x[i] * A(i,j+2UL);
2239 y[j+3UL] -= x[i] * A(i,j+3UL);
// --- Blocks of two columns ---
2243 for( ; (j+2UL) <= N; j+=2UL )
2245 const size_t ibegin( ( IsLower<MT1>::value )
2246 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2248 const size_t iend( ( IsUpper<MT1>::value )
2249 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2253 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
2262 const IntrinsicType x1( x.load(i ) );
2263 const IntrinsicType x2( x.load(i1) );
2264 const IntrinsicType x3( x.load(i2) );
2265 const IntrinsicType x4( x.load(i3) );
2266 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2267 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
// Two intrinsic vectors of x per step.
2272 const IntrinsicType x1( x.load(i ) );
2273 const IntrinsicType x2( x.load(i1) );
2274 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2275 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
// One intrinsic vector of x per step.
2279 const IntrinsicType x1( x.load(i) );
2280 y[j ] -=
sum( x1 * A.load(i,j ) );
2281 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
// Scalar remainder.
2284 for( ; remainder && i<iend; ++i ) {
2285 y[j ] -= x[i] * A(i,j );
2286 y[j+1UL] -= x[i] * A(i,j+1UL);
// --- Single column (the guarding statement is not visible in this extract) ---
2292 const size_t ibegin( ( IsLower<MT1>::value )
2293 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2295 const size_t iend( ( IsUpper<MT1>::value )
2296 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2300 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
2309 const IntrinsicType x1( x.load(i ) );
2310 const IntrinsicType x2( x.load(i1) );
2311 const IntrinsicType x3( x.load(i2) );
2312 const IntrinsicType x4( x.load(i3) );
2313 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
// Two intrinsic vectors of x per step.
2318 const IntrinsicType x1( x.load(i ) );
2319 const IntrinsicType x2( x.load(i1) );
2320 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
// One intrinsic vector of x per step.
2324 const IntrinsicType x1( x.load(i) );
2325 y[j] -=
sum( x1 * A.load(i,j) );
// Scalar remainder.
2328 for( ; remainder && i<iend; ++i ) {
2329 y[j] -= x[i] * A(i,j);
// Fallback overload of the BLAS-based subtraction-assignment kernel. It is enabled when
// UseBlasKernel<VT1,VT2,MT1> does NOT hold (DisableIf) and forwards to the large
// subtraction-assignment kernel.
2350 template<
typename VT1
2353 static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
2354 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2356 selectLargeSubAssignKernel( y, x, A );
// BLAS-based subtraction-assignment kernel, enabled when UseBlasKernel<VT1,VT2,MT1> holds.
// NOTE(review): the declarations of ET and tmp and the else-branch structure are not
// present in this extract.
2376 template<
typename VT1
2379 static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1> >::Type
2380 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Triangular matrices: apply trmv to the temporary tmp (presumably a copy of x -- confirm),
// selecting the lower or upper variant by the matrix type, then subtract tmp from y.
2384 if( IsTriangular<MT1>::value ) {
2386 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
2387 subAssign( y, tmp );
// General matrices: gemv with the factors ET(-1) and ET(1) (presumably the product scaling
// and the scaling of the existing y, i.e. y = y - x*A -- confirm against the gemv wrapper).
2390 gemv( y, x, A, ET(-1), ET(1) );
// Multiplication assignment of a transpose dense vector-transpose dense matrix
// multiplication to a dense (row) vector: the expression is first evaluated serially into
// a temporary of type ResultType, which is then multiplication-assigned to lhs.
2414 template<
typename VT1 >
2415 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const TDVecTDMatMultExpr& rhs )
2425 const ResultType tmp(
serial( rhs ) );
2426 multAssign( ~lhs, tmp );
2450 template<
typename VT1 >
2451 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2458 if( rhs.mat_.rows() == 0UL ) {
2462 else if( rhs.mat_.columns() == 0UL ) {
2494 template<
typename VT1 >
2495 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2506 const ResultType tmp( rhs );
2527 template<
typename VT1 >
2528 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2535 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2571 template<
typename VT1 >
2572 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2579 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2615 template<
typename VT1 >
2616 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2627 const ResultType tmp( rhs );
2666 template<
typename VT
2670 :
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
2671 ,
private VecScalarMultExpr
2672 ,
private Computation
// Shorthand for the scaled vector/matrix multiplication expression type.
2676 typedef TDVecTDMatMultExpr<VT,MT> VMM;
// Compile-time flag: the vector operand is itself a computation or requires evaluation.
2688 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Compile-time flag: the matrix operand requires evaluation, or it is a computation whose
// element type equals the vector element type and is BLAS compatible.
2693 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2694 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Helper trait: value is nonzero when the target type T1 is SMP assignable and at least
// one of the two operands has to be evaluated (evaluateVector / evaluateMatrix).
2702 template<
typename T1 >
2703 struct UseSMPAssign {
2704 enum { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
// Helper trait selecting the BLAS kernel for the scaled product. T1 is the target type,
// T2/T3 the two operand types and T4 the scalar type (as used by the kernels below).
// NOTE(review): the first line(s) of the enum definition are not present in this extract.
2712 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2713 struct UseBlasKernel {
// Direct data access is required for all three operands.
2715 HasMutableDataAccess<T1>::value &&
2716 HasConstDataAccess<T2>::value &&
2717 HasConstDataAccess<T3>::value &&
// Diagonal matrices are excluded.
2718 !IsDiagonal<T3>::value &&
2719 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// All element types must be BLAS compatible and identical.
2720 IsBlasCompatible<typename T1::ElementType>::value &&
2721 IsBlasCompatible<typename T2::ElementType>::value &&
2722 IsBlasCompatible<typename T3::ElementType>::value &&
2723 IsSame< typename T1::ElementType, typename T2::ElementType >::value &&
2724 IsSame< typename T1::ElementType, typename T3::ElementType >::value &&
// A complex scalar combined with built-in element types rules out the BLAS kernel.
2725 !( IsBuiltin<typename T1::ElementType>::value && IsComplex<T4>::value ) };
// Helper trait selecting the vectorized default kernels for the scaled product. T1 is the
// target type, T2/T3 the two operand types and T4 the scalar type.
// NOTE(review): the first line(s) of the enum definition are not present in this extract.
2734 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2735 struct UseVectorizedDefaultKernel {
// Diagonal matrices are excluded; all operands must be vectorizable.
2737 !IsDiagonal<T3>::value &&
2738 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Target, operands and scalar must all share the same element type.
2739 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2740 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2741 IsSame<typename T1::ElementType,T4>::value &&
// The element type must support intrinsic addition and multiplication.
2742 IntrinsicTrait<typename T1::ElementType>::addition &&
2743 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this DVecScalarMultExpr instance.
2749 typedef DVecScalarMultExpr<VMM,ST,true>
This;
// Result type of the scaled expression, derived via MultTrait from RES and the scalar type.
2750 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType.
2753 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Left-hand side operand: the vector/matrix multiplication expression.
2758 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
// Type used for the vector operand inside the kernels: VRT if the vector has to be
// evaluated (evaluateVector), otherwise VCT.
2764 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
// Type used for the matrix operand inside the kernels: MRT if the matrix has to be
// evaluated (evaluateMatrix), otherwise MCT.
2767 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
// Compile-time flag for vectorization: non-diagonal matrix, vectorizable operands,
// identical element and scalar types, and intrinsic addition/multiplication support.
2772 enum { vectorizable = !IsDiagonal<MT>::value &&
2773 VT::vectorizable && MT::vectorizable &&
2774 IsSame<VET,MET>::value &&
2775 IsSame<VET,ST>::value &&
2776 IntrinsicTrait<VET>::addition &&
2777 IntrinsicTrait<VET>::multiplication };
// Compile-time flag for SMP assignment: neither operand needs evaluation and both are
// SMP assignable.
2780 enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
2781 !evaluateMatrix && MT::smpAssignable };
// Constructor taking the vector/matrix multiplication expression and the scalar factor.
// NOTE(review): the initializer list and body are not present in this extract.
2790 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Subscript operator: returns the element of the wrapped multiplication expression at
// the given index, multiplied by the scalar. No bounds check is performed here.
2802 inline ReturnType
operator[](
size_t index )
const {
2804 return vector_[index] * scalar_;
// Checked element access: validates index against the size of the wrapped expression
// before delegating to the subscript operator.
// NOTE(review): the body of the failing branch is not present in this extract.
2815 inline ReturnType
at(
size_t index )
const {
2816 if( index >= vector_.size() ) {
2819 return (*
this)[index];
// Returns the size of the expression, i.e. the size of the wrapped multiplication expression.
2828 inline size_t size()
const {
2829 return vector_.size();
// Returns whether the expression can alias with the given address; the query is
// delegated to the wrapped multiplication expression.
2859 template<
typename T >
2860 inline bool canAlias(
const T* alias )
const {
2861 return vector_.canAlias( alias );
// Returns whether the expression is aliased with the given address; the query is
// delegated to the wrapped multiplication expression.
2871 template<
typename T >
2872 inline bool isAliased(
const T* alias )
const {
2873 return vector_.isAliased( alias );
2883 return vector_.isAligned();
2893 typename VMM::RightOperand A( vector_.rightOperand() );
2895 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2896 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) ) &&
2897 (
size() > SMP_TDVECTDMATMULT_THRESHOLD );
// Left-hand side operand: the vector/matrix multiplication expression being scaled.
2903 LeftOperand vector_;
// Right-hand side operand: the scalar factor applied to the expression.
2904 RightOperand scalar_;
// Assignment of the scaled vector/matrix multiplication to a dense vector.
// NOTE(review): the bodies of the two empty-matrix branches and the definitions of x and A
// (presumably the evaluated operands of type LT / RT) are not present in this extract.
2919 template<
typename VT1
2921 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Extract the operands of the wrapped multiplication expression.
2927 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2928 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special cases: matrix without rows, or matrix without columns.
2930 if( right.rows() == 0UL ) {
2934 else if( right.columns() == 0UL ) {
// General case: dispatch to the assignment kernel selection, passing the scalar along.
2946 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Selects the kernel for the scaled assignment y = x * A * scalar. The small kernel is
// chosen for diagonal matrices, for matrix computations that are not evaluated, and for
// matrices with fewer than TDVECTDMATMULT_THRESHOLD elements; the BLAS kernel selection
// is called for the remaining cases (the separating `else` is not present in this extract).
2961 template<
typename VT1
2965 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2967 if( ( IsDiagonal<MT1>::value ) ||
2968 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2969 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
2970 selectSmallAssignKernel( y, x, A, scalar );
2972 selectBlasAssignKernel( y, x, A, scalar );
// Default kernel for the scaled assignment: assigns the expression x * A * scalar to y
// through y's assign() member.
2990 template<
typename VT1
2994 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2996 y.assign( x * A * scalar );
// Fallback overload of the small scaled assignment kernel. It is enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf) and forwards to
// the default assignment kernel.
3014 template<
typename VT1
3018 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3019 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3021 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized small kernel for the scaled assignment y = x * A * scalar, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds. Columns of A are processed in blocks
// of 8, 4, 3, 2 and finally single columns. Per block, one intrinsic accumulator per
// column collects x1 * A.load(i,column); the horizontal sum times scalar is stored to y,
// and a scalar remainder loop adds the rows that are not covered by intrinsic loads.
// NOTE(review): the declarations of i and j, the headers of the vectorized row loops and
// the initialization of the xmm accumulators are not present in this extract.
3040 template<
typename VT1
3044 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3045 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3047 typedef IntrinsicTrait<ElementType> IT;
3049 const size_t M( A.rows() );
3050 const size_t N( A.columns() );
// A scalar remainder loop is needed whenever the vector or the matrix operand is not padded.
3052 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
3056 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the first row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), i.e. rounded down to a multiple of IT::size (assuming IT::size is a
// power of two). The alternative branch of the conditional is not visible here.
3058 const size_t ibegin( ( IsLower<MT1>::value )
3059 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the row range ends at the last column of the block (one less if
// strictly upper). The alternative branch of the conditional is not visible here.
3061 const size_t iend( ( IsUpper<MT1>::value )
3062 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend masked down to the intrinsic width if a remainder
// loop is required, otherwise iend itself.
3066 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// One accumulator per column of the block (declared without explicit initializer;
// presumably default-initialized to zero by IntrinsicType -- confirm).
3069 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3073 const IntrinsicType x1( x.load(i) );
3074 xmm1 = xmm1 + x1 * A.load(i,j );
3075 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3076 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3077 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3078 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3079 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3080 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3081 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, scaled and stored (plain assignment).
3084 y[j ] =
sum( xmm1 ) * scalar;
3085 y[j+1UL] =
sum( xmm2 ) * scalar;
3086 y[j+2UL] =
sum( xmm3 ) * scalar;
3087 y[j+3UL] =
sum( xmm4 ) * scalar;
3088 y[j+4UL] =
sum( xmm5 ) * scalar;
3089 y[j+5UL] =
sum( xmm6 ) * scalar;
3090 y[j+6UL] =
sum( xmm7 ) * scalar;
3091 y[j+7UL] =
sum( xmm8 ) * scalar;
// Scalar remainder: adds the scaled contributions of the rows up to iend.
3093 for( ; remainder && i<iend; ++i ) {
3094 y[j ] += x[i] * A(i,j ) * scalar;
3095 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3096 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3097 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3098 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3099 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3100 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3101 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// --- Blocks of four columns (same scheme as above) ---
3105 for( ; (j+4UL) <= N; j+=4UL )
3107 const size_t ibegin( ( IsLower<MT1>::value )
3108 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3110 const size_t iend( ( IsUpper<MT1>::value )
3111 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3115 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3118 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3122 const IntrinsicType x1( x.load(i) );
3123 xmm1 = xmm1 + x1 * A.load(i,j );
3124 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3125 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3126 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3129 y[j ] =
sum( xmm1 ) * scalar;
3130 y[j+1UL] =
sum( xmm2 ) * scalar;
3131 y[j+2UL] =
sum( xmm3 ) * scalar;
3132 y[j+3UL] =
sum( xmm4 ) * scalar;
// Scalar remainder.
3134 for( ; remainder && i<iend; ++i ) {
3135 y[j ] += x[i] * A(i,j ) * scalar;
3136 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3137 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3138 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
// --- Blocks of three columns ---
3142 for( ; (j+3UL) <= N; j+=3UL )
3144 const size_t ibegin( ( IsLower<MT1>::value )
3145 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3147 const size_t iend( ( IsUpper<MT1>::value )
3148 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3152 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3155 IntrinsicType xmm1, xmm2, xmm3;
3159 const IntrinsicType x1( x.load(i) );
3160 xmm1 = xmm1 + x1 * A.load(i,j );
3161 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3162 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3165 y[j ] =
sum( xmm1 ) * scalar;
3166 y[j+1UL] =
sum( xmm2 ) * scalar;
3167 y[j+2UL] =
sum( xmm3 ) * scalar;
// Scalar remainder.
3169 for( ; remainder && i<iend; ++i ) {
3170 y[j ] += x[i] * A(i,j ) * scalar;
3171 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3172 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
// --- Blocks of two columns ---
3176 for( ; (j+2UL) <= N; j+=2UL )
3178 const size_t ibegin( ( IsLower<MT1>::value )
3179 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3181 const size_t iend( ( IsUpper<MT1>::value )
3182 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3186 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3189 IntrinsicType xmm1, xmm2;
3193 const IntrinsicType x1( x.load(i) );
3194 xmm1 = xmm1 + x1 * A.load(i,j );
3195 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3198 y[j ] =
sum( xmm1 ) * scalar;
3199 y[j+1UL] =
sum( xmm2 ) * scalar;
// Scalar remainder.
3201 for( ; remainder && i<iend; ++i ) {
3202 y[j ] += x[i] * A(i,j ) * scalar;
3203 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
// --- Single column (the guarding statement and the declaration of xmm1 are not visible) ---
3209 const size_t ibegin( ( IsLower<MT1>::value )
3210 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3212 const size_t iend( ( IsUpper<MT1>::value )
3213 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3217 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3224 xmm1 = xmm1 + A.load(i,j) * x.load(i);
3227 y[j] =
sum( xmm1 ) * scalar;
// Scalar remainder.
3229 for( ; remainder && i<iend; ++i ) {
3230 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of the large scaled assignment kernel. It is enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf) and forwards to
// the default assignment kernel.
3250 template<
typename VT1
3254 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3255 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3257 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized large kernel for the scaled assignment, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds. Columns of A are processed in blocks
// of 8, 4, 2 and finally single columns; rows are consumed with intrinsic loads in groups
// of four, two and one vectors, followed by a scalar remainder loop.
// NOTE(review): all visible statements accumulate into y with `+=` and none of them uses
// `scalar`. The initialization of y and the scaling are presumably done in lines that are
// not present in this extract (e.g. the gaps after original lines 3288, 3361, 3425 and
// 3477) -- confirm against the complete file. The declarations of i, j, i1, i2, i3 and the
// headers of the vectorized row loops are likewise not visible.
3276 template<
typename VT1
3280 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3281 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3283 typedef IntrinsicTrait<ElementType> IT;
3285 const size_t M( A.rows() );
3286 const size_t N( A.columns() );
// A scalar remainder loop is needed whenever the vector or the matrix operand is not padded.
3288 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
3294 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the first row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), i.e. rounded down to a multiple of IT::size (assuming IT::size is a
// power of two). The alternative branch of the conditional is not visible here.
3296 const size_t ibegin( ( IsLower<MT1>::value )
3297 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the row range ends at the last column of the block (one less if
// strictly upper). The alternative branch of the conditional is not visible here.
3299 const size_t iend( ( IsUpper<MT1>::value )
3300 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend masked down to the intrinsic width if a remainder
// loop is required, otherwise iend itself.
3304 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
3313 const IntrinsicType x1( x.load(i ) );
3314 const IntrinsicType x2( x.load(i1) );
3315 const IntrinsicType x3( x.load(i2) );
3316 const IntrinsicType x4( x.load(i3) );
3317 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3318 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3319 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3320 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3321 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3322 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3323 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3324 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Two intrinsic vectors of x per step.
3329 const IntrinsicType x1( x.load(i ) );
3330 const IntrinsicType x2( x.load(i1) );
3331 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3332 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3333 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3334 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3335 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3336 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3337 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3338 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// One intrinsic vector of x per step.
3342 const IntrinsicType x1( x.load(i) );
3343 y[j ] +=
sum( x1 * A.load(i,j ) );
3344 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3345 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3346 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3347 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
3348 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
3349 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
3350 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// Scalar remainder: element-wise processing of the rows up to iend (only if required).
3353 for( ; remainder && i<iend; ++i ) {
3354 y[j ] += x[i] * A(i,j );
3355 y[j+1UL] += x[i] * A(i,j+1UL);
3356 y[j+2UL] += x[i] * A(i,j+2UL);
3357 y[j+3UL] += x[i] * A(i,j+3UL);
3358 y[j+4UL] += x[i] * A(i,j+4UL);
3359 y[j+5UL] += x[i] * A(i,j+5UL);
3360 y[j+6UL] += x[i] * A(i,j+6UL);
3361 y[j+7UL] += x[i] * A(i,j+7UL);
// --- Blocks of four columns (same scheme as above) ---
3374 for( ; (j+4UL) <= N; j+=4UL )
3376 const size_t ibegin( ( IsLower<MT1>::value )
3377 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3379 const size_t iend( ( IsUpper<MT1>::value )
3380 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3384 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
3393 const IntrinsicType x1( x.load(i ) );
3394 const IntrinsicType x2( x.load(i1) );
3395 const IntrinsicType x3( x.load(i2) );
3396 const IntrinsicType x4( x.load(i3) );
3397 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3398 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3399 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3400 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
// Two intrinsic vectors of x per step.
3405 const IntrinsicType x1( x.load(i ) );
3406 const IntrinsicType x2( x.load(i1) );
3407 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3408 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3409 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3410 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
// One intrinsic vector of x per step.
3414 const IntrinsicType x1( x.load(i) );
3415 y[j ] +=
sum( x1 * A.load(i,j ) );
3416 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3417 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3418 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
// Scalar remainder.
3421 for( ; remainder && i<iend; ++i ) {
3422 y[j ] += x[i] * A(i,j );
3423 y[j+1UL] += x[i] * A(i,j+1UL);
3424 y[j+2UL] += x[i] * A(i,j+2UL);
3425 y[j+3UL] += x[i] * A(i,j+3UL);
// --- Blocks of two columns ---
3434 for( ; (j+2UL) <= N; j+=2UL )
3436 const size_t ibegin( ( IsLower<MT1>::value )
3437 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3439 const size_t iend( ( IsUpper<MT1>::value )
3440 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3444 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
3453 const IntrinsicType x1( x.load(i ) );
3454 const IntrinsicType x2( x.load(i1) );
3455 const IntrinsicType x3( x.load(i2) );
3456 const IntrinsicType x4( x.load(i3) );
3457 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3458 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
// Two intrinsic vectors of x per step.
3463 const IntrinsicType x1( x.load(i ) );
3464 const IntrinsicType x2( x.load(i1) );
3465 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3466 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
// One intrinsic vector of x per step.
3470 const IntrinsicType x1( x.load(i) );
3471 y[j ] +=
sum( x1 * A.load(i,j ) );
3472 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
// Scalar remainder.
3475 for( ; remainder && i<iend; ++i ) {
3476 y[j ] += x[i] * A(i,j );
3477 y[j+1UL] += x[i] * A(i,j+1UL);
// --- Single column (the guarding statement is not visible in this extract) ---
3486 const size_t ibegin( ( IsLower<MT1>::value )
3487 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3489 const size_t iend( ( IsUpper<MT1>::value )
3490 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3494 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// Four intrinsic vectors of x per step.
3503 const IntrinsicType x1( x.load(i ) );
3504 const IntrinsicType x2( x.load(i1) );
3505 const IntrinsicType x3( x.load(i2) );
3506 const IntrinsicType x4( x.load(i3) );
3507 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
// Two intrinsic vectors of x per step.
3512 const IntrinsicType x1( x.load(i ) );
3513 const IntrinsicType x2( x.load(i1) );
3514 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
// One intrinsic vector of x per step.
3518 const IntrinsicType x1( x.load(i) );
3519 y[j] +=
sum( x1 * A.load(i,j) );
// Scalar remainder.
3522 for( ; remainder && i<iend; ++i ) {
3523 y[j] += x[i] * A(i,j);
// Fallback overload of the BLAS-based scaled assignment kernel. It is enabled when
// UseBlasKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf) and forwards to the large
// assignment kernel.
3544 template<
typename VT1
3548 static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
3549 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3551 selectLargeAssignKernel( y, x, A, scalar );
// BLAS-based scaled assignment kernel, enabled when UseBlasKernel<VT1,VT2,MT1,ST2> holds.
// NOTE(review): the declaration of ET and the else-branch structure are not present in
// this extract.
3570 template<
typename VT1
3574 static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
3575 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Triangular matrices: first store the scaled vector in y, then apply trmv to y in place,
// selecting the lower or upper variant by the matrix type.
3579 if( IsTriangular<MT1>::value ) {
3580 assign( y, scalar * x );
3581 trmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// General matrices: gemv with the factors ET(scalar) and ET(0) (presumably the product
// scaling and the scaling of the existing y, i.e. y is overwritten -- confirm against the
// gemv wrapper).
3584 gemv( y, x, A, ET(scalar), ET(0) );
// Assignment of the scaled vector/matrix multiplication to a sparse vector: the expression
// is first evaluated serially into a temporary of type ResultType, which is then assigned
// to lhs.
3602 template<
typename VT1
3604 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
3614 const ResultType tmp(
serial( rhs ) );
3615 assign( ~lhs, tmp );
// Addition assignment of the scaled vector/matrix multiplication to a dense vector.
// NOTE(review): the body of the empty-matrix branch and the definitions of x and A
// (presumably the evaluated operands of type LT / RT) are not present in this extract.
3631 template<
typename VT1
3633 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Extract the operands of the wrapped multiplication expression.
3639 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3640 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special case: matrix without rows or without columns.
3642 if( right.rows() == 0UL || right.columns() == 0UL ) {
// General case: dispatch to the addition assignment kernel selection, passing the scalar along.
3654 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Selects the kernel for the scaled addition assignment y += x * A * scalar. The small
// kernel is chosen for diagonal matrices, for matrix computations that are not evaluated,
// and for matrices with fewer than TDVECTDMATMULT_THRESHOLD elements; the BLAS kernel
// selection is called for the remaining cases (the separating `else` is not present in
// this extract).
3669 template<
typename VT1
3673 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3675 if( ( IsDiagonal<MT1>::value ) ||
3676 ( IsComputation<MT>::value && !evaluateMatrix ) ||
// Use the threshold of the transpose dense vector/column-major dense matrix product,
// consistent with selectAssignKernel() of this class (previously the threshold of the
// row-major matrix product, TDVECDMATMULT_THRESHOLD, was used here).
3677 ( A.rows() * A.columns() < TDVECTDMATMULT_THRESHOLD ) )
3678 selectSmallAddAssignKernel( y, x, A, scalar );
3680 selectBlasAddAssignKernel( y, x, A, scalar );
// Default kernel for the scaled addition assignment: adds the expression x * A * scalar
// to y through y's addAssign() member.
3698 template<
typename VT1
3702 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3704 y.addAssign( x * A * scalar );
// Fallback overload of the small scaled addition assignment kernel. It is enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf) and forwards to
// the default addition assignment kernel.
3722 template<
typename VT1
3726 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3727 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3729 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized small kernel for the scaled addition assignment y += x * A * scalar, enabled
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds. Columns of A are processed in
// blocks of 8, 4, 3, 2 and finally single columns. Per block, one intrinsic accumulator
// per column collects x1 * A.load(i,column); the horizontal sum times scalar is added to
// y, and a scalar remainder loop adds the rows that are not covered by intrinsic loads.
// NOTE(review): the declarations of i and j, the headers of the vectorized row loops and
// the initialization of the xmm accumulators are not present in this extract.
3748 template<
typename VT1
3752 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3753 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3755 typedef IntrinsicTrait<ElementType> IT;
3757 const size_t M( A.rows() );
3758 const size_t N( A.columns() );
// A scalar remainder loop is needed whenever the vector or the matrix operand is not padded.
3760 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
// --- Blocks of eight columns ---
3764 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the first row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), i.e. rounded down to a multiple of IT::size (assuming IT::size is a
// power of two). The alternative branch of the conditional is not visible here.
3766 const size_t ibegin( ( IsLower<MT1>::value )
3767 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the row range ends at the last column of the block (one less if
// strictly upper). The alternative branch of the conditional is not visible here.
3769 const size_t iend( ( IsUpper<MT1>::value )
3770 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// End of the vectorized range: iend masked down to the intrinsic width if a remainder
// loop is required, otherwise iend itself.
3774 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
// One accumulator per column of the block (declared without explicit initializer;
// presumably default-initialized to zero by IntrinsicType -- confirm).
3777 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3781 const IntrinsicType x1( x.load(i) );
3782 xmm1 = xmm1 + x1 * A.load(i,j );
3783 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3784 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3785 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3786 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3787 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3788 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3789 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Horizontal reduction of each accumulator, scaled and added to the target.
3792 y[j ] +=
sum( xmm1 ) * scalar;
3793 y[j+1UL] +=
sum( xmm2 ) * scalar;
3794 y[j+2UL] +=
sum( xmm3 ) * scalar;
3795 y[j+3UL] +=
sum( xmm4 ) * scalar;
3796 y[j+4UL] +=
sum( xmm5 ) * scalar;
3797 y[j+5UL] +=
sum( xmm6 ) * scalar;
3798 y[j+6UL] +=
sum( xmm7 ) * scalar;
3799 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Scalar remainder: adds the scaled contributions of the rows up to iend.
3801 for( ; remainder && i<iend; ++i ) {
3802 y[j ] += x[i] * A(i,j ) * scalar;
3803 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3804 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3805 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
3806 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
3807 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
3808 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
3809 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
// --- Blocks of four columns (same scheme as above) ---
3813 for( ; (j+4UL) <= N; j+=4UL )
3815 const size_t ibegin( ( IsLower<MT1>::value )
3816 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3818 const size_t iend( ( IsUpper<MT1>::value )
3819 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3823 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3826 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3830 const IntrinsicType x1( x.load(i) );
3831 xmm1 = xmm1 + x1 * A.load(i,j );
3832 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3833 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3834 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3837 y[j ] +=
sum( xmm1 ) * scalar;
3838 y[j+1UL] +=
sum( xmm2 ) * scalar;
3839 y[j+2UL] +=
sum( xmm3 ) * scalar;
3840 y[j+3UL] +=
sum( xmm4 ) * scalar;
// Scalar remainder.
3842 for( ; remainder && i<iend; ++i ) {
3843 y[j ] += x[i] * A(i,j ) * scalar;
3844 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3845 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
3846 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
// --- Blocks of three columns ---
3850 for( ; (j+3UL) <= N; j+=3UL )
3852 const size_t ibegin( ( IsLower<MT1>::value )
3853 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3855 const size_t iend( ( IsUpper<MT1>::value )
3856 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3860 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3863 IntrinsicType xmm1, xmm2, xmm3;
3867 const IntrinsicType x1( x.load(i) );
3868 xmm1 = xmm1 + x1 * A.load(i,j );
3869 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3870 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3873 y[j ] +=
sum( xmm1 ) * scalar;
3874 y[j+1UL] +=
sum( xmm2 ) * scalar;
3875 y[j+2UL] +=
sum( xmm3 ) * scalar;
// Scalar remainder.
3877 for( ; remainder && i<iend; ++i ) {
3878 y[j ] += x[i] * A(i,j ) * scalar;
3879 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
3880 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
// --- Blocks of two columns ---
3884 for( ; (j+2UL) <= N; j+=2UL )
3886 const size_t ibegin( ( IsLower<MT1>::value )
3887 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3889 const size_t iend( ( IsUpper<MT1>::value )
3890 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3894 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3897 IntrinsicType xmm1, xmm2;
3901 const IntrinsicType x1( x.load(i) );
3902 xmm1 = xmm1 + x1 * A.load(i,j );
3903 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3906 y[j ] +=
sum( xmm1 ) * scalar;
3907 y[j+1UL] +=
sum( xmm2 ) * scalar;
// Scalar remainder.
3909 for( ; remainder && i<iend; ++i ) {
3910 y[j ] += x[i] * A(i,j ) * scalar;
3911 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
// --- Single column (the guarding statement and the declaration of xmm1 are not visible) ---
3917 const size_t ibegin( ( IsLower<MT1>::value )
3918 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3920 const size_t iend( ( IsUpper<MT1>::value )
3921 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3925 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
3932 xmm1 = xmm1 + A.load(i,j) * x.load(i);
3935 y[j] +=
sum( xmm1 ) * scalar;
// Scalar remainder.
3937 for( ; remainder && i<iend; ++i ) {
3938 y[j] += x[i] * A(i,j) * scalar;
// Fallback overload of the large scaled addition assignment kernel. It is enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf) and forwards to
// the default addition assignment kernel.
3958 template<
typename VT1
3962 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3963 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3965 selectDefaultAddAssignKernel( y, x, A, scalar );
3984 template<
typename VT1
3988 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3989 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3991 typedef IntrinsicTrait<ElementType> IT;
3993 const size_t M( A.rows() );
3994 const size_t N( A.columns() );
3996 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4000 for( ; (j+8UL) <= N; j+=8UL )
4002 const size_t ibegin( ( IsLower<MT1>::value )
4003 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4005 const size_t iend( ( IsUpper<MT1>::value )
4006 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4010 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4019 const IntrinsicType x1( x.load(i ) );
4020 const IntrinsicType x2( x.load(i1) );
4021 const IntrinsicType x3( x.load(i2) );
4022 const IntrinsicType x4( x.load(i3) );
4023 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4024 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4025 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4026 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4027 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4028 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4029 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4030 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4035 const IntrinsicType x1( x.load(i ) );
4036 const IntrinsicType x2( x.load(i1) );
4037 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4038 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4039 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4040 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4041 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4042 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4043 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4044 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4048 const IntrinsicType x1( x.load(i) );
4049 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4050 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4051 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4052 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4053 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4054 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4055 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4056 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4059 for( ; remainder && i<iend; ++i ) {
4060 y[j ] += x[i] * A(i,j ) * scalar;
4061 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4062 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4063 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4064 y[j+4UL] += x[i] * A(i,j+4UL) * scalar;
4065 y[j+5UL] += x[i] * A(i,j+5UL) * scalar;
4066 y[j+6UL] += x[i] * A(i,j+6UL) * scalar;
4067 y[j+7UL] += x[i] * A(i,j+7UL) * scalar;
4071 for( ; (j+4UL) <= N; j+=4UL )
4073 const size_t ibegin( ( IsLower<MT1>::value )
4074 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4076 const size_t iend( ( IsUpper<MT1>::value )
4077 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4081 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4090 const IntrinsicType x1( x.load(i ) );
4091 const IntrinsicType x2( x.load(i1) );
4092 const IntrinsicType x3( x.load(i2) );
4093 const IntrinsicType x4( x.load(i3) );
4094 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4095 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4096 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4097 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4102 const IntrinsicType x1( x.load(i ) );
4103 const IntrinsicType x2( x.load(i1) );
4104 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4105 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4106 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4107 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4111 const IntrinsicType x1( x.load(i) );
4112 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4113 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4114 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4115 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4118 for( ; remainder && i<iend; ++i ) {
4119 y[j ] += x[i] * A(i,j ) * scalar;
4120 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4121 y[j+2UL] += x[i] * A(i,j+2UL) * scalar;
4122 y[j+3UL] += x[i] * A(i,j+3UL) * scalar;
4126 for( ; (j+2UL) <= N; j+=2UL )
4128 const size_t ibegin( ( IsLower<MT1>::value )
4129 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4131 const size_t iend( ( IsUpper<MT1>::value )
4132 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4136 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4145 const IntrinsicType x1( x.load(i ) );
4146 const IntrinsicType x2( x.load(i1) );
4147 const IntrinsicType x3( x.load(i2) );
4148 const IntrinsicType x4( x.load(i3) );
4149 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4150 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4155 const IntrinsicType x1( x.load(i ) );
4156 const IntrinsicType x2( x.load(i1) );
4157 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4158 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4162 const IntrinsicType x1( x.load(i) );
4163 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4164 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4167 for( ; remainder && i<iend; ++i ) {
4168 y[j ] += x[i] * A(i,j ) * scalar;
4169 y[j+1UL] += x[i] * A(i,j+1UL) * scalar;
4175 const size_t ibegin( ( IsLower<MT1>::value )
4176 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4178 const size_t iend( ( IsUpper<MT1>::value )
4179 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4183 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4192 const IntrinsicType x1( x.load(i ) );
4193 const IntrinsicType x2( x.load(i1) );
4194 const IntrinsicType x3( x.load(i2) );
4195 const IntrinsicType x4( x.load(i3) );
4196 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4201 const IntrinsicType x1( x.load(i ) );
4202 const IntrinsicType x2( x.load(i1) );
4203 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4207 const IntrinsicType x1( x.load(i) );
4208 y[j] +=
sum( x1 * A.load(i,j) ) * scalar;
4211 for( ; remainder && i<iend; ++i ) {
4212 y[j] += x[i] * A(i,j) * scalar;
4233 template<
typename VT1
4237 static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4238 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4240 selectLargeAddAssignKernel( y, x, A, scalar );
4259 template<
typename VT1
4263 static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4264 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4268 if( IsTriangular<MT1>::value ) {
4270 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4271 addAssign( y, tmp );
4274 gemv( y, x, A, ET(scalar), ET(1) );
4296 template<
typename VT1
4298 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
4304 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
4305 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
4307 if( right.rows() == 0UL || right.columns() == 0UL ) {
4319 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
4334 template<
typename VT1
4338 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4340 if( ( IsDiagonal<MT1>::value ) ||
4341 ( IsComputation<MT>::value && !evaluateMatrix ) ||
4342 ( A.rows() * A.columns() < TDVECDMATMULT_THRESHOLD ) )
4343 selectSmallSubAssignKernel( y, x, A, scalar );
4345 selectBlasSubAssignKernel( y, x, A, scalar );
4363 template<
typename VT1
4367 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4369 y.subAssign( x * A * scalar );
4387 template<
typename VT1
4391 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4392 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4394 selectDefaultSubAssignKernel( y, x, A, scalar );
4413 template<
typename VT1
4417 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4418 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4420 typedef IntrinsicTrait<ElementType> IT;
4422 const size_t M( A.rows() );
4423 const size_t N( A.columns() );
4425 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4429 for( ; (j+8UL) <= N; j+=8UL )
4431 const size_t ibegin( ( IsLower<MT1>::value )
4432 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4434 const size_t iend( ( IsUpper<MT1>::value )
4435 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4439 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4442 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4446 const IntrinsicType x1( x.load(i) );
4447 xmm1 = xmm1 + x1 * A.load(i,j );
4448 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4449 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4450 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4451 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
4452 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
4453 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
4454 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
4457 y[j ] -=
sum( xmm1 ) * scalar;
4458 y[j+1UL] -=
sum( xmm2 ) * scalar;
4459 y[j+2UL] -=
sum( xmm3 ) * scalar;
4460 y[j+3UL] -=
sum( xmm4 ) * scalar;
4461 y[j+4UL] -=
sum( xmm5 ) * scalar;
4462 y[j+5UL] -=
sum( xmm6 ) * scalar;
4463 y[j+6UL] -=
sum( xmm7 ) * scalar;
4464 y[j+7UL] -=
sum( xmm8 ) * scalar;
4466 for( ; remainder && i<iend; ++i ) {
4467 y[j ] -= x[i] * A(i,j ) * scalar;
4468 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4469 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4470 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4471 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4472 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4473 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4474 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4478 for( ; (j+4UL) <= N; j+=4UL )
4480 const size_t ibegin( ( IsLower<MT1>::value )
4481 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4483 const size_t iend( ( IsUpper<MT1>::value )
4484 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4488 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4491 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4495 const IntrinsicType x1( x.load(i) );
4496 xmm1 = xmm1 + x1 * A.load(i,j );
4497 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4498 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4499 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4502 y[j ] -=
sum( xmm1 ) * scalar;
4503 y[j+1UL] -=
sum( xmm2 ) * scalar;
4504 y[j+2UL] -=
sum( xmm3 ) * scalar;
4505 y[j+3UL] -=
sum( xmm4 ) * scalar;
4507 for( ; remainder && i<iend; ++i ) {
4508 y[j ] -= x[i] * A(i,j ) * scalar;
4509 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4510 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4511 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4515 for( ; (j+3UL) <= N; j+=3UL )
4517 const size_t ibegin( ( IsLower<MT1>::value )
4518 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4520 const size_t iend( ( IsUpper<MT1>::value )
4521 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4525 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4528 IntrinsicType xmm1, xmm2, xmm3;
4532 const IntrinsicType x1( x.load(i) );
4533 xmm1 = xmm1 + x1 * A.load(i,j );
4534 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4535 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4538 y[j ] -=
sum( xmm1 ) * scalar;
4539 y[j+1UL] -=
sum( xmm2 ) * scalar;
4540 y[j+2UL] -=
sum( xmm3 ) * scalar;
4542 for( ; remainder && i<iend; ++i ) {
4543 y[j ] -= x[i] * A(i,j ) * scalar;
4544 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4545 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4549 for( ; (j+2UL) <= N; j+=2UL )
4551 const size_t ibegin( ( IsLower<MT1>::value )
4552 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4554 const size_t iend( ( IsUpper<MT1>::value )
4555 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4559 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4562 IntrinsicType xmm1, xmm2;
4566 const IntrinsicType x1( x.load(i) );
4567 xmm1 = xmm1 + x1 * A.load(i,j );
4568 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4571 y[j ] -=
sum( xmm1 ) * scalar;
4572 y[j+1UL] -=
sum( xmm2 ) * scalar;
4574 for( ; remainder && i<iend; ++i ) {
4575 y[j ] -= x[i] * A(i,j ) * scalar;
4576 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4582 const size_t ibegin( ( IsLower<MT1>::value )
4583 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4585 const size_t iend( ( IsUpper<MT1>::value )
4586 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4590 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4597 xmm1 = xmm1 + A.load(i,j) * x.load(i);
4600 y[j] -=
sum( xmm1 ) * scalar;
4602 for( ; remainder && i<iend; ++i ) {
4603 y[j] -= x[i] * A(i,j) * scalar;
4623 template<
typename VT1
4627 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4628 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4630 selectDefaultSubAssignKernel( y, x, A, scalar );
4649 template<
typename VT1
4653 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4654 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4656 typedef IntrinsicTrait<ElementType> IT;
4658 const size_t M( A.rows() );
4659 const size_t N( A.columns() );
4661 const bool remainder( !IsPadded<VT2>::value || !IsPadded<MT1>::value );
4665 for( ; (j+8UL) <= N; j+=8UL )
4667 const size_t ibegin( ( IsLower<MT1>::value )
4668 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4670 const size_t iend( ( IsUpper<MT1>::value )
4671 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
4675 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4684 const IntrinsicType x1( x.load(i ) );
4685 const IntrinsicType x2( x.load(i1) );
4686 const IntrinsicType x3( x.load(i2) );
4687 const IntrinsicType x4( x.load(i3) );
4688 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4689 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4690 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4691 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4692 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4693 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4694 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4695 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
4700 const IntrinsicType x1( x.load(i ) );
4701 const IntrinsicType x2( x.load(i1) );
4702 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4703 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4704 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4705 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4706 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4707 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4708 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4709 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
4713 const IntrinsicType x1( x.load(i) );
4714 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4715 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4716 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4717 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4718 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4719 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4720 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4721 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) ) * scalar;
4724 for( ; remainder && i<iend; ++i ) {
4725 y[j ] -= x[i] * A(i,j ) * scalar;
4726 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4727 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4728 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4729 y[j+4UL] -= x[i] * A(i,j+4UL) * scalar;
4730 y[j+5UL] -= x[i] * A(i,j+5UL) * scalar;
4731 y[j+6UL] -= x[i] * A(i,j+6UL) * scalar;
4732 y[j+7UL] -= x[i] * A(i,j+7UL) * scalar;
4736 for( ; (j+4UL) <= N; j+=4UL )
4738 const size_t ibegin( ( IsLower<MT1>::value )
4739 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4741 const size_t iend( ( IsUpper<MT1>::value )
4742 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4746 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4755 const IntrinsicType x1( x.load(i ) );
4756 const IntrinsicType x2( x.load(i1) );
4757 const IntrinsicType x3( x.load(i2) );
4758 const IntrinsicType x4( x.load(i3) );
4759 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4760 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4761 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4762 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4767 const IntrinsicType x1( x.load(i ) );
4768 const IntrinsicType x2( x.load(i1) );
4769 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4770 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4771 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4772 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4776 const IntrinsicType x1( x.load(i) );
4777 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4778 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4779 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4780 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4783 for( ; remainder && i<iend; ++i ) {
4784 y[j ] -= x[i] * A(i,j ) * scalar;
4785 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4786 y[j+2UL] -= x[i] * A(i,j+2UL) * scalar;
4787 y[j+3UL] -= x[i] * A(i,j+3UL) * scalar;
4791 for( ; (j+2UL) <= N; j+=2UL )
4793 const size_t ibegin( ( IsLower<MT1>::value )
4794 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4796 const size_t iend( ( IsUpper<MT1>::value )
4797 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4801 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4810 const IntrinsicType x1( x.load(i ) );
4811 const IntrinsicType x2( x.load(i1) );
4812 const IntrinsicType x3( x.load(i2) );
4813 const IntrinsicType x4( x.load(i3) );
4814 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4815 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4820 const IntrinsicType x1( x.load(i ) );
4821 const IntrinsicType x2( x.load(i1) );
4822 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4823 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4827 const IntrinsicType x1( x.load(i) );
4828 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4829 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4832 for( ; remainder && i<iend; ++i ) {
4833 y[j ] -= x[i] * A(i,j ) * scalar;
4834 y[j+1UL] -= x[i] * A(i,j+1UL) * scalar;
4840 const size_t ibegin( ( IsLower<MT1>::value )
4841 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
4843 const size_t iend( ( IsUpper<MT1>::value )
4844 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4848 const size_t ipos( remainder ? ( iend &
size_t(-
IT::size) ) : iend );
4857 const IntrinsicType x1( x.load(i ) );
4858 const IntrinsicType x2( x.load(i1) );
4859 const IntrinsicType x3( x.load(i2) );
4860 const IntrinsicType x4( x.load(i3) );
4861 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
4866 const IntrinsicType x1( x.load(i ) );
4867 const IntrinsicType x2( x.load(i1) );
4868 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
4872 const IntrinsicType x1( x.load(i) );
4873 y[j] -=
sum( x1 * A.load(i,j) ) * scalar;
4876 for( ; remainder && i<iend; ++i ) {
4877 y[j] -= x[i] * A(i,j) * scalar;
4898 template<
typename VT1
4902 static inline typename DisableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4903 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4905 selectLargeSubAssignKernel( y, x, A, scalar );
4924 template<
typename VT1
4928 static inline typename EnableIf< UseBlasKernel<VT1,VT2,MT1,ST2> >::Type
4929 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4933 if( IsTriangular<MT1>::value ) {
4935 trmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4936 subAssign( y, tmp );
4939 gemv( y, x, A, ET(-scalar), ET(1) );
4961 template<
typename VT1
4963 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
4973 const ResultType tmp(
serial( rhs ) );
4974 multAssign( ~lhs, tmp );
4996 template<
typename VT1
4998 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
4999 smpAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5005 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5006 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5008 if( right.rows() == 0UL ) {
5012 else if( right.columns() == 0UL ) {
5042 template<
typename VT1
5044 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5045 smpAssign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5055 const ResultType tmp( rhs );
5074 template<
typename VT1
5076 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5077 smpAddAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5083 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5084 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5086 if( right.rows() == 0UL || right.columns() == 0UL ) {
5120 template<
typename VT1
5122 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5123 smpSubAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5129 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5130 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5132 if( right.rows() == 0UL || right.columns() == 0UL ) {
5166 template<
typename VT1
5168 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5169 smpMultAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5179 const ResultType tmp( rhs );
5242 template<
typename T1
5244 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
5249 if( (~vec).
size() != (~mat).
rows() ) {
5268 template<
typename VT,
typename MT >
5285 template<
typename VT,
typename MT >
5287 :
public IsTrue< And< IsAligned<VT>, IsAligned<MT> >::value >
5303 template<
typename VT,
typename MT,
bool AF >
5308 typedef typename MultExprTrait< typename SubvectorExprTrait<const VT,AF>::Type
5309 ,
typename SubmatrixExprTrait<const MT,AF>::Type >::Type Type;
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:340
#define BLAZE_THROW_INVALID_ARGUMENT(MESSAGE)
Macro for the emission of a std::invalid_argument exception. This macro encapsulates the default way of...
Definition: Exception.h:187
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:148
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:7820
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:328
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:252
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:207
Header file for the IsDiagonal type trait.
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:208
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the IsSame and IsStrictlySame type traits.
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:507
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2588
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:259
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix)
Returns the current number of rows of the matrix.
Definition: Matrix.h:308
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:205
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:308
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:382
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:204
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:721
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Header file for the RequiresEvaluation type trait.
System settings for performance optimizations.
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:217
TDVecTDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:239
Header file for the VecScalarMultExpr base class.
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:124
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:127
Header file for the IsComplexDouble type trait.
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:214
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:261
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:253
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:201
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: ColumnMajorMatrix.h:79
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecTDMatMultExpr.h:203
Header file for the IsMatMatMultExpr type trait class.
#define BLAZE_THROW_OUT_OF_RANGE(MESSAGE)
Macro for the emission of a std::out_of_range exception. This macro encapsulates the default way of Bla...
Definition: Exception.h:331
Header file for the Columns type trait.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:372
Header file for the IsBlasCompatible type trait.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:199
Header file for the IsAligned type trait.
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraint on the data type.
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:211
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:126
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2586
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:202
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
System settings for the BLAS mode.
ReturnType at(size_t index) const
Checked access to the vector elements.
Definition: TDVecTDMatMultExpr.h:295
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:160
Header file for the SubmatrixExprTrait class template.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:1232
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:125
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Base template for the MultTrait class.
Definition: MultTrait.h:138
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:383
Header file for BLAS triangular matrix/vector multiplication functions (trmv)
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
const bool useOptimizedKernels
Configuration switch for optimized kernels. This configuration switch enables/disables all optimized c...
Definition: Optimizations.h:84
Header file for the reset shim.
Constraint on the data type.
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
Header file for the TVecMatMultExpr base class.
Constraints on the storage order of matrix types.
Header file for the HasMutableDataAccess type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:362
BLAZE_ALWAYS_INLINE int16_t sum(const simd_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Header file for all intrinsic functionality.
Header file for BLAS general matrix/vector multiplication functions (gemv)
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:200
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:166
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:128
Header file for the IsComputation type trait class.
Header file for the IsBuiltin type trait.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:258
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
Header file for the IsComplexFloat type trait.
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2583
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: RowVector.h:79
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Header file for the IsUpper type trait.
Header file for exception macros.
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:129
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:352
EnableIf< IsDenseVector< VT1 > >::Type smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:189
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:318
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Constraint on the transpose flag of vector types.
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.