35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
110 template<
typename VT
112 class TDVecTDMatMultExpr :
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
113 ,
private TVecMatMultExpr
114 ,
private Computation
// Compile-time flag struct. Its value is the logical OR of evaluateVector and
// evaluateMatrix, i.e. it is set as soon as either operand is flagged for evaluation.
// NOTE(review): the template parameter T1 does not appear in the visible expression;
// the consumers of this flag are outside this excerpt.
143 template<
typename T1 >
144 struct UseSMPAssign {
145 enum { value = ( evaluateVector || evaluateMatrix ) };
// Compile-time predicate over three operand types (T1, T2, T3).
// The visible conjunction requires all of:
//   - HasMutableDataAccess for T1 and HasConstDataAccess for T2 and T3,
//   - T3 is not diagonal,
//   - all three types are vectorizable,
//   - all three element types satisfy IsFloat.
// NOTE(review): the line opening the enum (original line 158) and the closing brace of
// the struct are not visible in this excerpt.
156 template<
typename T1,
typename T2,
typename T3 >
157 struct UseSinglePrecisionKernel {
159 HasMutableDataAccess<T1>::value &&
160 HasConstDataAccess<T2>::value &&
161 HasConstDataAccess<T3>::value &&
162 !IsDiagonal<T3>::value &&
163 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
164 IsFloat<typename T1::ElementType>::value &&
165 IsFloat<typename T2::ElementType>::value &&
166 IsFloat<typename T3::ElementType>::value };
// Compile-time predicate over three operand types (T1, T2, T3).
// Same structure as the single precision predicate, but the element types of all three
// operands must satisfy IsDouble. Data access, non-diagonal T3 and vectorizability are
// required as well.
// NOTE(review): the line opening the enum (original line 179) is not visible here.
177 template<
typename T1,
typename T2,
typename T3 >
178 struct UseDoublePrecisionKernel {
180 HasMutableDataAccess<T1>::value &&
181 HasConstDataAccess<T2>::value &&
182 HasConstDataAccess<T3>::value &&
183 !IsDiagonal<T3>::value &&
184 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
185 IsDouble<typename T1::ElementType>::value &&
186 IsDouble<typename T2::ElementType>::value &&
187 IsDouble<typename T3::ElementType>::value };
// Compile-time predicate over three operand types (T1, T2, T3).
// Requires data access on all operands, a non-diagonal T3, vectorizability, and that
// every element type is exactly complex<float> (checked with IsSame against Type).
// NOTE(review): the line opening the enum (original line 201) is not visible here.
198 template<
typename T1,
typename T2,
typename T3 >
199 struct UseSinglePrecisionComplexKernel {
// Element type all three operands must share.
200 typedef complex<float> Type;
202 HasMutableDataAccess<T1>::value &&
203 HasConstDataAccess<T2>::value &&
204 HasConstDataAccess<T3>::value &&
205 !IsDiagonal<T3>::value &&
206 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
207 IsSame<typename T1::ElementType,Type>::value &&
208 IsSame<typename T2::ElementType,Type>::value &&
209 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate over three operand types (T1, T2, T3).
// Requires data access on all operands, a non-diagonal T3, vectorizability, and that
// every element type is exactly complex<double> (checked with IsSame against Type).
// NOTE(review): the line opening the enum (original line 223) is not visible here.
220 template<
typename T1,
typename T2,
typename T3 >
221 struct UseDoublePrecisionComplexKernel {
// Element type all three operands must share.
222 typedef complex<double> Type;
224 HasMutableDataAccess<T1>::value &&
225 HasConstDataAccess<T2>::value &&
226 HasConstDataAccess<T3>::value &&
227 !IsDiagonal<T3>::value &&
228 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
229 IsSame<typename T1::ElementType,Type>::value &&
230 IsSame<typename T2::ElementType,Type>::value &&
231 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate over three operand types (T1, T2, T3).
// The value is set when BLAZE_BLAS_MODE is off, or when none of the four precision
// specific predicates (single, double, single complex, double complex) holds for the
// given operand types.
241 template<
typename T1,
typename T2,
typename T3 >
242 struct UseDefaultKernel {
243 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
244 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
245 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
246 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate over three operand types (T1, T2, T3).
// The value is set when T3 is not diagonal, all three types are vectorizable, all three
// share the same element type, and IntrinsicTrait of that element type reports both
// addition and multiplication.
257 template<
typename T1,
typename T2,
typename T3 >
258 struct UseVectorizedDefaultKernel {
259 enum { value = !IsDiagonal<T3>::value &&
260 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
261 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
262 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
263 IntrinsicTrait<typename T1::ElementType>::addition &&
264 IntrinsicTrait<typename T1::ElementType>::multiplication };
// NOTE(review): the next line is an isolated piece of a flag expression whose beginning
// and end are not visible in this excerpt.
295 VT::vectorizable && MT::vectorizable &&
// smpAssignable is set only when neither operand is flagged for evaluation
// (evaluateVector and evaluateMatrix both false) and both VT and MT report smpAssignable.
301 enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
302 !evaluateMatrix && MT::smpAssignable };
// Fragment of the computation of a single result element at position `index`.
// NOTE(review): the enclosing signature, the branch conditions and the definitions of
// ibegin/iend are not visible in this excerpt.
// Shortcut path: only the (index,index) entry of mat_ contributes.
334 return vec_[index] *
mat_(index,index);
// General path over the row range [ibegin,iend).
// ipos = ibegin + 1 + ((inum-1) with its lowest bit cleared); the loop below, which
// advances two rows at a time, stops there.
344 const size_t inum( iend - ibegin );
345 const size_t ipos( ibegin + ( ( inum - 1UL ) &
size_t(-2) ) + 1UL );
// Seed the accumulator with the product for the first row.
347 ElementType res(
vec_[ibegin] *
mat_(ibegin,index) );
// Walk the following rows in steps of two (loop body not visible here).
349 for(
size_t i=ibegin+1UL; i<ipos; i+=2UL ) {
// Adds the product for row ipos.
// NOTE(review): presumably guarded by a check that ipos is still inside the range;
// the guard is not visible, confirm.
353 res +=
vec_[ipos] *
mat_(ipos,index);
// Returns the column count of the matrix operand (enclosing function not visible here).
366 return mat_.columns();
// Alias query (signature not visible in this excerpt): reports true when either the
// vector operand or the matrix operand reports being aliased with `alias`.
396 template<
typename T >
398 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Second alias query (signature not visible in this excerpt) with the same visible body:
// true when either operand reports being aliased with `alias`.
408 template<
typename T >
410 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// True only when both operands report being aligned (enclosing function not visible).
420 return vec_.isAligned() &&
mat_.isAligned();
// Fragment of the assignment of the expression `rhs` to the target `lhs`.
// NOTE(review): the function signature and the bodies of the two early-exit branches
// are not visible in this excerpt.
456 template<
typename VT1 >
// Special cases: the matrix operand has no rows, or no columns.
463 if( rhs.mat_.rows() == 0UL ) {
467 else if( rhs.mat_.columns() == 0UL ) {
// Evaluate both operands via serial() into the local operands x and A.
471 LT x(
serial( rhs.vec_ ) );
472 RT A(
serial( rhs.mat_ ) );
// Hand over to the kernel selection.
479 TDVecTDMatMultExpr::selectAssignKernel( ~lhs, x, A );
// Kernel selection for the assignment y = x * A.
// Dispatches either to the small kernel or to the BLAS-dispatch kernel.
// NOTE(review): the condition choosing between the two calls (original lines 500-502)
// and the remaining template parameters are not visible in this excerpt.
495 template<
typename VT1
498 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
503 selectSmallAssignKernel( y, x, A );
505 selectBlasAssignKernel( y, x, A );
// Default (non-vectorized) kernel for the assignment y = x * A.
// NOTE(review): only the signature is visible in this excerpt; the body is missing.
524 template<
typename VT1
527 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Small-kernel overload for the assignment, enabled when UseVectorizedDefaultKernel does
// NOT hold for the operand types (DisableIf). It simply forwards to the default kernel.
548 template<
typename VT1
551 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
552 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
554 selectDefaultAssignKernel( y, x, A );
// Vectorized small-kernel overload for the assignment y = x * A, enabled when
// UseVectorizedDefaultKernel holds for the operand types (EnableIf).
//
// Visible structure: the columns of A are processed in chunks of 8, 4, 3 and 2 columns,
// followed by a remainder section. For each chunk the row range [ibegin,iend) is
// restricted for lower/upper matrices, per-column accumulators collect the products
// x.load(i) * A.load(i,j+k) in steps of IT::size rows, and sum() of each accumulator is
// stored into the matching element of y.
// NOTE(review): the else-branches of the ibegin/iend expressions, the declaration of j,
// several braces and the store to y[j] are not visible in this excerpt.
573 template<
typename VT1
576 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
577 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
579 typedef IntrinsicTrait<ElementType> IT;
581 const size_t M( A.rows() );
582 const size_t N( A.columns() );
// --- chunks of 8 columns ---
586 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the start row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), which rounds down to a multiple of IT::size when IT::size is a
// power of two.
588 const size_t ibegin( ( IsLower<MT1>::value )
589 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the end row is bounded by the last column of the chunk.
591 const size_t iend( ( IsUpper<MT1>::value )
592 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// One accumulator per column of the chunk.
596 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
598 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
// The loaded piece of x is reused for all eight columns.
599 const IntrinsicType x1( x.load(i) );
600 xmm1 = xmm1 + x1 * A.load(i,j );
601 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
602 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
603 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
604 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
605 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
606 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
607 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Store the reduced accumulators into y.
611 y[j+1UL] =
sum( xmm2 );
612 y[j+2UL] =
sum( xmm3 );
613 y[j+3UL] =
sum( xmm4 );
614 y[j+4UL] =
sum( xmm5 );
615 y[j+5UL] =
sum( xmm6 );
616 y[j+6UL] =
sum( xmm7 );
617 y[j+7UL] =
sum( xmm8 );
// --- chunks of 4 columns ---
620 for( ; (j+4UL) <= N; j+=4UL )
622 const size_t ibegin( ( IsLower<MT1>::value )
623 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
625 const size_t iend( ( IsUpper<MT1>::value )
626 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
630 IntrinsicType xmm1, xmm2, xmm3, xmm4;
632 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
633 const IntrinsicType x1( x.load(i) );
634 xmm1 = xmm1 + x1 * A.load(i,j );
635 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
636 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
637 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
641 y[j+1UL] =
sum( xmm2 );
642 y[j+2UL] =
sum( xmm3 );
643 y[j+3UL] =
sum( xmm4 );
// --- chunks of 3 columns ---
646 for( ; (j+3UL) <= N; j+=3UL )
648 const size_t ibegin( ( IsLower<MT1>::value )
649 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
651 const size_t iend( ( IsUpper<MT1>::value )
652 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
656 IntrinsicType xmm1, xmm2, xmm3;
658 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
659 const IntrinsicType x1( x.load(i) );
660 xmm1 = xmm1 + x1 * A.load(i,j );
661 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
662 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
666 y[j+1UL] =
sum( xmm2 );
667 y[j+2UL] =
sum( xmm3 );
// --- chunks of 2 columns ---
670 for( ; (j+2UL) <= N; j+=2UL )
672 const size_t ibegin( ( IsLower<MT1>::value )
673 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
675 const size_t iend( ( IsUpper<MT1>::value )
676 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
680 IntrinsicType xmm1, xmm2;
682 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
683 const IntrinsicType x1( x.load(i) );
684 xmm1 = xmm1 + x1 * A.load(i,j );
685 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
689 y[j+1UL] =
sum( xmm2 );
// --- remaining single column (its guard is not visible in this excerpt) ---
694 const size_t ibegin( ( IsLower<MT1>::value )
695 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
697 const size_t iend( ( IsUpper<MT1>::value )
698 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
704 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
705 xmm1 = xmm1 + x.load(i) * A.load(i,j);
// Large-kernel overload for the assignment, enabled when UseVectorizedDefaultKernel does
// NOT hold for the operand types (DisableIf). It simply forwards to the default kernel.
728 template<
typename VT1
731 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
732 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
734 selectDefaultAssignKernel( y, x, A );
// Vectorized large-kernel overload for the assignment y = x * A, enabled when
// UseVectorizedDefaultKernel holds for the operand types (EnableIf).
//
// Visible structure: the columns of A are processed in chunks of 8, 4 and 2 columns,
// followed by a remainder section. Within each chunk the rows are consumed in three
// stages that use four (x1..x4), two (x1,x2) and one (x1) loaded pieces of x per step,
// and the reduced products are added directly onto the elements of y.
// NOTE(review): the row loop headers, the definitions of i1/i2/i3 (presumably the
// following row offsets), the else-branches of ibegin/iend and any initial reset of y
// are not visible in this excerpt; confirm against the full file.
753 template<
typename VT1
756 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
757 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
759 typedef IntrinsicTrait<ElementType> IT;
761 const size_t M( A.rows() );
762 const size_t N( A.columns() );
// --- chunks of 8 columns ---
768 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the start row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), which rounds down to a multiple of IT::size when IT::size is a
// power of two.
770 const size_t ibegin( ( IsLower<MT1>::value )
771 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the end row is bounded by the last column of the chunk.
773 const size_t iend( ( IsUpper<MT1>::value )
774 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// Stage 1: four loaded pieces of x per step.
784 const IntrinsicType x1( x.load(i ) );
785 const IntrinsicType x2( x.load(i1) );
786 const IntrinsicType x3( x.load(i2) );
787 const IntrinsicType x4( x.load(i3) );
788 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
789 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
790 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
791 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
792 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
793 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
794 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
795 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Stage 2: two loaded pieces of x per step.
800 const IntrinsicType x1( x.load(i ) );
801 const IntrinsicType x2( x.load(i1) );
802 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
803 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
804 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
805 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
806 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
807 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
808 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
809 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Stage 3: one loaded piece of x per step.
813 const IntrinsicType x1( x.load(i) );
814 y[j ] +=
sum( x1 * A.load(i,j ) );
815 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
816 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
817 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
818 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
819 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
820 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
821 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// --- chunks of 4 columns ---
825 for( ; (j+4UL) <= N; j+=4UL )
827 const size_t ibegin( ( IsLower<MT1>::value )
828 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
830 const size_t iend( ( IsUpper<MT1>::value )
831 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
// Stage 1: four loaded pieces of x per step.
841 const IntrinsicType x1( x.load(i ) );
842 const IntrinsicType x2( x.load(i1) );
843 const IntrinsicType x3( x.load(i2) );
844 const IntrinsicType x4( x.load(i3) );
845 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
846 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
847 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
848 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
// Stage 2: two loaded pieces of x per step.
853 const IntrinsicType x1( x.load(i ) );
854 const IntrinsicType x2( x.load(i1) );
855 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
856 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
857 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
858 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
// Stage 3: one loaded piece of x per step.
862 const IntrinsicType x1( x.load(i) );
863 y[j ] +=
sum( x1 * A.load(i,j ) );
864 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
865 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
866 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
// --- chunks of 2 columns ---
870 for( ; (j+2UL) <= N; j+=2UL )
872 const size_t ibegin( ( IsLower<MT1>::value )
873 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
875 const size_t iend( ( IsUpper<MT1>::value )
876 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
// Stage 1: four loaded pieces of x per step.
886 const IntrinsicType x1( x.load(i ) );
887 const IntrinsicType x2( x.load(i1) );
888 const IntrinsicType x3( x.load(i2) );
889 const IntrinsicType x4( x.load(i3) );
890 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
891 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
// Stage 2: two loaded pieces of x per step.
896 const IntrinsicType x1( x.load(i ) );
897 const IntrinsicType x2( x.load(i1) );
898 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
899 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
// Stage 3: one loaded piece of x per step.
903 const IntrinsicType x1( x.load(i) );
904 y[j ] +=
sum( x1 * A.load(i,j ) );
905 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
// --- remaining single column (its guard is not visible in this excerpt) ---
911 const size_t ibegin( ( IsLower<MT1>::value )
912 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
914 const size_t iend( ( IsUpper<MT1>::value )
915 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
// Stage 1: four loaded pieces of x per step.
925 const IntrinsicType x1( x.load(i ) );
926 const IntrinsicType x2( x.load(i1) );
927 const IntrinsicType x3( x.load(i2) );
928 const IntrinsicType x4( x.load(i3) );
929 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
// Stage 2: two loaded pieces of x per step.
934 const IntrinsicType x1( x.load(i ) );
935 const IntrinsicType x2( x.load(i1) );
936 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
// Stage 3: one loaded piece of x per step.
940 const IntrinsicType x1( x.load(i) );
941 y[j] +=
sum( x1 * A.load(i,j) );
// BLAS-dispatch overload for the assignment, enabled when UseDefaultKernel holds for the
// operand types. No BLAS routine is called; the work is forwarded to the large kernel.
962 template<
typename VT1
965 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
966 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
968 selectLargeAssignKernel( y, x, A );
// BLAS-dispatch overload for the assignment, enabled when UseSinglePrecisionKernel holds.
// Triangular matrices go through strmv, with the lower/upper flag chosen from
// IsLower<MT1>; all other matrices go through sgemv.
// NOTE(review): the statement between the `if` and the strmv call (original line 995)
// and the `else` are not visible; the trailing sgemv scalars 1.0F and 0.0F are
// presumably the alpha and beta factors, confirm against the wrapper's signature.
988 template<
typename VT1
991 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
992 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
994 if( IsTriangular<MT1>::value ) {
996 strmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
999 sgemv( y, x, A, 1.0F, 0.0F );
// BLAS-dispatch overload for the assignment, enabled when UseDoublePrecisionKernel holds.
// Triangular matrices go through dtrmv, with the lower/upper flag chosen from
// IsLower<MT1>; all other matrices go through dgemv.
// NOTE(review): the statement preceding the dtrmv call and the `else` are not visible;
// the trailing dgemv scalars 1.0 and 0.0 are presumably alpha and beta, confirm.
1021 template<
typename VT1
1024 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1025 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1027 if( IsTriangular<MT1>::value ) {
1029 dtrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1032 dgemv( y, x, A, 1.0, 0.0 );
// BLAS-dispatch overload for the assignment, enabled when
// UseSinglePrecisionComplexKernel holds. Triangular matrices go through ctrmv, with the
// lower/upper flag chosen from IsLower<MT1>; all other matrices go through cgemv.
// NOTE(review): the statement preceding the ctrmv call and the `else` are not visible;
// the trailing complex scalars (1,0) and (0,0) are presumably alpha and beta, confirm.
1054 template<
typename VT1
1057 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1058 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1060 if( IsTriangular<MT1>::value ) {
1062 ctrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1065 cgemv( y, x, A, complex<float>( 1.0F, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// BLAS-dispatch overload for the assignment, enabled when
// UseDoublePrecisionComplexKernel holds. Triangular matrices go through ztrmv, with the
// lower/upper flag chosen from IsLower<MT1>; all other matrices go through zgemv.
// NOTE(review): the statement preceding the ztrmv call and the `else` are not visible;
// the trailing complex scalars (1,0) and (0,0) are presumably alpha and beta, confirm.
1087 template<
typename VT1
1090 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1091 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1093 if( IsTriangular<MT1>::value ) {
1095 ztrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1098 zgemv( y, x, A, complex<double>( 1.0, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Fragment of a function that first evaluates the whole expression `rhs` via serial()
// into a temporary of type ResultType.
// NOTE(review): the signature and the use of `tmp` are not visible in this excerpt.
1118 template<
typename VT1 >
1129 const ResultType tmp(
serial( rhs ) );
// Fragment of the addition assignment of the expression `rhs` to the target `lhs`.
// NOTE(review): the function signature and the body of the early-exit branch are not
// visible in this excerpt.
1148 template<
typename VT1 >
// Special case: the matrix operand has no rows or no columns.
1155 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Evaluate both operands via serial() into the local operands x and A.
1159 LT x(
serial( rhs.vec_ ) );
1160 RT A(
serial( rhs.mat_ ) );
// Hand over to the kernel selection.
1167 TDVecTDMatMultExpr::selectAddAssignKernel( ~lhs, x, A );
// Kernel selection for the addition assignment y += x * A.
// The small kernel is chosen when the matrix type is diagonal, or when the matrix
// operand is a computation that is not flagged for evaluation, or when a third condition
// holds; otherwise the BLAS-dispatch kernel is used.
// NOTE(review): the third condition (original line 1190) and the `else` are not visible.
1183 template<
typename VT1
1186 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1188 if( ( IsDiagonal<MT1>::value ) ||
1189 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1191 selectSmallAddAssignKernel( y, x, A );
1193 selectBlasAddAssignKernel( y, x, A );
// Default kernel for the addition assignment: delegates to y.addAssign() with the
// product expression x * A.
1212 template<
typename VT1
1215 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1217 y.addAssign( x * A );
// Small-kernel overload for the addition assignment, enabled when
// UseVectorizedDefaultKernel does NOT hold (DisableIf). Forwards to the default kernel.
1236 template<
typename VT1
1239 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1240 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1242 selectDefaultAddAssignKernel( y, x, A );
// Vectorized small-kernel overload for the addition assignment y += x * A, enabled when
// UseVectorizedDefaultKernel holds for the operand types (EnableIf).
//
// Visible structure: the columns of A are processed in chunks of 8, 4, 3 and 2 columns,
// followed by a remainder section. For each chunk the row range [ibegin,iend) is
// restricted for lower/upper matrices, per-column accumulators collect the products
// x.load(i) * A.load(i,j+k) in steps of IT::size rows, and sum() of each accumulator is
// added onto the matching element of y.
// NOTE(review): the else-branches of the ibegin/iend expressions, the declaration of j
// and several braces are not visible in this excerpt.
1262 template<
typename VT1
1265 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1266 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1268 typedef IntrinsicTrait<ElementType> IT;
1270 const size_t M( A.rows() );
1271 const size_t N( A.columns() );
// --- chunks of 8 columns ---
1275 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the start row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), which rounds down to a multiple of IT::size when IT::size is a
// power of two.
1277 const size_t ibegin( ( IsLower<MT1>::value )
1278 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the end row is bounded by the last column of the chunk.
1280 const size_t iend( ( IsUpper<MT1>::value )
1281 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// One accumulator per column of the chunk.
1285 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1287 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
// The loaded piece of x is reused for all eight columns.
1288 const IntrinsicType x1( x.load(i) );
1289 xmm1 = xmm1 + x1 * A.load(i,j );
1290 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1291 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1292 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1293 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1294 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1295 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1296 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Add the reduced accumulators onto y.
1299 y[j ] +=
sum( xmm1 );
1300 y[j+1UL] +=
sum( xmm2 );
1301 y[j+2UL] +=
sum( xmm3 );
1302 y[j+3UL] +=
sum( xmm4 );
1303 y[j+4UL] +=
sum( xmm5 );
1304 y[j+5UL] +=
sum( xmm6 );
1305 y[j+6UL] +=
sum( xmm7 );
1306 y[j+7UL] +=
sum( xmm8 );
// --- chunks of 4 columns ---
1309 for( ; (j+4UL) <= N; j+=4UL )
1311 const size_t ibegin( ( IsLower<MT1>::value )
1312 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1314 const size_t iend( ( IsUpper<MT1>::value )
1315 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1319 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1321 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
1322 const IntrinsicType x1( x.load(i) );
1323 xmm1 = xmm1 + x1 * A.load(i,j );
1324 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1325 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1326 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1329 y[j ] +=
sum( xmm1 );
1330 y[j+1UL] +=
sum( xmm2 );
1331 y[j+2UL] +=
sum( xmm3 );
1332 y[j+3UL] +=
sum( xmm4 );
// --- chunks of 3 columns ---
1335 for( ; (j+3UL) <= N; j+=3UL )
1337 const size_t ibegin( ( IsLower<MT1>::value )
1338 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1340 const size_t iend( ( IsUpper<MT1>::value )
1341 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
1345 IntrinsicType xmm1, xmm2, xmm3;
1347 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
1348 const IntrinsicType x1( x.load(i) );
1349 xmm1 = xmm1 + x1 * A.load(i,j );
1350 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1351 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1354 y[j ] +=
sum( xmm1 );
1355 y[j+1UL] +=
sum( xmm2 );
1356 y[j+2UL] +=
sum( xmm3 );
// --- chunks of 2 columns ---
1359 for( ; (j+2UL) <= N; j+=2UL )
1361 const size_t ibegin( ( IsLower<MT1>::value )
1362 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1364 const size_t iend( ( IsUpper<MT1>::value )
1365 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
1369 IntrinsicType xmm1, xmm2;
1371 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
1372 const IntrinsicType x1( x.load(i) );
1373 xmm1 = xmm1 + x1 * A.load(i,j );
1374 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1377 y[j ] +=
sum( xmm1 );
1378 y[j+1UL] +=
sum( xmm2 );
// --- remaining single column (its guard is not visible in this excerpt) ---
1383 const size_t ibegin( ( IsLower<MT1>::value )
1384 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1386 const size_t iend( ( IsUpper<MT1>::value )
1387 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
1393 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
1394 xmm1 = xmm1 + A.load(i,j) * x.load(i);
1397 y[j] +=
sum( xmm1 );
// Large-kernel overload for the addition assignment, enabled when
// UseVectorizedDefaultKernel does NOT hold (DisableIf). Forwards to the default kernel.
1417 template<
typename VT1
1420 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1421 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1423 selectDefaultAddAssignKernel( y, x, A );
// Vectorized large-kernel overload for the addition assignment y += x * A, enabled when
// UseVectorizedDefaultKernel holds for the operand types (EnableIf).
//
// Visible structure: the columns of A are processed in chunks of 8, 4 and 2 columns,
// followed by a remainder section. Within each chunk the rows are consumed in three
// stages that use four (x1..x4), two (x1,x2) and one (x1) loaded pieces of x per step,
// and the reduced products are added directly onto the elements of y.
// NOTE(review): the row loop headers, the definitions of i1/i2/i3 (presumably the
// following row offsets) and the else-branches of ibegin/iend are not visible in this
// excerpt; confirm against the full file.
1443 template<
typename VT1
1446 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1447 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1449 typedef IntrinsicTrait<ElementType> IT;
1451 const size_t M( A.rows() );
1452 const size_t N( A.columns() );
// --- chunks of 8 columns ---
1456 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the start row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), which rounds down to a multiple of IT::size when IT::size is a
// power of two.
1458 const size_t ibegin( ( IsLower<MT1>::value )
1459 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the end row is bounded by the last column of the chunk.
1461 const size_t iend( ( IsUpper<MT1>::value )
1462 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// Stage 1: four loaded pieces of x per step.
1472 const IntrinsicType x1( x.load(i ) );
1473 const IntrinsicType x2( x.load(i1) );
1474 const IntrinsicType x3( x.load(i2) );
1475 const IntrinsicType x4( x.load(i3) );
1476 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1477 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1478 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1479 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
1480 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
1481 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
1482 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
1483 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Stage 2: two loaded pieces of x per step.
1488 const IntrinsicType x1( x.load(i ) );
1489 const IntrinsicType x2( x.load(i1) );
1490 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1491 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1492 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1493 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
1494 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
1495 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
1496 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
1497 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Stage 3: one loaded piece of x per step.
1501 const IntrinsicType x1( x.load(i) );
1502 y[j ] +=
sum( x1 * A.load(i,j ) );
1503 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1504 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1505 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
1506 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
1507 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
1508 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
1509 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// --- chunks of 4 columns ---
1513 for( ; (j+4UL) <= N; j+=4UL )
1515 const size_t ibegin( ( IsLower<MT1>::value )
1516 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1518 const size_t iend( ( IsUpper<MT1>::value )
1519 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
// Stage 1: four loaded pieces of x per step.
1529 const IntrinsicType x1( x.load(i ) );
1530 const IntrinsicType x2( x.load(i1) );
1531 const IntrinsicType x3( x.load(i2) );
1532 const IntrinsicType x4( x.load(i3) );
1533 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1534 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
1535 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
1536 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
// Stage 2: two loaded pieces of x per step.
1541 const IntrinsicType x1( x.load(i ) );
1542 const IntrinsicType x2( x.load(i1) );
1543 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1544 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
1545 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
1546 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
// Stage 3: one loaded piece of x per step.
1550 const IntrinsicType x1( x.load(i) );
1551 y[j ] +=
sum( x1 * A.load(i,j ) );
1552 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
1553 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
1554 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
// --- chunks of 2 columns ---
1558 for( ; (j+2UL) <= N; j+=2UL )
1560 const size_t ibegin( ( IsLower<MT1>::value )
1561 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1563 const size_t iend( ( IsUpper<MT1>::value )
1564 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
// Stage 1: four loaded pieces of x per step.
1574 const IntrinsicType x1( x.load(i ) );
1575 const IntrinsicType x2( x.load(i1) );
1576 const IntrinsicType x3( x.load(i2) );
1577 const IntrinsicType x4( x.load(i3) );
1578 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
1579 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
// Stage 2: two loaded pieces of x per step.
1584 const IntrinsicType x1( x.load(i ) );
1585 const IntrinsicType x2( x.load(i1) );
1586 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
1587 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
// Stage 3: one loaded piece of x per step.
1591 const IntrinsicType x1( x.load(i) );
1592 y[j ] +=
sum( x1 * A.load(i,j ) );
1593 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
// --- remaining single column (its guard is not visible in this excerpt) ---
1599 const size_t ibegin( ( IsLower<MT1>::value )
1600 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1602 const size_t iend( ( IsUpper<MT1>::value )
1603 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
// Stage 1: four loaded pieces of x per step.
1613 const IntrinsicType x1( x.load(i ) );
1614 const IntrinsicType x2( x.load(i1) );
1615 const IntrinsicType x3( x.load(i2) );
1616 const IntrinsicType x4( x.load(i3) );
1617 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
// Stage 2: two loaded pieces of x per step.
1622 const IntrinsicType x1( x.load(i ) );
1623 const IntrinsicType x2( x.load(i1) );
1624 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
// Stage 3: one loaded piece of x per step.
1628 const IntrinsicType x1( x.load(i) );
1629 y[j] +=
sum( x1 * A.load(i,j) );
// BLAS-dispatch overload for the addition assignment, enabled when UseDefaultKernel
// holds. No BLAS routine is called; the work is forwarded to the large kernel.
1650 template<
typename VT1
1653 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1654 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1656 selectLargeAddAssignKernel( y, x, A );
// BLAS-dispatch overload for the addition assignment, enabled when
// UseSinglePrecisionKernel holds. Triangular matrices go through strmv applied to a
// temporary `tmp`; all other matrices go through sgemv.
// NOTE(review): the declaration of `tmp`, the statement that combines it with y and the
// `else` are not visible; the trailing sgemv scalars 1.0F and 1.0F are presumably alpha
// and beta (beta = 1 keeping the existing contents of y), confirm.
1676 template<
typename VT1
1679 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1680 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1682 if( IsTriangular<MT1>::value ) {
1684 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1688 sgemv( y, x, A, 1.0F, 1.0F );
// BLAS-dispatch overload for the addition assignment, enabled when
// UseDoublePrecisionKernel holds. Triangular matrices go through dtrmv applied to a
// temporary `tmp`; all other matrices go through dgemv.
// NOTE(review): the declaration of `tmp`, the statement that combines it with y and the
// `else` are not visible; the trailing dgemv scalars 1.0 and 1.0 are presumably alpha
// and beta, confirm.
1710 template<
typename VT1
1713 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1714 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1716 if( IsTriangular<MT1>::value ) {
1718 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1722 dgemv( y, x, A, 1.0, 1.0 );
// BLAS-dispatch overload for the addition assignment, enabled when
// UseSinglePrecisionComplexKernel holds. Triangular matrices go through ctrmv applied to
// a temporary `tmp`; all other matrices go through cgemv.
// NOTE(review): the declaration of `tmp`, the statement that combines it with y and the
// `else` are not visible; the trailing complex scalars (1,0) and (1,0) are presumably
// alpha and beta, confirm.
1744 template<
typename VT1
1747 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1748 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1750 if( IsTriangular<MT1>::value ) {
1752 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1756 cgemv( y, x, A, complex<float>( 1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-dispatch overload for the addition assignment, enabled when
// UseDoublePrecisionComplexKernel holds. Triangular matrices go through ztrmv applied to
// a temporary `tmp`; all other matrices go through zgemv.
// NOTE(review): the declaration of `tmp`, the statement that combines it with y and the
// `else` are not visible; the trailing complex scalars (1,0) and (1,0) are presumably
// alpha and beta, confirm.
1778 template<
typename VT1
1781 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1782 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1784 if( IsTriangular<MT1>::value ) {
1786 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
1790 zgemv( y, x, A, complex<double>( 1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Fragment of the subtraction assignment of the expression `rhs` to the target `lhs`.
// NOTE(review): the function signature and the body of the early-exit branch are not
// visible in this excerpt.
1814 template<
typename VT1 >
// Special case: the matrix operand has no rows or no columns.
1821 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Evaluate both operands via serial() into the local operands x and A.
1825 LT x(
serial( rhs.vec_ ) );
1826 RT A(
serial( rhs.mat_ ) );
// Hand over to the kernel selection.
1833 TDVecTDMatMultExpr::selectSubAssignKernel( ~lhs, x, A );
// Kernel selection for the subtraction assignment y -= x * A.
// The small kernel is chosen when the matrix type is diagonal, or when the matrix
// operand is a computation that is not flagged for evaluation, or when a third condition
// holds; otherwise the BLAS-dispatch kernel is used.
// NOTE(review): the third condition (original line 1856) and the `else` are not visible.
1849 template<
typename VT1
1852 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1854 if( ( IsDiagonal<MT1>::value ) ||
1855 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1857 selectSmallSubAssignKernel( y, x, A );
1859 selectBlasSubAssignKernel( y, x, A );
// Default kernel for the subtraction assignment: delegates to y.subAssign() with the
// product expression x * A.
1878 template<
typename VT1
1881 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1883 y.subAssign( x * A );
// Small-kernel overload for the subtraction assignment, enabled when
// UseVectorizedDefaultKernel does NOT hold (DisableIf). Forwards to the default kernel.
1902 template<
typename VT1
1905 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1906 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1908 selectDefaultSubAssignKernel( y, x, A );
// Vectorized small-kernel overload for the subtraction assignment y -= x * A, enabled
// when UseVectorizedDefaultKernel holds for the operand types (EnableIf).
//
// Visible structure: the columns of A are processed in chunks of 8, 4, 3 and 2 columns,
// followed by a remainder section. For each chunk the row range [ibegin,iend) is
// restricted for lower/upper matrices, per-column accumulators collect the products
// x.load(i) * A.load(i,j+k) in steps of IT::size rows, and sum() of each accumulator is
// subtracted from the matching element of y.
// NOTE(review): the else-branches of the ibegin/iend expressions, the declaration of j
// and several braces are not visible in this excerpt.
1928 template<
typename VT1
1931 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1932 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1934 typedef IntrinsicTrait<ElementType> IT;
1936 const size_t M( A.rows() );
1937 const size_t N( A.columns() );
// --- chunks of 8 columns ---
1941 for( ; (j+8UL) <= N; j+=8UL )
// For lower matrices the start row is j (j+1 if strictly lower), masked with
// size_t(-IT::size), which rounds down to a multiple of IT::size when IT::size is a
// power of two.
1943 const size_t ibegin( ( IsLower<MT1>::value )
1944 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// For upper matrices the end row is bounded by the last column of the chunk.
1946 const size_t iend( ( IsUpper<MT1>::value )
1947 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// One accumulator per column of the chunk.
1951 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1953 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
// The loaded piece of x is reused for all eight columns.
1954 const IntrinsicType x1( x.load(i) );
1955 xmm1 = xmm1 + x1 * A.load(i,j );
1956 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1957 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1958 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1959 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
1960 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
1961 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
1962 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Subtract the reduced accumulators from y.
1965 y[j ] -=
sum( xmm1 );
1966 y[j+1UL] -=
sum( xmm2 );
1967 y[j+2UL] -=
sum( xmm3 );
1968 y[j+3UL] -=
sum( xmm4 );
1969 y[j+4UL] -=
sum( xmm5 );
1970 y[j+5UL] -=
sum( xmm6 );
1971 y[j+6UL] -=
sum( xmm7 );
1972 y[j+7UL] -=
sum( xmm8 );
// --- chunks of 4 columns ---
1975 for( ; (j+4UL) <= N; j+=4UL )
1977 const size_t ibegin( ( IsLower<MT1>::value )
1978 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
1980 const size_t iend( ( IsUpper<MT1>::value )
1981 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
1985 IntrinsicType xmm1, xmm2, xmm3, xmm4;
1987 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
1988 const IntrinsicType x1( x.load(i) );
1989 xmm1 = xmm1 + x1 * A.load(i,j );
1990 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
1991 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
1992 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
1995 y[j ] -=
sum( xmm1 );
1996 y[j+1UL] -=
sum( xmm2 );
1997 y[j+2UL] -=
sum( xmm3 );
1998 y[j+3UL] -=
sum( xmm4 );
// --- chunks of 3 columns ---
2001 for( ; (j+3UL) <= N; j+=3UL )
2003 const size_t ibegin( ( IsLower<MT1>::value )
2004 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2006 const size_t iend( ( IsUpper<MT1>::value )
2007 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
2011 IntrinsicType xmm1, xmm2, xmm3;
2013 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
2014 const IntrinsicType x1( x.load(i) );
2015 xmm1 = xmm1 + x1 * A.load(i,j );
2016 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2017 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
2020 y[j ] -=
sum( xmm1 );
2021 y[j+1UL] -=
sum( xmm2 );
2022 y[j+2UL] -=
sum( xmm3 );
// --- chunks of 2 columns ---
2025 for( ; (j+2UL) <= N; j+=2UL )
2027 const size_t ibegin( ( IsLower<MT1>::value )
2028 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2030 const size_t iend( ( IsUpper<MT1>::value )
2031 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
2035 IntrinsicType xmm1, xmm2;
2037 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
2038 const IntrinsicType x1( x.load(i) );
2039 xmm1 = xmm1 + x1 * A.load(i,j );
2040 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
2043 y[j ] -=
sum( xmm1 );
2044 y[j+1UL] -=
sum( xmm2 );
// --- remaining single column (its guard is not visible in this excerpt) ---
2049 const size_t ibegin( ( IsLower<MT1>::value )
2050 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2052 const size_t iend( ( IsUpper<MT1>::value )
2053 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
2059 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
2060 xmm1 = xmm1 + A.load(i,j) * x.load(i);
2063 y[j] -=
sum( xmm1 );
// Large-kernel overload for the subtraction assignment, enabled when
// UseVectorizedDefaultKernel does NOT hold (DisableIf). Forwards to the default kernel.
2083 template<
typename VT1
2086 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
2087 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2089 selectDefaultSubAssignKernel( y, x, A );
// Vectorized overload of selectLargeSubAssignKernel, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds. Columns of A are processed in groups
// of 8, 4 and 2, followed by a single-column section; for every column j the horizontal
// sum of the SIMD products of x and the entries of A is subtracted from y[j].
// NOTE(review): braces, the row loops and the definitions of j, i, i1, i2, i3 are on
// lines that are not part of this excerpt; the comments below only describe what the
// visible lines show.
2109 template<
typename VT1
2112 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
2113 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
2115 typedef IntrinsicTrait<ElementType> IT;
// Row and column counts of the matrix operand.
2117 const size_t M( A.rows() );
2118 const size_t N( A.columns() );
// --- Eight columns per pass ---
2122 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with size_t(-IT::size);
// this rounds down to a multiple of IT::size provided IT::size is a power of two.
// The alternative for non-lower matrices is on a line not shown here.
2124 const size_t ibegin( ( IsLower<MT1>::value )
2125 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper), i.e. at the last column of the group.
2127 const size_t iend( ( IsUpper<MT1>::value )
2128 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// Four SIMD loads of x (offsets i, i1, i2, i3) combined with the matching loads of A.
// presumably i1/i2/i3 are successive IT::size-strided offsets from i — TODO confirm.
2138 const IntrinsicType x1( x.load(i ) );
2139 const IntrinsicType x2( x.load(i1) );
2140 const IntrinsicType x3( x.load(i2) );
2141 const IntrinsicType x4( x.load(i3) );
2142 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2143 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2144 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2145 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
2146 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
2147 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
2148 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
2149 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Same update with two SIMD loads of x per step.
2154 const IntrinsicType x1( x.load(i ) );
2155 const IntrinsicType x2( x.load(i1) );
2156 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2157 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2158 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2159 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
2160 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
2161 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
2162 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
2163 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Same update with a single SIMD load of x per step.
2167 const IntrinsicType x1( x.load(i) );
2168 y[j ] -=
sum( x1 * A.load(i,j ) );
2169 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2170 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2171 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
2172 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) );
2173 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) );
2174 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) );
2175 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) );
// --- Four columns per pass (same scheme; upper bound is j+4, or j+3 if strictly upper) ---
2179 for( ; (j+4UL) <= N; j+=4UL )
2181 const size_t ibegin( ( IsLower<MT1>::value )
2182 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2184 const size_t iend( ( IsUpper<MT1>::value )
2185 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
// Four SIMD loads of x per step.
2195 const IntrinsicType x1( x.load(i ) );
2196 const IntrinsicType x2( x.load(i1) );
2197 const IntrinsicType x3( x.load(i2) );
2198 const IntrinsicType x4( x.load(i3) );
2199 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2200 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
2201 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
2202 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
// Two SIMD loads of x per step.
2207 const IntrinsicType x1( x.load(i ) );
2208 const IntrinsicType x2( x.load(i1) );
2209 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2210 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
2211 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
2212 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
// One SIMD load of x per step.
2216 const IntrinsicType x1( x.load(i) );
2217 y[j ] -=
sum( x1 * A.load(i,j ) );
2218 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
2219 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) );
2220 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) );
// --- Two columns per pass (upper bound is j+2, or j+1 if strictly upper) ---
2224 for( ; (j+2UL) <= N; j+=2UL )
2226 const size_t ibegin( ( IsLower<MT1>::value )
2227 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2229 const size_t iend( ( IsUpper<MT1>::value )
2230 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
// Four SIMD loads of x per step.
2240 const IntrinsicType x1( x.load(i ) );
2241 const IntrinsicType x2( x.load(i1) );
2242 const IntrinsicType x3( x.load(i2) );
2243 const IntrinsicType x4( x.load(i3) );
2244 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
2245 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
// Two SIMD loads of x per step.
2250 const IntrinsicType x1( x.load(i ) );
2251 const IntrinsicType x2( x.load(i1) );
2252 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
2253 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
// One SIMD load of x per step.
2257 const IntrinsicType x1( x.load(i) );
2258 y[j ] -=
sum( x1 * A.load(i,j ) );
2259 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) );
// --- Single-column section (its guarding condition is not visible in this excerpt) ---
// Upper bound is j+1, or j if strictly upper.
2265 const size_t ibegin( ( IsLower<MT1>::value )
2266 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
2268 const size_t iend( ( IsUpper<MT1>::value )
2269 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
// Four SIMD loads of x per step.
2279 const IntrinsicType x1( x.load(i ) );
2280 const IntrinsicType x2( x.load(i1) );
2281 const IntrinsicType x3( x.load(i2) );
2282 const IntrinsicType x4( x.load(i3) );
2283 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
// Two SIMD loads of x per step.
2288 const IntrinsicType x1( x.load(i ) );
2289 const IntrinsicType x2( x.load(i1) );
2290 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
// One SIMD load of x per step.
2294 const IntrinsicType x1( x.load(i) );
2295 y[j] -=
sum( x1 * A.load(i,j) );
// Overload of selectBlasSubAssignKernel that is enabled when UseDefaultKernel<VT1,VT2,MT1>
// holds, i.e. when none of the BLAS-specific overloads is selected.
//   y: target vector, x: vector operand, A: matrix operand.
2316 template<
typename VT1
2319 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
2320 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// No BLAS routine is used on this path; defer to the large (non-BLAS) kernel.
2322 selectLargeSubAssignKernel( y, x, A );
// Single-precision overload of selectBlasSubAssignKernel, enabled by
// UseSinglePrecisionKernel<VT1,VT2,MT1>.
//   y: target vector, x: vector operand, A: matrix operand.
2342 template<
typename VT1
2345 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
2346 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Triangular matrices take the strmv route on `tmp`, with the lower/upper flag derived
// from IsLower<MT1>. The creation of `tmp` and how it is combined with y are on lines
// not shown in this excerpt.
2348 if( IsTriangular<MT1>::value ) {
2350 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise sgemv is called with the factors -1.0F and 1.0F.
// NOTE(review): presumably these are alpha = -1 and beta = 1, giving y -= x*A — confirm
// against the sgemv wrapper's parameter order.
2354 sgemv( y, x, A, -1.0F, 1.0F );
// Double-precision overload of selectBlasSubAssignKernel, enabled by
// UseDoublePrecisionKernel<VT1,VT2,MT1>.
//   y: target vector, x: vector operand, A: matrix operand.
2376 template<
typename VT1
2379 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
2380 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Triangular matrices take the dtrmv route on `tmp` (lower/upper chosen via IsLower<MT1>);
// the setup of `tmp` and the final update of y are on lines not shown in this excerpt.
2382 if( IsTriangular<MT1>::value ) {
2384 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise dgemv is called with the factors -1.0 and 1.0.
// NOTE(review): presumably alpha = -1 and beta = 1, giving y -= x*A — confirm against
// the dgemv wrapper's parameter order.
2388 dgemv( y, x, A, -1.0, 1.0 );
// Single-precision complex overload of selectBlasSubAssignKernel, enabled by
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1>.
//   y: target vector, x: vector operand, A: matrix operand.
2410 template<
typename VT1
2413 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2414 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Triangular matrices take the ctrmv route on `tmp` (lower/upper chosen via IsLower<MT1>);
// the setup of `tmp` and the final update of y are on lines not shown in this excerpt.
2416 if( IsTriangular<MT1>::value ) {
2418 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise cgemv is called with the complex factors (-1,0) and (1,0).
// NOTE(review): presumably alpha and beta, giving y -= x*A — confirm against the wrapper.
2422 cgemv( y, x, A, complex<float>( -1.0F, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// Double-precision complex overload of selectBlasSubAssignKernel, enabled by
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1>.
//   y: target vector, x: vector operand, A: matrix operand.
2444 template<
typename VT1
2447 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2448 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Triangular matrices take the ztrmv route on `tmp` (lower/upper chosen via IsLower<MT1>);
// the setup of `tmp` and the final update of y are on lines not shown in this excerpt.
2450 if( IsTriangular<MT1>::value ) {
2452 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise zgemv is called with the complex factors (-1,0) and (1,0).
// NOTE(review): presumably alpha and beta, giving y -= x*A — confirm against the wrapper.
2456 zgemv( y, x, A, complex<double>( -1.0, 0.0 ), complex<double>( 1.0, 0.0 ) );
2480 template<
typename VT1 >
2491 const ResultType tmp(
serial( rhs ) );
2516 template<
typename VT1 >
2517 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2524 if( rhs.mat_.rows() == 0UL ) {
2528 else if( rhs.mat_.columns() == 0UL ) {
2560 template<
typename VT1 >
2561 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2572 const ResultType tmp( rhs );
2593 template<
typename VT1 >
2594 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2601 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2637 template<
typename VT1 >
2638 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2645 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
2681 template<
typename VT1 >
2682 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
2693 const ResultType tmp( rhs );
2732 template<
typename VT
2736 :
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
2737 ,
private VecScalarMultExpr
2738 ,
private Computation
// Shorthand for the wrapped vector/matrix multiplication expression type.
2742 typedef TDVecTDMatMultExpr<VT,MT> VMM;
// Set when the vector operand is itself a computation or requires evaluation.
2754 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Set when the matrix operand requires evaluation, or when it is a computation whose
// element type MET equals VET and is BLAS compatible.
2759 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
2760 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compile-time switch used with EnableIf on the SMP assignment functions: value is set
// when the target type T1 is SMP-assignable and at least one of the operands
// (evaluateVector / evaluateMatrix) needs to be evaluated first.
2768 template<
typename T1 >
2769 struct UseSMPAssign {
2770 enum { value = T1::smpAssignable && ( evaluateVector || evaluateMatrix ) };
// Compile-time predicate that gates the single-precision kernel overloads.
//   T1: target vector type, T2: vector operand type, T3: matrix operand type, T4: scalar type.
// NOTE(review): the opening of the enum (the line before the first visible condition) is
// not part of this excerpt, so additional leading conditions may exist.
2779 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2780 struct UseSinglePrecisionKernel {
// Visible conditions: mutable data access on the target, const data access on both
// operands, a non-diagonal matrix, all three types vectorizable, float element types
// throughout, and a non-complex scalar type.
2782 HasMutableDataAccess<T1>::value &&
2783 HasConstDataAccess<T2>::value &&
2784 HasConstDataAccess<T3>::value &&
2785 !IsDiagonal<T3>::value &&
2786 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2787 IsFloat<typename T1::ElementType>::value &&
2788 IsFloat<typename T2::ElementType>::value &&
2789 IsFloat<typename T3::ElementType>::value &&
2790 !IsComplex<T4>::value };
// Compile-time predicate that gates the double-precision kernel overloads.
//   T1: target vector type, T2: vector operand type, T3: matrix operand type, T4: scalar type.
// NOTE(review): the opening of the enum is not part of this excerpt, so additional
// leading conditions may exist.
2799 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2800 struct UseDoublePrecisionKernel {
// Visible conditions: mutable data access on the target, const data access on both
// operands, a non-diagonal matrix, all three types vectorizable, double element types
// throughout, and a non-complex scalar type.
2802 HasMutableDataAccess<T1>::value &&
2803 HasConstDataAccess<T2>::value &&
2804 HasConstDataAccess<T3>::value &&
2805 !IsDiagonal<T3>::value &&
2806 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2807 IsDouble<typename T1::ElementType>::value &&
2808 IsDouble<typename T2::ElementType>::value &&
2809 IsDouble<typename T3::ElementType>::value &&
2810 !IsComplex<T4>::value };
// Compile-time predicate that gates the single-precision complex kernel overloads.
//   T1: target vector type, T2: vector operand type, T3: matrix operand type.
// Unlike the real-valued predicates in this class, no scalar type parameter is taken.
// NOTE(review): the opening of the enum is not part of this excerpt, so additional
// leading conditions may exist.
2819 template<
typename T1,
typename T2,
typename T3 >
2820 struct UseSinglePrecisionComplexKernel {
// Element type that all three operands must share.
2821 typedef complex<float> Type;
// Visible conditions: mutable data access on the target, const data access on both
// operands, a non-diagonal matrix, all three types vectorizable, and all element types
// identical to complex<float>.
2823 HasMutableDataAccess<T1>::value &&
2824 HasConstDataAccess<T2>::value &&
2825 HasConstDataAccess<T3>::value &&
2826 !IsDiagonal<T3>::value &&
2827 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2828 IsSame<typename T1::ElementType,Type>::value &&
2829 IsSame<typename T2::ElementType,Type>::value &&
2830 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate that gates the double-precision complex kernel overloads.
//   T1: target vector type, T2: vector operand type, T3: matrix operand type.
// NOTE(review): the opening of the enum is not part of this excerpt, so additional
// leading conditions may exist.
2839 template<
typename T1,
typename T2,
typename T3 >
2840 struct UseDoublePrecisionComplexKernel {
// Element type that all three operands must share.
2841 typedef complex<double> Type;
// Visible conditions: mutable data access on the target, const data access on both
// operands, a non-diagonal matrix, all three types vectorizable, and all element types
// identical to complex<double>.
2843 HasMutableDataAccess<T1>::value &&
2844 HasConstDataAccess<T2>::value &&
2845 HasConstDataAccess<T3>::value &&
2846 !IsDiagonal<T3>::value &&
2847 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2848 IsSame<typename T1::ElementType,Type>::value &&
2849 IsSame<typename T2::ElementType,Type>::value &&
2850 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate that selects the non-BLAS default kernel.
//   T1: target vector type, T2: vector operand type, T3: matrix operand type, T4: scalar type.
2858 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2859 struct UseDefaultKernel {
// True when BLAZE_BLAS_MODE is off, or when none of the four precision-specific
// predicates (single, double, single complex, double complex) applies.
2860 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
2861 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2862 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2863 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate that selects the hand-vectorized small/large kernels over the
// plain default kernel.
//   T1: target vector type, T2: vector operand type, T3: matrix operand type, T4: scalar type.
2872 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2873 struct UseVectorizedDefaultKernel {
// Requires a non-diagonal matrix, vectorizable operands, one common element type for
// target, operands and scalar, and intrinsic support for addition and multiplication
// of that element type.
2874 enum { value = !IsDiagonal<T3>::value &&
2875 T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2876 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2877 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2878 IsSame<typename T1::ElementType,T4>::value &&
2879 IntrinsicTrait<typename T1::ElementType>::addition &&
2880 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this expression: the vector/matrix product VMM scaled by a scalar of type ST.
2886 typedef DVecScalarMultExpr<VMM,ST,true>
This;
// Result type, obtained from MultTrait for RES and the scalar type (RES is declared on a
// line not shown in this excerpt).
2887 typedef typename MultTrait<RES,ST>::Type
ResultType;
// SIMD type associated with ElementType through IntrinsicTrait.
2890 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The left operand of the scaling is the (const) vector/matrix product expression.
2895 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
// Type used for the vector operand inside the kernels; presumably `const VRT` when
// evaluateVector is set and VCT otherwise — confirm against SelectType.
2901 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
LT;
// Type used for the matrix operand inside the kernels; presumably `const MRT` when
// evaluateMatrix is set and MCT otherwise — confirm against SelectType.
2904 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
RT;
// The expression is vectorizable for a non-diagonal matrix with vectorizable operands,
// identical vector, matrix and scalar element types, and intrinsic addition and
// multiplication support for that element type.
2909 enum { vectorizable = !IsDiagonal<MT>::value &&
2910 VT::vectorizable && MT::vectorizable &&
2911 IsSame<VET,MET>::value &&
2912 IsSame<VET,ST>::value &&
2913 IntrinsicTrait<VET>::addition &&
2914 IntrinsicTrait<VET>::multiplication };
// SMP assignment is possible only if neither operand needs evaluation and both operand
// types are themselves SMP-assignable.
2917 enum { smpAssignable = !evaluateVector && VT::smpAssignable &&
2918 !evaluateMatrix && MT::smpAssignable };
// Constructor taking the vector/matrix product expression and the scalar factor.
// NOTE(review): the initializer list and body are not part of this excerpt; presumably
// the arguments are stored in vector_ and scalar_ — confirm.
2927 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Element access: returns the element of the wrapped product at `index`, multiplied by
// the stored scalar. Any bounds check sits on a line not shown in this excerpt.
2939 inline ReturnType
operator[](
size_t index )
const {
2941 return vector_[index] * scalar_;
// Returns the size of the expression, which is the size of the wrapped product.
2950 inline size_t size()
const {
2951 return vector_.size();
// Alias query: delegates to the wrapped product expression, passing `alias` through
// unchanged. The scalar operand is not consulted.
2981 template<
typename T >
2982 inline bool canAlias(
const T* alias )
const {
2983 return vector_.canAlias( alias );
// Alias query: delegates to the wrapped product expression, passing `alias` through
// unchanged. The scalar operand is not consulted.
2993 template<
typename T >
2994 inline bool isAliased(
const T* alias )
const {
2995 return vector_.isAliased( alias );
3005 return vector_.isAligned();
3015 typename VMM::RightOperand A( vector_.rightOperand() );
3017 ( IsComputation<MT>::value && !evaluateMatrix ) ||
// Wrapped vector/matrix product expression; queried by operator[], size(), canAlias()
// and isAliased().
3025 LeftOperand vector_;
// Scalar factor multiplied onto each element in operator[].
3026 RightOperand scalar_;
// Assignment of the scaled vector/matrix product to a dense vector.
//   lhs: target dense vector, rhs: the scaled product expression.
// NOTE(review): the bodies of the two size checks and the definitions of `x` and `A`
// are on lines not shown in this excerpt.
3041 template<
typename VT1
3043 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Pull the vector and matrix operands out of the wrapped product.
3049 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3050 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special handling for a matrix with zero rows and for one with zero columns.
3052 if( right.rows() == 0UL ) {
3056 else if( right.columns() == 0UL ) {
// Dispatch to the kernel selection with the target, both operands and the scalar.
3068 DVecScalarMultExpr::selectAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Chooses between the small kernel and the BLAS kernel for the scaled assignment.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3083 template<
typename VT1
3087 static inline void selectAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// The small kernel is taken for diagonal matrices and for matrix computations that are
// not evaluated. NOTE(review): a further condition and the `else` that presumably
// guards the BLAS call are on lines not shown in this excerpt.
3089 if( ( IsDiagonal<MT1>::value ) ||
3090 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3092 selectSmallAssignKernel( y, x, A, scalar );
3094 selectBlasAssignKernel( y, x, A, scalar );
// Default (non-specialised) assignment kernel.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3112 template<
typename VT1
3116 static inline void selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Rebuilds the expression x * A * scalar and hands it to y's own assign().
3118 y.assign( x * A * scalar );
// Fallback overload of selectSmallAssignKernel. DisableIf removes it whenever
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds, so it is only chosen when the
// vectorized small kernel cannot be used.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3136 template<
typename VT1
3140 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3141 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Forward the unchanged arguments to the default kernel.
3143 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized overload of selectSmallAssignKernel, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds. Columns of A are processed in
// groups of 8, 4, 3 and 2, followed by a single-column section. Per group, one SIMD
// accumulator per column gathers x.load(i) * A.load(i,column) over the row range, and
// the horizontal sum times `scalar` is then written to y.
// NOTE(review): braces, the declaration of j and the initial state of the accumulators
// are not visible in this excerpt.
3162 template<
typename VT1
3166 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3167 selectSmallAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3169 typedef IntrinsicTrait<ElementType> IT;
// Row and column counts of the matrix operand.
3171 const size_t M( A.rows() );
3172 const size_t N( A.columns() );
// --- Eight columns per pass ---
3176 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with size_t(-IT::size);
// this rounds down to a multiple of IT::size provided IT::size is a power of two.
3178 const size_t ibegin( ( IsLower<MT1>::value )
3179 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
3181 const size_t iend( ( IsUpper<MT1>::value )
3182 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// One accumulator per column of the group.
3186 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Walk the rows in steps of IT::size; x is loaded once per step and reused for all columns.
3188 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3189 const IntrinsicType x1( x.load(i) );
3190 xmm1 = xmm1 + x1 * A.load(i,j );
3191 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3192 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3193 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3194 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3195 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3196 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3197 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator, scale it and overwrite the matching element of y.
3200 y[j ] =
sum( xmm1 ) * scalar;
3201 y[j+1UL] =
sum( xmm2 ) * scalar;
3202 y[j+2UL] =
sum( xmm3 ) * scalar;
3203 y[j+3UL] =
sum( xmm4 ) * scalar;
3204 y[j+4UL] =
sum( xmm5 ) * scalar;
3205 y[j+5UL] =
sum( xmm6 ) * scalar;
3206 y[j+6UL] =
sum( xmm7 ) * scalar;
3207 y[j+7UL] =
sum( xmm8 ) * scalar;
// --- Four columns per pass (upper bound is j+4, or j+3 if strictly upper) ---
3210 for( ; (j+4UL) <= N; j+=4UL )
3212 const size_t ibegin( ( IsLower<MT1>::value )
3213 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3215 const size_t iend( ( IsUpper<MT1>::value )
3216 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3220 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3222 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3223 const IntrinsicType x1( x.load(i) );
3224 xmm1 = xmm1 + x1 * A.load(i,j );
3225 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3226 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3227 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3230 y[j ] =
sum( xmm1 ) * scalar;
3231 y[j+1UL] =
sum( xmm2 ) * scalar;
3232 y[j+2UL] =
sum( xmm3 ) * scalar;
3233 y[j+3UL] =
sum( xmm4 ) * scalar;
// --- Three columns per pass (upper bound is j+3, or j+2 if strictly upper) ---
3236 for( ; (j+3UL) <= N; j+=3UL )
3238 const size_t ibegin( ( IsLower<MT1>::value )
3239 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3241 const size_t iend( ( IsUpper<MT1>::value )
3242 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3246 IntrinsicType xmm1, xmm2, xmm3;
3248 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3249 const IntrinsicType x1( x.load(i) );
3250 xmm1 = xmm1 + x1 * A.load(i,j );
3251 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3252 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3255 y[j ] =
sum( xmm1 ) * scalar;
3256 y[j+1UL] =
sum( xmm2 ) * scalar;
3257 y[j+2UL] =
sum( xmm3 ) * scalar;
// --- Two columns per pass (upper bound is j+2, or j+1 if strictly upper) ---
3260 for( ; (j+2UL) <= N; j+=2UL )
3262 const size_t ibegin( ( IsLower<MT1>::value )
3263 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3265 const size_t iend( ( IsUpper<MT1>::value )
3266 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3270 IntrinsicType xmm1, xmm2;
3272 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3273 const IntrinsicType x1( x.load(i) );
3274 xmm1 = xmm1 + x1 * A.load(i,j );
3275 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3278 y[j ] =
sum( xmm1 ) * scalar;
3279 y[j+1UL] =
sum( xmm2 ) * scalar;
// --- Single-column section (its guarding condition and the declaration of xmm1 are
// not visible in this excerpt); upper bound is j+1, or j if strictly upper ---
3284 const size_t ibegin( ( IsLower<MT1>::value )
3285 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3287 const size_t iend( ( IsUpper<MT1>::value )
3288 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
3294 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3295 xmm1 = xmm1 + A.load(i,j) * x.load(i);
3298 y[j] =
sum( xmm1 ) * scalar;
// Fallback overload of selectLargeAssignKernel. DisableIf removes it whenever
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds, so it is only chosen when the
// vectorized large kernel cannot be used.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3317 template<
typename VT1
3321 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3322 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Forward the unchanged arguments to the default kernel.
3324 selectDefaultAssignKernel( y, x, A, scalar );
// Vectorized overload of selectLargeAssignKernel, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds. Columns of A are processed in
// groups of 8, 4 and 2, followed by a single-column section; for every column j the
// horizontal sum of the SIMD products of x and the entries of A is added to y[j].
// NOTE(review): `scalar` is not used on any line visible in this excerpt, and y is only
// ever updated with +=. The omitted lines presumably initialise y and apply the
// scaling after each column group — confirm against the full file. Braces, the row
// loops and the definitions of j, i, i1, i2, i3 are likewise not visible.
3343 template<
typename VT1
3347 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3348 selectLargeAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3350 typedef IntrinsicTrait<ElementType> IT;
// Row and column counts of the matrix operand.
3352 const size_t M( A.rows() );
3353 const size_t N( A.columns() );
// --- Eight columns per pass ---
3359 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with size_t(-IT::size);
// this rounds down to a multiple of IT::size provided IT::size is a power of two.
3361 const size_t ibegin( ( IsLower<MT1>::value )
3362 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
3364 const size_t iend( ( IsUpper<MT1>::value )
3365 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// Four SIMD loads of x (offsets i, i1, i2, i3) combined with the matching loads of A.
// presumably i1/i2/i3 are successive IT::size-strided offsets from i — TODO confirm.
3375 const IntrinsicType x1( x.load(i ) );
3376 const IntrinsicType x2( x.load(i1) );
3377 const IntrinsicType x3( x.load(i2) );
3378 const IntrinsicType x4( x.load(i3) );
3379 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3380 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3381 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3382 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
3383 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) );
3384 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) );
3385 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) );
3386 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) );
// Same update with two SIMD loads of x per step.
3391 const IntrinsicType x1( x.load(i ) );
3392 const IntrinsicType x2( x.load(i1) );
3393 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3394 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3395 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3396 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
3397 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) );
3398 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) );
3399 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) );
3400 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) );
// Same update with a single SIMD load of x per step.
3404 const IntrinsicType x1( x.load(i) );
3405 y[j ] +=
sum( x1 * A.load(i,j ) );
3406 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3407 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3408 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
3409 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) );
3410 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) );
3411 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) );
3412 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) );
// --- Four columns per pass (upper bound is j+4, or j+3 if strictly upper) ---
3425 for( ; (j+4UL) <= N; j+=4UL )
3427 const size_t ibegin( ( IsLower<MT1>::value )
3428 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3430 const size_t iend( ( IsUpper<MT1>::value )
3431 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
// Four SIMD loads of x per step.
3441 const IntrinsicType x1( x.load(i ) );
3442 const IntrinsicType x2( x.load(i1) );
3443 const IntrinsicType x3( x.load(i2) );
3444 const IntrinsicType x4( x.load(i3) );
3445 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3446 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
3447 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) );
3448 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) );
// Two SIMD loads of x per step.
3453 const IntrinsicType x1( x.load(i ) );
3454 const IntrinsicType x2( x.load(i1) );
3455 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3456 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
3457 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) );
3458 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) );
// One SIMD load of x per step.
3462 const IntrinsicType x1( x.load(i) );
3463 y[j ] +=
sum( x1 * A.load(i,j ) );
3464 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
3465 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) );
3466 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) );
// --- Two columns per pass (upper bound is j+2, or j+1 if strictly upper) ---
3475 for( ; (j+2UL) <= N; j+=2UL )
3477 const size_t ibegin( ( IsLower<MT1>::value )
3478 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3480 const size_t iend( ( IsUpper<MT1>::value )
3481 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
// Four SIMD loads of x per step.
3491 const IntrinsicType x1( x.load(i ) );
3492 const IntrinsicType x2( x.load(i1) );
3493 const IntrinsicType x3( x.load(i2) );
3494 const IntrinsicType x4( x.load(i3) );
3495 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) );
3496 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) );
// Two SIMD loads of x per step.
3501 const IntrinsicType x1( x.load(i ) );
3502 const IntrinsicType x2( x.load(i1) );
3503 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) );
3504 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) );
// One SIMD load of x per step.
3508 const IntrinsicType x1( x.load(i) );
3509 y[j ] +=
sum( x1 * A.load(i,j ) );
3510 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) );
// --- Single-column section (its guarding condition is not visible in this excerpt) ---
// Upper bound is j+1, or j if strictly upper.
3519 const size_t ibegin( ( IsLower<MT1>::value )
3520 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3522 const size_t iend( ( IsUpper<MT1>::value )
3523 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
// Four SIMD loads of x per step.
3533 const IntrinsicType x1( x.load(i ) );
3534 const IntrinsicType x2( x.load(i1) );
3535 const IntrinsicType x3( x.load(i2) );
3536 const IntrinsicType x4( x.load(i3) );
3537 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) );
// Two SIMD loads of x per step.
3542 const IntrinsicType x1( x.load(i ) );
3543 const IntrinsicType x2( x.load(i1) );
3544 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) );
// One SIMD load of x per step.
3548 const IntrinsicType x1( x.load(i) );
3549 y[j] +=
sum( x1 * A.load(i,j) );
// Overload of selectBlasAssignKernel that is enabled when
// UseDefaultKernel<VT1,VT2,MT1,ST2> holds, i.e. when no BLAS-specific overload applies.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3570 template<
typename VT1
3574 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3575 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// No BLAS routine is used on this path; defer to the large (non-BLAS) kernel.
3577 selectLargeAssignKernel( y, x, A, scalar );
// Single-precision overload of selectBlasAssignKernel, enabled by
// UseSinglePrecisionKernel<VT1,VT2,MT1,ST2>.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3596 template<
typename VT1
3600 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3601 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Triangular matrices: strmv works directly on y, with lower/upper chosen via
// IsLower<MT1>. How y is prepared beforehand is on a line not shown in this excerpt.
3603 if( IsTriangular<MT1>::value ) {
3605 strmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise sgemv is called with `scalar` and 0.0F.
// NOTE(review): presumably alpha = scalar and beta = 0, giving y = scalar * x*A — confirm.
3608 sgemv( y, x, A, scalar, 0.0F );
// Double-precision overload of selectBlasAssignKernel, enabled by
// UseDoublePrecisionKernel<VT1,VT2,MT1,ST2>.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3629 template<
typename VT1
3633 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
3634 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Triangular matrices: dtrmv works directly on y, with lower/upper chosen via
// IsLower<MT1>. How y is prepared beforehand is on a line not shown in this excerpt.
3636 if( IsTriangular<MT1>::value ) {
3638 dtrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise dgemv is called with `scalar` and 0.0.
// NOTE(review): presumably alpha = scalar and beta = 0, giving y = scalar * x*A — confirm.
3641 dgemv( y, x, A, scalar, 0.0 );
// Single-precision complex overload of selectBlasAssignKernel, enabled by
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1>.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3663 template<
typename VT1
3667 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3668 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Triangular matrices: ctrmv works directly on y, with lower/upper chosen via
// IsLower<MT1>. How y is prepared beforehand is on a line not shown in this excerpt.
3670 if( IsTriangular<MT1>::value ) {
3672 ctrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise cgemv is called with (scalar,0) and (0,0); the scalar is used as the real
// part of a complex<float>.
// NOTE(review): presumably alpha and beta, giving y = scalar * x*A — confirm.
3675 cgemv( y, x, A, complex<float>( scalar, 0.0F ), complex<float>( 0.0F, 0.0F ) );
// Double-precision complex overload of selectBlasAssignKernel, enabled by
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1>.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3697 template<
typename VT1
3701 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
3702 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Triangular matrices: ztrmv works directly on y, with lower/upper chosen via
// IsLower<MT1>. How y is prepared beforehand is on a line not shown in this excerpt.
3704 if( IsTriangular<MT1>::value ) {
3706 ztrmv( y, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
// Otherwise zgemv is called with (scalar,0) and (0,0); the scalar is used as the real
// part of a complex<double>.
// NOTE(review): presumably alpha and beta, giving y = scalar * x*A — confirm.
3709 zgemv( y, x, A, complex<double>( scalar, 0.0 ), complex<double>( 0.0, 0.0 ) );
// Assignment of the scaled vector/matrix product to a sparse vector.
//   lhs: target sparse vector, rhs: the scaled product expression.
3727 template<
typename VT1
3729 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Evaluate the expression serially into a temporary of the result type; the step that
// transfers `tmp` into lhs is on a line not shown in this excerpt.
3739 const ResultType tmp(
serial( rhs ) );
// Addition assignment of the scaled vector/matrix product to a dense vector.
//   lhs: target dense vector, rhs: the scaled product expression.
// NOTE(review): the body of the size check and the definitions of `x` and `A` are on
// lines not shown in this excerpt.
3756 template<
typename VT1
3758 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Pull the vector and matrix operands out of the wrapped product.
3764 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
3765 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// A matrix with zero rows or zero columns is handled separately.
3767 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Dispatch to the kernel selection with the target, both operands and the scalar.
3779 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Chooses between the small kernel and the BLAS kernel for the scaled addition assignment.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3794 template<
typename VT1
3798 static inline void selectAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// The small kernel is taken for diagonal matrices and for matrix computations that are
// not evaluated. NOTE(review): a further condition and the `else` that presumably
// guards the BLAS call are on lines not shown in this excerpt.
3800 if( ( IsDiagonal<MT1>::value ) ||
3801 ( IsComputation<MT>::value && !evaluateMatrix ) ||
3803 selectSmallAddAssignKernel( y, x, A, scalar );
3805 selectBlasAddAssignKernel( y, x, A, scalar );
// Default (non-specialised) addition assignment kernel.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3823 template<
typename VT1
3827 static inline void selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Rebuilds the expression x * A * scalar and hands it to y's own addAssign().
3829 y.addAssign( x * A * scalar );
// Fallback overload of selectSmallAddAssignKernel. DisableIf removes it whenever
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds, so it is only chosen when the
// vectorized small kernel cannot be used.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
3847 template<
typename VT1
3851 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3852 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Forward the unchanged arguments to the default kernel.
3854 selectDefaultAddAssignKernel( y, x, A, scalar );
// Vectorized overload of selectSmallAddAssignKernel, enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds. Columns of A are processed in
// groups of 8, 4, 3 and 2, followed by a single-column section. Per group, one SIMD
// accumulator per column gathers x.load(i) * A.load(i,column) over the row range, and
// the horizontal sum times `scalar` is then added to y.
// NOTE(review): braces, the declaration of j and the initial state of the accumulators
// are not visible in this excerpt.
3873 template<
typename VT1
3877 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
3878 selectSmallAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
3880 typedef IntrinsicTrait<ElementType> IT;
// Row and column counts of the matrix operand.
3882 const size_t M( A.rows() );
3883 const size_t N( A.columns() );
// --- Eight columns per pass ---
3887 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with size_t(-IT::size);
// this rounds down to a multiple of IT::size provided IT::size is a power of two.
3889 const size_t ibegin( ( IsLower<MT1>::value )
3890 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: rows end at j+8 (j+7 if strictly upper).
3892 const size_t iend( ( IsUpper<MT1>::value )
3893 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// One accumulator per column of the group.
3897 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Walk the rows in steps of IT::size; x is loaded once per step and reused for all columns.
3899 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3900 const IntrinsicType x1( x.load(i) );
3901 xmm1 = xmm1 + x1 * A.load(i,j );
3902 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3903 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3904 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3905 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
3906 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
3907 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
3908 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator, scale it and add it to the matching element of y.
3911 y[j ] +=
sum( xmm1 ) * scalar;
3912 y[j+1UL] +=
sum( xmm2 ) * scalar;
3913 y[j+2UL] +=
sum( xmm3 ) * scalar;
3914 y[j+3UL] +=
sum( xmm4 ) * scalar;
3915 y[j+4UL] +=
sum( xmm5 ) * scalar;
3916 y[j+5UL] +=
sum( xmm6 ) * scalar;
3917 y[j+6UL] +=
sum( xmm7 ) * scalar;
3918 y[j+7UL] +=
sum( xmm8 ) * scalar;
// --- Four columns per pass (upper bound is j+4, or j+3 if strictly upper) ---
3921 for( ; (j+4UL) <= N; j+=4UL )
3923 const size_t ibegin( ( IsLower<MT1>::value )
3924 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3926 const size_t iend( ( IsUpper<MT1>::value )
3927 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
3931 IntrinsicType xmm1, xmm2, xmm3, xmm4;
3933 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3934 const IntrinsicType x1( x.load(i) );
3935 xmm1 = xmm1 + x1 * A.load(i,j );
3936 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3937 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3938 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
3941 y[j ] +=
sum( xmm1 ) * scalar;
3942 y[j+1UL] +=
sum( xmm2 ) * scalar;
3943 y[j+2UL] +=
sum( xmm3 ) * scalar;
3944 y[j+3UL] +=
sum( xmm4 ) * scalar;
// --- Three columns per pass (upper bound is j+3, or j+2 if strictly upper) ---
3947 for( ; (j+3UL) <= N; j+=3UL )
3949 const size_t ibegin( ( IsLower<MT1>::value )
3950 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3952 const size_t iend( ( IsUpper<MT1>::value )
3953 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
3957 IntrinsicType xmm1, xmm2, xmm3;
3959 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3960 const IntrinsicType x1( x.load(i) );
3961 xmm1 = xmm1 + x1 * A.load(i,j );
3962 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3963 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
3966 y[j ] +=
sum( xmm1 ) * scalar;
3967 y[j+1UL] +=
sum( xmm2 ) * scalar;
3968 y[j+2UL] +=
sum( xmm3 ) * scalar;
// --- Two columns per pass (upper bound is j+2, or j+1 if strictly upper) ---
3971 for( ; (j+2UL) <= N; j+=2UL )
3973 const size_t ibegin( ( IsLower<MT1>::value )
3974 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3976 const size_t iend( ( IsUpper<MT1>::value )
3977 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
3981 IntrinsicType xmm1, xmm2;
3983 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
3984 const IntrinsicType x1( x.load(i) );
3985 xmm1 = xmm1 + x1 * A.load(i,j );
3986 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
3989 y[j ] +=
sum( xmm1 ) * scalar;
3990 y[j+1UL] +=
sum( xmm2 ) * scalar;
// --- Single-column section (its guarding condition and the declaration of xmm1 are
// not visible in this excerpt); upper bound is j+1, or j if strictly upper ---
3995 const size_t ibegin( ( IsLower<MT1>::value )
3996 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
3998 const size_t iend( ( IsUpper<MT1>::value )
3999 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4005 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
4006 xmm1 = xmm1 + A.load(i,j) * x.load(i);
4009 y[j] +=
sum( xmm1 ) * scalar;
// Fallback overload of selectLargeAddAssignKernel. DisableIf removes it whenever
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds, so it is only chosen when the
// vectorized large kernel cannot be used.
//   y: target vector, x: vector operand, A: matrix operand, scalar: scaling factor.
4028 template<
typename VT1
4032 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4033 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Forward the unchanged arguments to the default kernel.
4035 selectDefaultAddAssignKernel( y, x, A, scalar );
// Large-kernel addition assignment, vectorized overload (enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds). The visible statements add
// sum( x.load(..) * A.load(..,j) ) * scalar onto y[j] for the columns j of A.
// Columns are handled in blocks of 8, then 4, then 2, followed by a section that
// updates a single y[j].
// NOTE(review): this listing omits several original lines (the declaration of j,
// the row loops that define i, i1, i2 and i3, the non-triangular branches of
// ibegin/iend, and all braces). The comments below describe only what is shown.
4054 template<
typename VT1
4058 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4059 selectLargeAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4061 typedef IntrinsicTrait<ElementType> IT;
4063 const size_t M( A.rows() );
4064 const size_t N( A.columns() );
// --- Blocks of eight columns: y[j] .. y[j+7] ---
4068 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with
// size_t(-IT::size) -- presumably rounding down to a multiple of the SIMD
// width; this assumes IT::size is a power of two (TODO confirm).
4070 const size_t ibegin( ( IsLower<MT1>::value )
4071 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+8 (j+7 if strictly upper).
4073 const size_t iend( ( IsUpper<MT1>::value )
4074 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// Update with four loaded vectors of x (row offsets i, i1, i2, i3).
4084 const IntrinsicType x1( x.load(i ) );
4085 const IntrinsicType x2( x.load(i1) );
4086 const IntrinsicType x3( x.load(i2) );
4087 const IntrinsicType x4( x.load(i3) );
4088 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4089 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4090 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4091 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4092 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4093 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4094 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4095 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
// Update with two loaded vectors of x (row offsets i, i1).
4100 const IntrinsicType x1( x.load(i ) );
4101 const IntrinsicType x2( x.load(i1) );
4102 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4103 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4104 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4105 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4106 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4107 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4108 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4109 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
// Update with a single loaded vector of x (row offset i).
4113 const IntrinsicType x1( x.load(i) );
4114 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4115 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4116 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4117 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4118 y[j+4UL] +=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4119 y[j+5UL] +=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4120 y[j+6UL] +=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4121 y[j+7UL] +=
sum( x1 * A.load(i,j+7UL) ) * scalar;
// --- Blocks of four columns: y[j] .. y[j+3] ---
4125 for( ; (j+4UL) <= N; j+=4UL )
4127 const size_t ibegin( ( IsLower<MT1>::value )
4128 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+4 (j+3 if strictly upper).
4130 const size_t iend( ( IsUpper<MT1>::value )
4131 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
// Four loaded vectors of x.
4141 const IntrinsicType x1( x.load(i ) );
4142 const IntrinsicType x2( x.load(i1) );
4143 const IntrinsicType x3( x.load(i2) );
4144 const IntrinsicType x4( x.load(i3) );
4145 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4146 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4147 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4148 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
// Two loaded vectors of x.
4153 const IntrinsicType x1( x.load(i ) );
4154 const IntrinsicType x2( x.load(i1) );
4155 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4156 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4157 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4158 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
// One loaded vector of x.
4162 const IntrinsicType x1( x.load(i) );
4163 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4164 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4165 y[j+2UL] +=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4166 y[j+3UL] +=
sum( x1 * A.load(i,j+3UL) ) * scalar;
// --- Blocks of two columns: y[j], y[j+1] ---
4170 for( ; (j+2UL) <= N; j+=2UL )
4172 const size_t ibegin( ( IsLower<MT1>::value )
4173 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+2 (j+1 if strictly upper).
4175 const size_t iend( ( IsUpper<MT1>::value )
4176 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
// Four loaded vectors of x.
4186 const IntrinsicType x1( x.load(i ) );
4187 const IntrinsicType x2( x.load(i1) );
4188 const IntrinsicType x3( x.load(i2) );
4189 const IntrinsicType x4( x.load(i3) );
4190 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4191 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
// Two loaded vectors of x.
4196 const IntrinsicType x1( x.load(i ) );
4197 const IntrinsicType x2( x.load(i1) );
4198 y[j ] +=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4199 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
// One loaded vector of x.
4203 const IntrinsicType x1( x.load(i) );
4204 y[j ] +=
sum( x1 * A.load(i,j ) ) * scalar;
4205 y[j+1UL] +=
sum( x1 * A.load(i,j+1UL) ) * scalar;
// --- Single column: only y[j] is updated (the guarding condition is not shown) ---
4211 const size_t ibegin( ( IsLower<MT1>::value )
4212 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+1 (j if strictly upper).
4214 const size_t iend( ( IsUpper<MT1>::value )
4215 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
// Four loaded vectors of x.
4225 const IntrinsicType x1( x.load(i ) );
4226 const IntrinsicType x2( x.load(i1) );
4227 const IntrinsicType x3( x.load(i2) );
4228 const IntrinsicType x4( x.load(i3) );
4229 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
// Two loaded vectors of x.
4234 const IntrinsicType x1( x.load(i ) );
4235 const IntrinsicType x2( x.load(i1) );
4236 y[j] +=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
// One loaded vector of x.
4240 const IntrinsicType x1( x.load(i) );
4241 y[j] +=
sum( x1 * A.load(i,j) ) * scalar;
// BLAS-kernel entry point for addition assignment, default overload (enabled via
// UseDefaultKernel<VT1,VT2,MT1,ST2>): forwards y, x, A and scalar unchanged to
// selectLargeAddAssignKernel.
4262 template<
typename VT1
4266 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4267 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4269 selectLargeAddAssignKernel( y, x, A, scalar );
// BLAS-based addition assignment, enabled via UseSinglePrecisionKernel<VT1,VT2,MT1,ST2>.
// Triangular A: strmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The sgemv call passes scalar and 1.0F as trailing arguments -- presumably the
// non-triangular branch (the `else` is not shown; TODO confirm).
4288 template<
typename VT1
4292 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
4293 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4295 if( IsTriangular<MT1>::value ) {
4297 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4301 sgemv( y, x, A, scalar, 1.0F );
// BLAS-based addition assignment, enabled via UseDoublePrecisionKernel<VT1,VT2,MT1,ST2>.
// Triangular A: dtrmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The dgemv call passes scalar and 1.0 as trailing arguments -- presumably the
// non-triangular branch (the `else` is not shown; TODO confirm).
4322 template<
typename VT1
4326 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
4327 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4329 if( IsTriangular<MT1>::value ) {
4331 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4335 dgemv( y, x, A, scalar, 1.0 );
// BLAS-based addition assignment, enabled via UseSinglePrecisionComplexKernel<VT1,VT2,MT1>.
// Triangular A: ctrmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The cgemv call wraps the scalar as complex<float>( scalar, 0.0F ) and passes
// complex<float>( 1.0F, 0.0F ) as the last argument -- presumably in the
// non-triangular branch (the `else` is not shown; TODO confirm).
4357 template<
typename VT1
4361 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
4362 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4364 if( IsTriangular<MT1>::value ) {
4366 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4370 cgemv( y, x, A, complex<float>( scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based addition assignment, enabled via UseDoublePrecisionComplexKernel<VT1,VT2,MT1>.
// Triangular A: ztrmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The zgemv call wraps the scalar as complex<double>( scalar, 0.0 ) and passes
// complex<double>( 1.0, 0.0 ) as the last argument -- presumably in the
// non-triangular branch (the `else` is not shown; TODO confirm).
4392 template<
typename VT1
4396 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
4397 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4399 if( IsTriangular<MT1>::value ) {
4401 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4405 zgemv( y, x, A, complex<double>( scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Subtraction assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// Fetches the left and right operands from rhs.vector_, tests whether the right
// operand has zero rows or zero columns (the branch body is not shown --
// presumably an early return), and dispatches to selectSubAssignKernel with
// ~lhs, x, A and rhs.scalar_.
// NOTE(review): the creation of x and A from left/right is on lines missing from
// this listing.
4427 template<
typename VT1
4429 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
4435 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
4436 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
4438 if( right.rows() == 0UL || right.columns() == 0UL ) {
4450 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Kernel dispatcher for the subtraction assignment. The small kernel is chosen
// when MT1 is diagonal, or when MT is a computation that is not evaluated
// (IsComputation<MT> && !evaluateMatrix), or when a third condition holds that
// is on a line missing from this listing. The BLAS kernel call that follows is
// presumably the `else` branch (not shown; TODO confirm).
4465 template<
typename VT1
4469 static inline void selectSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4471 if( ( IsDiagonal<MT1>::value ) ||
4472 ( IsComputation<MT>::value && !evaluateMatrix ) ||
4474 selectSmallSubAssignKernel( y, x, A, scalar );
4476 selectBlasSubAssignKernel( y, x, A, scalar );
// Default subtraction assignment kernel: delegates to y.subAssign() with the
// expression x * A * scalar.
4494 template<
typename VT1
4498 static inline void selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4500 y.subAssign( x * A * scalar );
// Small-kernel subtraction assignment, non-vectorized overload. It is selected
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does not hold (DisableIf) and
// forwards y, x, A and scalar to selectDefaultSubAssignKernel.
4518 template<
typename VT1
4522 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4523 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4525 selectDefaultSubAssignKernel( y, x, A, scalar );
// Small-kernel subtraction assignment, vectorized overload (enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds). For each block of columns
// the row loop accumulates x.load(i) * A.load(i,j+k) into one IntrinsicType
// accumulator per column; afterwards sum( accumulator ) * scalar is subtracted
// from the matching element of y. Columns are handled in blocks of 8, 4, 3 and 2,
// followed by a section that updates a single y[j].
// NOTE(review): this listing omits the declaration of j, the non-triangular
// branches of ibegin/iend and all braces. The comments describe only what is shown.
4544 template<
typename VT1
4548 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4549 selectSmallSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4551 typedef IntrinsicTrait<ElementType> IT;
4553 const size_t M( A.rows() );
4554 const size_t N( A.columns() );
// --- Blocks of eight columns ---
4558 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with
// size_t(-IT::size) -- presumably rounding down to a multiple of the SIMD
// width; this assumes IT::size is a power of two (TODO confirm).
4560 const size_t ibegin( ( IsLower<MT1>::value )
4561 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+8 (j+7 if strictly upper).
4563 const size_t iend( ( IsUpper<MT1>::value )
4564 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// One accumulator per column of the block.
4568 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
4570 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
// x is loaded once per step and reused for all eight columns.
4571 const IntrinsicType x1( x.load(i) );
4572 xmm1 = xmm1 + x1 * A.load(i,j );
4573 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4574 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4575 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4576 xmm5 = xmm5 + x1 * A.load(i,j+4UL);
4577 xmm6 = xmm6 + x1 * A.load(i,j+5UL);
4578 xmm7 = xmm7 + x1 * A.load(i,j+6UL);
4579 xmm8 = xmm8 + x1 * A.load(i,j+7UL);
// Reduce each accumulator, scale it and subtract it from y.
4582 y[j ] -=
sum( xmm1 ) * scalar;
4583 y[j+1UL] -=
sum( xmm2 ) * scalar;
4584 y[j+2UL] -=
sum( xmm3 ) * scalar;
4585 y[j+3UL] -=
sum( xmm4 ) * scalar;
4586 y[j+4UL] -=
sum( xmm5 ) * scalar;
4587 y[j+5UL] -=
sum( xmm6 ) * scalar;
4588 y[j+6UL] -=
sum( xmm7 ) * scalar;
4589 y[j+7UL] -=
sum( xmm8 ) * scalar;
// --- Blocks of four columns ---
4592 for( ; (j+4UL) <= N; j+=4UL )
4594 const size_t ibegin( ( IsLower<MT1>::value )
4595 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+4 (j+3 if strictly upper).
4597 const size_t iend( ( IsUpper<MT1>::value )
4598 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
4602 IntrinsicType xmm1, xmm2, xmm3, xmm4;
4604 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
4605 const IntrinsicType x1( x.load(i) );
4606 xmm1 = xmm1 + x1 * A.load(i,j );
4607 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4608 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4609 xmm4 = xmm4 + x1 * A.load(i,j+3UL);
4612 y[j ] -=
sum( xmm1 ) * scalar;
4613 y[j+1UL] -=
sum( xmm2 ) * scalar;
4614 y[j+2UL] -=
sum( xmm3 ) * scalar;
4615 y[j+3UL] -=
sum( xmm4 ) * scalar;
// --- Blocks of three columns ---
4618 for( ; (j+3UL) <= N; j+=3UL )
4620 const size_t ibegin( ( IsLower<MT1>::value )
4621 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+3 (j+2 if strictly upper).
4623 const size_t iend( ( IsUpper<MT1>::value )
4624 ?( IsStrictlyUpper<MT1>::value ? j+2UL : j+3UL )
4628 IntrinsicType xmm1, xmm2, xmm3;
4630 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
4631 const IntrinsicType x1( x.load(i) );
4632 xmm1 = xmm1 + x1 * A.load(i,j );
4633 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4634 xmm3 = xmm3 + x1 * A.load(i,j+2UL);
4637 y[j ] -=
sum( xmm1 ) * scalar;
4638 y[j+1UL] -=
sum( xmm2 ) * scalar;
4639 y[j+2UL] -=
sum( xmm3 ) * scalar;
// --- Blocks of two columns ---
4642 for( ; (j+2UL) <= N; j+=2UL )
4644 const size_t ibegin( ( IsLower<MT1>::value )
4645 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+2 (j+1 if strictly upper).
4647 const size_t iend( ( IsUpper<MT1>::value )
4648 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
4652 IntrinsicType xmm1, xmm2;
4654 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
4655 const IntrinsicType x1( x.load(i) );
4656 xmm1 = xmm1 + x1 * A.load(i,j );
4657 xmm2 = xmm2 + x1 * A.load(i,j+1UL);
4660 y[j ] -=
sum( xmm1 ) * scalar;
4661 y[j+1UL] -=
sum( xmm2 ) * scalar;
// --- Single column: only y[j] is updated. The guarding condition and the
// declaration of xmm1 are on lines missing from this listing. ---
4666 const size_t ibegin( ( IsLower<MT1>::value )
4667 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+1 (j if strictly upper).
4669 const size_t iend( ( IsUpper<MT1>::value )
4670 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
4676 for(
size_t i=ibegin; i<iend; i+=
IT::size ) {
4677 xmm1 = xmm1 + A.load(i,j) * x.load(i);
4680 y[j] -=
sum( xmm1 ) * scalar;
// Large-kernel subtraction assignment, non-vectorized overload. It is selected
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does not hold (DisableIf) and
// forwards y, x, A and scalar to selectDefaultSubAssignKernel.
4699 template<
typename VT1
4703 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4704 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4706 selectDefaultSubAssignKernel( y, x, A, scalar );
// Large-kernel subtraction assignment, vectorized overload (enabled when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds). The visible statements
// subtract sum( x.load(..) * A.load(..,j) ) * scalar from y[j] for the columns j
// of A. Columns are handled in blocks of 8, then 4, then 2, followed by a section
// that updates a single y[j].
// NOTE(review): this listing omits several original lines (the declaration of j,
// the row loops that define i, i1, i2 and i3, the non-triangular branches of
// ibegin/iend, and all braces). The comments below describe only what is shown.
4725 template<
typename VT1
4729 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4730 selectLargeSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4732 typedef IntrinsicTrait<ElementType> IT;
4734 const size_t M( A.rows() );
4735 const size_t N( A.columns() );
// --- Blocks of eight columns: y[j] .. y[j+7] ---
4739 for( ; (j+8UL) <= N; j+=8UL )
// Lower matrices: start at row j (j+1 if strictly lower), masked with
// size_t(-IT::size) -- presumably rounding down to a multiple of the SIMD
// width; this assumes IT::size is a power of two (TODO confirm).
4741 const size_t ibegin( ( IsLower<MT1>::value )
4742 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+8 (j+7 if strictly upper).
4744 const size_t iend( ( IsUpper<MT1>::value )
4745 ?( IsStrictlyUpper<MT1>::value ? j+7UL : j+8UL )
// Update with four loaded vectors of x (row offsets i, i1, i2, i3).
4755 const IntrinsicType x1( x.load(i ) );
4756 const IntrinsicType x2( x.load(i1) );
4757 const IntrinsicType x3( x.load(i2) );
4758 const IntrinsicType x4( x.load(i3) );
4759 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4760 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4761 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4762 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
4763 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) + x3 * A.load(i2,j+4UL) + x4 * A.load(i3,j+4UL) ) * scalar;
4764 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) + x3 * A.load(i2,j+5UL) + x4 * A.load(i3,j+5UL) ) * scalar;
4765 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) + x3 * A.load(i2,j+6UL) + x4 * A.load(i3,j+6UL) ) * scalar;
4766 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) + x3 * A.load(i2,j+7UL) + x4 * A.load(i3,j+7UL) ) * scalar;
// Update with two loaded vectors of x (row offsets i, i1).
4771 const IntrinsicType x1( x.load(i ) );
4772 const IntrinsicType x2( x.load(i1) );
4773 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4774 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4775 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4776 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
4777 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) + x2 * A.load(i1,j+4UL) ) * scalar;
4778 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) + x2 * A.load(i1,j+5UL) ) * scalar;
4779 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) + x2 * A.load(i1,j+6UL) ) * scalar;
4780 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) + x2 * A.load(i1,j+7UL) ) * scalar;
// Update with a single loaded vector of x (row offset i).
4784 const IntrinsicType x1( x.load(i) );
4785 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4786 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4787 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4788 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
4789 y[j+4UL] -=
sum( x1 * A.load(i,j+4UL) ) * scalar;
4790 y[j+5UL] -=
sum( x1 * A.load(i,j+5UL) ) * scalar;
4791 y[j+6UL] -=
sum( x1 * A.load(i,j+6UL) ) * scalar;
4792 y[j+7UL] -=
sum( x1 * A.load(i,j+7UL) ) * scalar;
// --- Blocks of four columns: y[j] .. y[j+3] ---
4796 for( ; (j+4UL) <= N; j+=4UL )
4798 const size_t ibegin( ( IsLower<MT1>::value )
4799 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+4 (j+3 if strictly upper).
4801 const size_t iend( ( IsUpper<MT1>::value )
4802 ?( IsStrictlyUpper<MT1>::value ? j+3UL : j+4UL )
// Four loaded vectors of x.
4812 const IntrinsicType x1( x.load(i ) );
4813 const IntrinsicType x2( x.load(i1) );
4814 const IntrinsicType x3( x.load(i2) );
4815 const IntrinsicType x4( x.load(i3) );
4816 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4817 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
4818 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) + x3 * A.load(i2,j+2UL) + x4 * A.load(i3,j+2UL) ) * scalar;
4819 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) + x3 * A.load(i2,j+3UL) + x4 * A.load(i3,j+3UL) ) * scalar;
// Two loaded vectors of x.
4824 const IntrinsicType x1( x.load(i ) );
4825 const IntrinsicType x2( x.load(i1) );
4826 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4827 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
4828 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) + x2 * A.load(i1,j+2UL) ) * scalar;
4829 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) + x2 * A.load(i1,j+3UL) ) * scalar;
// One loaded vector of x.
4833 const IntrinsicType x1( x.load(i) );
4834 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4835 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
4836 y[j+2UL] -=
sum( x1 * A.load(i,j+2UL) ) * scalar;
4837 y[j+3UL] -=
sum( x1 * A.load(i,j+3UL) ) * scalar;
// --- Blocks of two columns: y[j], y[j+1] ---
4841 for( ; (j+2UL) <= N; j+=2UL )
4843 const size_t ibegin( ( IsLower<MT1>::value )
4844 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+2 (j+1 if strictly upper).
4846 const size_t iend( ( IsUpper<MT1>::value )
4847 ?( IsStrictlyUpper<MT1>::value ? j+1UL : j+2UL )
// Four loaded vectors of x.
4857 const IntrinsicType x1( x.load(i ) );
4858 const IntrinsicType x2( x.load(i1) );
4859 const IntrinsicType x3( x.load(i2) );
4860 const IntrinsicType x4( x.load(i3) );
4861 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) + x3 * A.load(i2,j ) + x4 * A.load(i3,j ) ) * scalar;
4862 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) + x3 * A.load(i2,j+1UL) + x4 * A.load(i3,j+1UL) ) * scalar;
// Two loaded vectors of x.
4867 const IntrinsicType x1( x.load(i ) );
4868 const IntrinsicType x2( x.load(i1) );
4869 y[j ] -=
sum( x1 * A.load(i,j ) + x2 * A.load(i1,j ) ) * scalar;
4870 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) + x2 * A.load(i1,j+1UL) ) * scalar;
// One loaded vector of x.
4874 const IntrinsicType x1( x.load(i) );
4875 y[j ] -=
sum( x1 * A.load(i,j ) ) * scalar;
4876 y[j+1UL] -=
sum( x1 * A.load(i,j+1UL) ) * scalar;
// --- Single column: only y[j] is updated (the guarding condition is not shown) ---
4882 const size_t ibegin( ( IsLower<MT1>::value )
4883 ?( ( IsStrictlyLower<MT1>::value ? j+1UL : j ) &
size_t(-
IT::size) )
// Upper matrices: the row range ends at j+1 (j if strictly upper).
4885 const size_t iend( ( IsUpper<MT1>::value )
4886 ?( IsStrictlyUpper<MT1>::value ? j : j+1UL )
// Four loaded vectors of x.
4896 const IntrinsicType x1( x.load(i ) );
4897 const IntrinsicType x2( x.load(i1) );
4898 const IntrinsicType x3( x.load(i2) );
4899 const IntrinsicType x4( x.load(i3) );
4900 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) + x3 * A.load(i2,j) + x4 * A.load(i3,j) ) * scalar;
// Two loaded vectors of x.
4905 const IntrinsicType x1( x.load(i ) );
4906 const IntrinsicType x2( x.load(i1) );
4907 y[j] -=
sum( x1 * A.load(i,j) + x2 * A.load(i1,j) ) * scalar;
// One loaded vector of x.
4911 const IntrinsicType x1( x.load(i) );
4912 y[j] -=
sum( x1 * A.load(i,j) ) * scalar;
// BLAS-kernel entry point for subtraction assignment, default overload (enabled
// via UseDefaultKernel<VT1,VT2,MT1,ST2>): forwards y, x, A and scalar unchanged
// to selectLargeSubAssignKernel.
4933 template<
typename VT1
4937 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
4938 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4940 selectLargeSubAssignKernel( y, x, A, scalar );
// BLAS-based subtraction assignment, enabled via UseSinglePrecisionKernel<VT1,VT2,MT1,ST2>.
// Triangular A: strmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The sgemv call passes the negated scalar (-scalar) and 1.0F as trailing
// arguments -- presumably in the non-triangular branch (the `else` is not shown;
// TODO confirm).
4959 template<
typename VT1
4963 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
4964 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
4966 if( IsTriangular<MT1>::value ) {
4968 strmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
4972 sgemv( y, x, A, -scalar, 1.0F );
// BLAS-based subtraction assignment, enabled via UseDoublePrecisionKernel<VT1,VT2,MT1,ST2>.
// Triangular A: dtrmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The dgemv call passes the negated scalar (-scalar) and 1.0 as trailing
// arguments -- presumably in the non-triangular branch (the `else` is not shown;
// TODO confirm).
4993 template<
typename VT1
4997 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
4998 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5000 if( IsTriangular<MT1>::value ) {
5002 dtrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
5006 dgemv( y, x, A, -scalar, 1.0 );
// BLAS-based subtraction assignment, enabled via UseSinglePrecisionComplexKernel<VT1,VT2,MT1>.
// Triangular A: ctrmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The cgemv call wraps the negated scalar as complex<float>( -scalar, 0.0F ) and
// passes complex<float>( 1.0F, 0.0F ) as the last argument -- presumably in the
// non-triangular branch (the `else` is not shown; TODO confirm).
5029 template<
typename VT1
5033 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
5034 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5036 if( IsTriangular<MT1>::value ) {
5038 ctrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
5042 cgemv( y, x, A, complex<float>( -scalar, 0.0F ), complex<float>( 1.0F, 0.0F ) );
// BLAS-based subtraction assignment, enabled via UseDoublePrecisionComplexKernel<VT1,VT2,MT1>.
// Triangular A: ztrmv is applied to a temporary `tmp`, with CblasLower or
// CblasUpper chosen by IsLower<MT1>. How tmp is built and combined with y is on
// lines missing from this listing.
// The zgemv call wraps the negated scalar as complex<double>( -scalar, 0.0 ) and
// passes complex<double>( 1.0, 0.0 ) as the last argument -- presumably in the
// non-triangular branch (the `else` is not shown; TODO confirm).
5065 template<
typename VT1
5069 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
5070 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
5072 if( IsTriangular<MT1>::value ) {
5074 ztrmv( tmp, A, ( IsLower<MT1>::value )?( CblasLower ):( CblasUpper ) );
5078 zgemv( y, x, A, complex<double>( -scalar, 0.0 ), complex<double>( 1.0, 0.0 ) );
// Multiplication assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs).
// The expression is first evaluated serially, via serial( rhs ), into a temporary
// of type ResultType. How tmp is then applied to lhs is on lines missing from
// this listing.
5100 template<
typename VT1
5102 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5112 const ResultType tmp(
serial( rhs ) );
// SMP assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs); only
// enabled when UseSMPAssign<VT1> holds.
// Fetches the left and right operands from rhs.vector_. A right operand with
// zero rows and one with zero columns are handled in two separate branches;
// their bodies and the rest of the function are on lines missing from this listing.
5135 template<
typename VT1
5137 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5138 smpAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5144 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5145 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5147 if( right.rows() == 0UL ) {
5151 else if( right.columns() == 0UL ) {
// SMP assignment of a DVecScalarMultExpr (rhs) to a sparse vector (lhs); only
// enabled when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType. How tmp is
// then assigned to lhs is on lines missing from this listing.
5181 template<
typename VT1
5183 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5184 smpAssign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5194 const ResultType tmp( rhs );
// SMP addition assignment of a DVecScalarMultExpr (rhs) to a dense vector (lhs);
// only enabled when UseSMPAssign<VT1> holds.
// Fetches the left and right operands from rhs.vector_ and tests whether the
// right operand has zero rows or zero columns. The branch body (presumably an
// early return) and the actual assignment are on lines missing from this listing.
5213 template<
typename VT1
5215 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5216 smpAddAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5222 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5223 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5225 if( right.rows() == 0UL || right.columns() == 0UL ) {
// SMP subtraction assignment of a DVecScalarMultExpr (rhs) to a dense vector
// (lhs); only enabled when UseSMPAssign<VT1> holds.
// Fetches the left and right operands from rhs.vector_ and tests whether the
// right operand has zero rows or zero columns. The branch body (presumably an
// early return) and the actual assignment are on lines missing from this listing.
5259 template<
typename VT1
5261 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5262 smpSubAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5268 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
5269 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
5271 if( right.rows() == 0UL || right.columns() == 0UL ) {
// SMP multiplication assignment of a DVecScalarMultExpr (rhs) to a dense vector
// (lhs); only enabled when UseSMPAssign<VT1> holds.
// The expression is evaluated into a temporary of type ResultType. How tmp is
// then applied to lhs is on lines missing from this listing.
5305 template<
typename VT1
5307 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
5308 smpMultAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
5318 const ResultType tmp( rhs );
// Free function that returns a const TDVecTDMatMultExpr<T1,T2>; it is disabled
// when T2 is a matrix/matrix multiplication expression (IsMatMatMultExpr<T2>).
// Throws std::invalid_argument if the size of the vector operand differs from
// the number of rows of the matrix operand.
// NOTE(review): the function name and parameter list are on lines missing from
// this listing -- presumably operator*( vec, mat ); TODO confirm.
5381 template<
typename T1
5383 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
5388 if( (~vec).
size() != (~mat).
rows() )
5389 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
// Trait specialization that publicly derives from Columns<MT>.
// NOTE(review): the struct head is on a line missing from this listing --
// presumably the Size specialization for TDVecTDMatMultExpr<VT,MT>, so that the
// expression's size is taken from the matrix's column count; TODO confirm.
5406 template<
typename MT,
typename VT >
5408 :
public Columns<MT>
// Trait fragment: Type is the MultExprTrait result for the subvector type of
// const VT and the submatrix type of const MT, both parameterized with AF.
// NOTE(review): the struct head is on a line missing from this listing --
// presumably SubvectorExprTrait< TDVecTDMatMultExpr<VT,MT>, AF >; TODO confirm.
5424 template<
typename VT,
typename MT,
bool AF >
5429 typedef typename MultExprTrait< typename SubvectorExprTrait<const VT,AF>::Type
5430 ,
typename SubmatrixExprTrait<const MT,AF>::Type >::Type Type;
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDVecTDMatMultExpr.h:397
BLAZE_ALWAYS_INLINE int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:63
Expression object for transpose dense vector-transpose dense matrix multiplications. The TDVecTDMatMultExpr class represents the compile time expression for multiplications between transpose dense vectors and column-major dense matrices.
Definition: Forward.h:135
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
BLAZE_ALWAYS_INLINE void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:879
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( \(A = B \cdot C\) ).
Definition: DMatDMatMultExpr.h:8247
RightOperand rightOperand() const
Returns the right-hand side transpose dense matrix operand.
Definition: TDVecTDMatMultExpr.h:385
Header file for basic type definitions.
BLAZE_ALWAYS_INLINE size_t size(const Vector< VT, TF > &vector)
Returns the current size/dimension of the vector.
Definition: Vector.h:264
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:209
Header file for the IsDiagonal type trait.
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type LeftOperand
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:280
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the IsSame and IsStrictlySame type traits.
const size_t TDVECTDMATMULT_THRESHOLD
Dense Vector/column-major dense matrix multiplication threshold. This setting specifies the threshold ...
Definition: Thresholds.h:108
void reset(const DiagonalProxy< MT > &proxy)
Resetting the represented element to the default initial values.
Definition: DiagonalProxy.h:821
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2507
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:261
BLAZE_ALWAYS_INLINE size_t rows(const Matrix< MT, SO > &matrix)
Returns the current number of rows of the matrix.
Definition: Matrix.h:316
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDVecTDMatMultExpr.h:277
Header file for the DenseVector base class.
Compile time check for lower triangular matrices. This type trait tests whether or not the given templ...
Definition: IsLower.h:90
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDVecTDMatMultExpr.h:365
LeftOperand vec_
Left-hand side dense vector of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:439
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:276
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:699
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Compile time check for upper triangular matrices. This type trait tests whether or not the given templ...
Definition: IsUpper.h:90
Header file for the RequiresEvaluation type trait.
SelectType< evaluateMatrix, const MRT, MCT >::Type RT
Type for the assignment of the right-hand side dense matrix operand.
Definition: TDVecTDMatMultExpr.h:289
TDVecTDMatMultExpr(const VT &vec, const MT &mat)
Constructor for the TDVecTDMatMultExpr class.
Definition: TDVecTDMatMultExpr.h:311
Header file for the VecScalarMultExpr base class.
VT::ResultType VRT
Result type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:118
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
MRT::ElementType MET
Element type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:121
SelectType< evaluateVector, const VRT, VCT >::Type LT
Type for the assignment of the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:286
Constraint on the data type.
Header file for the MultExprTrait class template.
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:263
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsStrictlyUpper type trait.
Namespace of the Blaze C++ math library.
Definition: Blaze.h:57
Header file for the IsDouble type trait.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDVecTDMatMultExpr.h:325
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:273
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDVecTDMatMultExpr.h:275
Header file for the IsMatMatMultExpr type trait class.
BLAZE_ALWAYS_INLINE void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:635
Header file for the Columns type trait.
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDVecTDMatMultExpr.h:429
Header file for the IsBlasCompatible type trait.
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
Header file for the IsLower type trait.
TDVecTDMatMultExpr< VT, MT > This
Type of this TDVecTDMatMultExpr instance.
Definition: TDVecTDMatMultExpr.h:271
Compile time check for diagonal matrices. This type trait tests whether or not the given template para...
Definition: IsDiagonal.h:92
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
Header file for the IsTriangular type trait.
Constraints on the storage order of matrix types.
Constraint on the data type.
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type RightOperand
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:283
Compile time check for strictly upper triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyUpper.h:86
VRT::ElementType VET
Element type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:120
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2505
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
Header file for the EnableIf class template.
Header file for the IsStrictlyLower type trait.
Header file for the serial shim.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDVecTDMatMultExpr.h:274
Header file for the IsNumeric type trait.
Header file for the HasConstDataAccess type trait.
Header file for BLAS level 2 functions.
System settings for the BLAS mode.
EnableIf< IsDenseMatrix< MT1 > >::Type smpSubAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:160
Header file for the SubmatrixExprTrait class template.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:749
MT::ResultType MRT
Result type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:119
Header file for run time assertion macros.
EnableIf< IsDenseMatrix< MT1 > >::Type smpAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:98
Base template for the MultTrait class.
Definition: MultTrait.h:150
RightOperand mat_
Right-hand side dense matrix of the multiplication expression.
Definition: TDVecTDMatMultExpr.h:440
BLAZE_ALWAYS_INLINE void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:742
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
Constraint on the data type.
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
Header file for the TVecMatMultExpr base class.
Header file for the HasMutableDataAccess type trait.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDVecTDMatMultExpr.h:419
Header file for all intrinsic functionality.
MultTrait< VRT, MRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDVecTDMatMultExpr.h:272
Compile time check for strictly lower triangular matrices. This type trait tests whether or not the gi...
Definition: IsStrictlyLower.h:86
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_FORM_VALID_TVECMATMULTEXPR(T1, T2)
Constraint on the data type. In case the given data types T1 and T2 do not form a valid vector/matrix ...
Definition: TVecMatMultExpr.h:166
VT::CompositeType VCT
Composite type of the left-hand side dense vector expression.
Definition: TDVecTDMatMultExpr.h:122
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:260
EnableIf< IsDenseMatrix< MT1 > >::Type smpAddAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:129
const size_t TDVECDMATMULT_THRESHOLD
Dense vector/row-major dense matrix multiplication threshold. This setting specifies the threshold bet...
Definition: Thresholds.h:91
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2502
#define BLAZE_CONSTRAINT_MUST_BE_ROW_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a row dense or sparse vector type (i...
Definition: TransposeFlag.h:81
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Header file for the IsUpper type trait.
MT::CompositeType MCT
Composite type of the right-hand side dense matrix expression.
Definition: TDVecTDMatMultExpr.h:123
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDVecTDMatMultExpr.h:409
const size_t SMP_TDVECTDMATMULT_THRESHOLD
SMP dense vector/column-major dense matrix multiplication threshold. This threshold specifies when a d...
Definition: Thresholds.h:391
EnableIf< IsDenseVector< VT1 > >::Type smpMultAssign(Vector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:189
Header file for the Size type trait.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
LeftOperand leftOperand() const
Returns the left-hand side dense vector operand.
Definition: TDVecTDMatMultExpr.h:375
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.
BLAZE_ALWAYS_INLINE void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:849