22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
31 #include <boost/cast.hpp>
// Expression object representing the multiplication of a column-major
// (transpose) dense matrix MT with a dense column vector VT.
// NOTE(review): extraction fragment — the template header and several original
// lines (87-91, 98-111) are missing from this view; statements below are split
// across lines and carry fused original line numbers.
86 class TDMatDVecMultExpr :
public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
87 ,
private MatVecMultExpr
// Shorthand aliases for the operands' result, element, and composite types.
92 typedef typename MT::ResultType
MRT;
93 typedef typename VT::ResultType
VRT;
// MET/VET: element types of the matrix/vector results.
94 typedef typename MRT::ElementType
MET;
95 typedef typename VRT::ElementType
VET;
// MCT/VCT: composite types used when the operands are embedded in this expression.
96 typedef typename MT::CompositeType
MCT;
97 typedef typename VT::CompositeType
VCT;
// --- Compile-time kernel selection helpers -------------------------------
// Each trait decides, for a target vector T1, matrix T2, and source vector T3,
// whether a particular BLAS or vectorized kernel may be used.
// NOTE(review): extraction fragment — the enum body of UseSinglePrecisionKernel
// (original lines 115+) is truncated here; presumably it mirrors the IsDouble
// checks below with IsFloat — confirm against the full file.
112 template<
typename T1,
typename T2,
typename T3 >
113 struct UseSinglePrecisionKernel {
114 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Selects the double-precision BLAS kernel (cblas_dgemv): all three operand
// element types must be double and all operands vectorizable.
128 template<
typename T1,
typename T2,
typename T3 >
129 struct UseDoublePrecisionKernel {
130 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
131 IsDouble<typename T1::ElementType>::value &&
132 IsDouble<typename T2::ElementType>::value &&
133 IsDouble<typename T3::ElementType>::value };
// Selects the single-precision complex BLAS kernel (cblas_cgemv).
144 template<
typename T1,
typename T2,
typename T3 >
145 struct UseSinglePrecisionComplexKernel {
146 typedef complex<float> Type;
147 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
148 IsSame<typename T1::ElementType,Type>::value &&
149 IsSame<typename T2::ElementType,Type>::value &&
150 IsSame<typename T3::ElementType,Type>::value };
// Selects the double-precision complex BLAS kernel (cblas_zgemv).
161 template<
typename T1,
typename T2,
typename T3 >
162 struct UseDoublePrecisionComplexKernel {
163 typedef complex<double> Type;
164 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
165 IsSame<typename T1::ElementType,Type>::value &&
166 IsSame<typename T2::ElementType,Type>::value &&
167 IsSame<typename T3::ElementType,Type>::value };
// Falls back to the default kernel when BLAS is disabled at compile time or
// no specialized BLAS kernel matches the operand types.
177 template<
typename T1,
typename T2,
typename T3 >
178 struct UseDefaultKernel {
179 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
180 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
181 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
182 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Enables the hand-vectorized (intrinsics-based) default kernel: all operands
// share one element type that supports intrinsic addition and multiplication.
193 template<
typename T1,
typename T2,
typename T3 >
194 struct UseVectorizedDefaultKernel {
195 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
196 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
197 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
198 IntrinsicTrait<typename T1::ElementType>::addition &&
199 IntrinsicTrait<typename T1::ElementType>::multiplication };
// The expression itself is not directly vectorizable; vectorization happens
// inside the assignment kernels instead.
229 enum { vectorizable = 0 };
// Fragment of the subscript operator (original lines ~256-265): accumulates
// the dot product of matrix row and vector in pairs of two columns, with a
// trailing step when end_ < mat_.columns().
// NOTE(review): surrounding lines (257, 259, 261-262, 264+) are missing here.
258 if(
mat_.columns() != 0UL ) {
260 for(
size_t j=1UL; j<
end_; j+=2UL ) {
263 if( end_ <
mat_.columns() ) {
// canAlias (original ~311-314): the expression aliases 'alias' if either
// operand does.
311 template<
typename T >
313 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// isAliased (original ~323-326): same check as canAlias for this expression.
323 template<
typename T >
325 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// assign( DenseVector, TDMatDVecMultExpr ) dispatch (original ~348-376):
// handles the empty-matrix special cases, then forwards to either the default
// or the BLAS kernel. Missing lines (356-372, 374) contain the resets/returns
// and the kernel-selection condition — confirm against the full file.
348 template<
typename VT1 >
355 if( rhs.mat_.rows() == 0UL ) {
358 else if( rhs.mat_.columns() == 0UL ) {
373 TDMatDVecMultExpr::selectDefaultAssignKernel( ~lhs, A, x );
375 TDMatDVecMultExpr::selectBlasAssignKernel( ~lhs, A, x );
// Default scalar assign kernel: y = A * x for column-major A, unrolled over
// pairs of rows. The first column initializes y, subsequent columns accumulate.
// NOTE(review): iend masks M down to an even count; the trailing y[iend] line
// is presumably guarded by an 'if( iend < M )' on the missing original
// lines 413-414 — confirm against the full file.
394 template<
typename VT1
398 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
400 const size_t M( A.rows() );
401 const size_t N( A.columns() );
// Largest even row count <= M (bitwise AND with ~1).
404 const size_t iend( M &
size_t(-2) );
406 for(
size_t i=0UL; i<M; ++i ) {
407 y[i] = x[0UL] * A(i,0UL);
409 for(
size_t j=1UL; j<N; ++j ) {
410 for(
size_t i=0UL; i<iend; i+=2UL ) {
411 y[i ] += x[j] * A(i ,j);
412 y[i+1UL] += x[j] * A(i+1UL,j);
415 y[iend] += x[j] * A(iend,j);
// Vectorized assign kernel: accumulates intrinsic-width partial sums in
// registers, unrolled 8x / 4x / 3x / 2x / 1x over blocks of IT::size rows.
// x1 is presumably set( x[j] ) on the missing loop-body lines — confirm.
436 template<
typename VT1
439 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
440 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
442 typedef IntrinsicTrait<ElementType> IT;
444 const size_t M( A.rows() );
445 const size_t N( A.columns() );
// 8-wide unroll tier.
449 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
450 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
451 for(
size_t j=0UL; j<N; ++j ) {
453 xmm1 = xmm1 + A.get(i ,j) * x1;
454 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
455 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
456 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
457 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
458 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
459 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
460 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
462 store( &y[i ], xmm1 );
463 store( &y[i+IT::size ], xmm2 );
464 store( &y[i+IT::size*2UL], xmm3 );
465 store( &y[i+IT::size*3UL], xmm4 );
466 store( &y[i+IT::size*4UL], xmm5 );
467 store( &y[i+IT::size*5UL], xmm6 );
468 store( &y[i+IT::size*6UL], xmm7 );
469 store( &y[i+IT::size*7UL], xmm8 );
// 4-wide unroll tier.
471 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
473 for(
size_t j=0UL; j<N; ++j ) {
475 xmm1 = xmm1 + A.get(i ,j) * x1;
476 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
477 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
478 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
480 store( &y[i ], xmm1 );
481 store( &y[i+IT::size ], xmm2 );
482 store( &y[i+IT::size*2UL], xmm3 );
483 store( &y[i+IT::size*3UL], xmm4 );
// 3-wide unroll tier.
485 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
487 for(
size_t j=0UL; j<N; ++j ) {
489 xmm1 = xmm1 + A.get(i ,j) * x1;
490 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
491 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
493 store( &y[i ], xmm1 );
494 store( &y[i+IT::size ], xmm2 );
495 store( &y[i+IT::size*2UL], xmm3 );
// 2-wide unroll tier.
497 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
499 for(
size_t j=0UL; j<N; ++j ) {
501 xmm1 = xmm1 + A.get(i ,j) * x1;
502 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
504 store( &y[i ], xmm1 );
505 store( &y[i+IT::size], xmm2 );
// Final single-register tier for the last (< 2*IT::size) rows.
509 for(
size_t j=0UL; j<N; ++j ) {
510 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
512 store( &y[i], xmm1 );
// --- BLAS assign kernels (y = A * x) -------------------------------------
// Fallback overload: when no BLAS kernel applies, delegate to the default.
532 template<
typename VT1
535 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
536 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
538 selectDefaultAssignKernel( y, A, x );
// Single-precision kernel: y = 1*A*x + 0*y via cblas_sgemv. A.spacing() is
// used as the leading dimension; numeric_cast guards the size_t -> int
// narrowing required by the CBLAS interface.
558 template<
typename VT1
561 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
562 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
564 using boost::numeric_cast;
570 const int M ( numeric_cast<int>( A.rows() ) );
571 const int N ( numeric_cast<int>( A.columns() ) );
572 const int lda( numeric_cast<int>( A.spacing() ) );
574 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
575 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision kernel via cblas_dgemv (alpha=1, beta=0).
596 template<
typename VT1
599 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
600 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
602 using boost::numeric_cast;
608 const int M ( numeric_cast<int>( A.rows() ) );
609 const int N ( numeric_cast<int>( A.columns() ) );
610 const int lda( numeric_cast<int>( A.spacing() ) );
612 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
613 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float> kernel via cblas_cgemv; alpha/beta are passed by address as
// the CBLAS complex interface requires.
634 template<
typename VT1
637 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
638 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
640 using boost::numeric_cast;
649 const int M ( numeric_cast<int>( A.rows() ) );
650 const int N ( numeric_cast<int>( A.columns() ) );
651 const int lda( numeric_cast<int>( A.spacing() ) );
652 const complex<float> alpha( 1.0F, 0.0F );
653 const complex<float> beta ( 0.0F, 0.0F );
655 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
656 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> kernel via cblas_zgemv (alpha=1+0i, beta=0+0i).
677 template<
typename VT1
680 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
681 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
683 using boost::numeric_cast;
692 const int M ( numeric_cast<int>( A.rows() ) );
693 const int N ( numeric_cast<int>( A.columns() ) );
694 const int lda( numeric_cast<int>( A.spacing() ) );
695 const complex<double> alpha( 1.0, 0.0 );
696 const complex<double> beta ( 0.0, 0.0 );
698 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
699 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Sparse-vector assign overload header (original ~717); body not visible here.
717 template<
typename VT1 >
// addAssign( DenseVector, TDMatDVecMultExpr ) dispatch (original ~747-771):
// empty operands are a no-op; otherwise forward to default or BLAS kernel.
// NOTE(review): lines 755-765, 767, 769 are missing from this extraction.
747 template<
typename VT1 >
754 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
766 if( ( IsComputation<MT>::value && !evaluate ) ||
768 TDMatDVecMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x );
770 TDMatDVecMultExpr::selectBlasAddAssignKernel( ~lhs, A, x );
// Default scalar add-assign kernel: y += A * x, two rows per iteration, with
// a trailing odd-row step (presumably guarded by 'if( iend < M )' on the
// missing lines 805-806 — confirm against the full file).
789 template<
typename VT1
792 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
793 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
795 const size_t M( A.rows() );
796 const size_t N( A.columns() );
799 const size_t iend( M &
size_t(-2) );
801 for(
size_t j=0UL; j<N; ++j ) {
802 for(
size_t i=0UL; i<iend; i+=2UL ) {
803 y[i ] += x[j] * A(i ,j);
804 y[i+1UL] += x[j] * A(i+1UL,j);
807 y[iend] += x[j] * A(iend,j);
// Vectorized add-assign kernel: same 8x/4x/3x/2x/1x row-block unrolling as the
// assign kernel. The store lines here write the accumulators directly; the
// 'y +=' accumulation presumably happens via loads on the missing lines
// (e.g. 842-849) — confirm against the full file.
828 template<
typename VT1
831 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
832 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
834 typedef IntrinsicTrait<ElementType> IT;
836 const size_t M( A.rows() );
837 const size_t N( A.columns() );
841 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
850 for(
size_t j=0UL; j<N; ++j ) {
852 xmm1 = xmm1 + A.get(i ,j) * x1;
853 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
854 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
855 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
856 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
857 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
858 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
859 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
861 store( &y[i ], xmm1 );
862 store( &y[i+IT::size ], xmm2 );
863 store( &y[i+IT::size*2UL], xmm3 );
864 store( &y[i+IT::size*3UL], xmm4 );
865 store( &y[i+IT::size*4UL], xmm5 );
866 store( &y[i+IT::size*5UL], xmm6 );
867 store( &y[i+IT::size*6UL], xmm7 );
868 store( &y[i+IT::size*7UL], xmm8 );
870 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
875 for(
size_t j=0UL; j<N; ++j ) {
877 xmm1 = xmm1 + A.get(i ,j) * x1;
878 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
879 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
880 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
882 store( &y[i ], xmm1 );
883 store( &y[i+IT::size ], xmm2 );
884 store( &y[i+IT::size*2UL], xmm3 );
885 store( &y[i+IT::size*3UL], xmm4 );
887 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
891 for(
size_t j=0UL; j<N; ++j ) {
893 xmm1 = xmm1 + A.get(i ,j) * x1;
894 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
895 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
897 store( &y[i ], xmm1 );
898 store( &y[i+IT::size ], xmm2 );
899 store( &y[i+IT::size*2UL], xmm3 );
901 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
904 for(
size_t j=0UL; j<N; ++j ) {
906 xmm1 = xmm1 + A.get(i ,j) * x1;
907 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
909 store( &y[i ], xmm1 );
910 store( &y[i+IT::size], xmm2 );
914 for(
size_t j=0UL; j<N; ++j ) {
915 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
917 store( &y[i], xmm1 );
// --- BLAS add-assign kernels (y += A * x): beta = 1 keeps existing y --------
// Fallback when no BLAS kernel applies.
937 template<
typename VT1
940 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
941 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
943 selectDefaultAddAssignKernel( y, A, x );
// Single-precision: cblas_sgemv with alpha=1, beta=1.
963 template<
typename VT1
966 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
967 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
969 using boost::numeric_cast;
975 const int M ( numeric_cast<int>( A.rows() ) );
976 const int N ( numeric_cast<int>( A.columns() ) );
977 const int lda( numeric_cast<int>( A.spacing() ) );
979 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
980 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision: cblas_dgemv with alpha=1, beta=1.
1001 template<
typename VT1
1004 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1005 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1007 using boost::numeric_cast;
1013 const int M ( numeric_cast<int>( A.rows() ) );
1014 const int N ( numeric_cast<int>( A.columns() ) );
1015 const int lda( numeric_cast<int>( A.spacing() ) );
1017 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
1018 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float>: cblas_cgemv with alpha=1+0i, beta=1+0i.
1039 template<
typename VT1
1042 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1043 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1045 using boost::numeric_cast;
1054 const int M ( numeric_cast<int>( A.rows() ) );
1055 const int N ( numeric_cast<int>( A.columns() ) );
1056 const int lda( numeric_cast<int>( A.spacing() ) );
1057 const complex<float> alpha( 1.0F, 0.0F );
1058 const complex<float> beta ( 1.0F, 0.0F );
1060 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1061 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double>: cblas_zgemv with alpha=1+0i, beta=1+0i.
1082 template<
typename VT1
1085 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1086 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1088 using boost::numeric_cast;
1097 const int M ( numeric_cast<int>( A.rows() ) );
1098 const int N ( numeric_cast<int>( A.columns() ) );
1099 const int lda( numeric_cast<int>( A.spacing() ) );
1100 const complex<double> alpha( 1.0, 0.0 );
1101 const complex<double> beta ( 1.0, 0.0 );
1103 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1104 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// subAssign( DenseVector, TDMatDVecMultExpr ) dispatch (original ~1127-1151):
// empty operands are a no-op; otherwise forward to default or BLAS kernel.
// NOTE(review): lines 1128-1133, 1135-1145, 1147, 1149 are missing here.
1127 template<
typename VT1 >
1134 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1146 if( ( IsComputation<MT>::value && !evaluate ) ||
1148 TDMatDVecMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x );
1150 TDMatDVecMultExpr::selectBlasSubAssignKernel( ~lhs, A, x );
// Default scalar sub-assign kernel: y -= A * x, two rows per iteration;
// trailing odd-row step presumably guarded on missing lines 1185-1186.
1169 template<
typename VT1
1172 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1173 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1175 const size_t M( A.rows() );
1176 const size_t N( A.columns() );
1179 const size_t iend( M &
size_t(-2) );
1181 for(
size_t j=0UL; j<N; ++j ) {
1182 for(
size_t i=0UL; i<iend; i+=2UL ) {
1183 y[i ] -= x[j] * A(i ,j);
1184 y[i+1UL] -= x[j] * A(i+1UL,j);
1187 y[iend] -= x[j] * A(iend,j);
// Vectorized sub-assign kernel: mirrors the add-assign kernel but accumulates
// with subtraction. Accumulator initialization (presumably loads of y) is on
// missing lines (e.g. 1222-1229) — confirm against the full file.
1208 template<
typename VT1
1211 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1212 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1214 typedef IntrinsicTrait<ElementType> IT;
1216 const size_t M( A.rows() );
1217 const size_t N( A.columns() );
1221 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1230 for(
size_t j=0UL; j<N; ++j ) {
1232 xmm1 = xmm1 - A.get(i ,j) * x1;
1233 xmm2 = xmm2 - A.get(i+IT::size ,j) * x1;
1234 xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
1235 xmm4 = xmm4 - A.get(i+IT::size*3UL,j) * x1;
1236 xmm5 = xmm5 - A.get(i+IT::size*4UL,j) * x1;
1237 xmm6 = xmm6 - A.get(i+IT::size*5UL,j) * x1;
1238 xmm7 = xmm7 - A.get(i+IT::size*6UL,j) * x1;
1239 xmm8 = xmm8 - A.get(i+IT::size*7UL,j) * x1;
1241 store( &y[i ], xmm1 );
1242 store( &y[i+IT::size ], xmm2 );
1243 store( &y[i+IT::size*2UL], xmm3 );
1244 store( &y[i+IT::size*3UL], xmm4 );
1245 store( &y[i+IT::size*4UL], xmm5 );
1246 store( &y[i+IT::size*5UL], xmm6 );
1247 store( &y[i+IT::size*6UL], xmm7 );
1248 store( &y[i+IT::size*7UL], xmm8 );
1250 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1255 for(
size_t j=0UL; j<N; ++j ) {
1257 xmm1 = xmm1 - A.get(i ,j) * x1;
1258 xmm2 = xmm2 - A.get(i+IT::size ,j) * x1;
1259 xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
1260 xmm4 = xmm4 - A.get(i+IT::size*3UL,j) * x1;
1262 store( &y[i ], xmm1 );
1263 store( &y[i+IT::size ], xmm2 );
1264 store( &y[i+IT::size*2UL], xmm3 );
1265 store( &y[i+IT::size*3UL], xmm4 );
1267 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
1271 for(
size_t j=0UL; j<N; ++j ) {
1273 xmm1 = xmm1 - A.get(i ,j) * x1;
1274 xmm2 = xmm2 - A.get(i+IT::size ,j) * x1;
1275 xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
1277 store( &y[i ], xmm1 );
1278 store( &y[i+IT::size ], xmm2 );
1279 store( &y[i+IT::size*2UL], xmm3 );
1281 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1284 for(
size_t j=0UL; j<N; ++j ) {
1286 xmm1 = xmm1 - A.get(i ,j) * x1;
1287 xmm2 = xmm2 - A.get(i+IT::size,j) * x1;
1289 store( &y[i ], xmm1 );
1290 store( &y[i+IT::size], xmm2 );
1294 for(
size_t j=0UL; j<N; ++j ) {
1295 xmm1 = xmm1 - A.get(i,j) *
set( x[j] );
1297 store( &y[i], xmm1 );
// --- BLAS sub-assign kernels (y -= A * x): alpha = -1, beta = 1 ------------
// Fallback when no BLAS kernel applies.
1317 template<
typename VT1
1320 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1321 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1323 selectDefaultSubAssignKernel( y, A, x );
// Single-precision: cblas_sgemv with alpha=-1, beta=1.
1343 template<
typename VT1
1346 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1347 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1349 using boost::numeric_cast;
1355 const int M ( numeric_cast<int>( A.rows() ) );
1356 const int N ( numeric_cast<int>( A.columns() ) );
1357 const int lda( numeric_cast<int>( A.spacing() ) );
1359 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -1.0F,
1360 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision: cblas_dgemv with alpha=-1, beta=1.
1381 template<
typename VT1
1384 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1385 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1387 using boost::numeric_cast;
1393 const int M ( numeric_cast<int>( A.rows() ) );
1394 const int N ( numeric_cast<int>( A.columns() ) );
1395 const int lda( numeric_cast<int>( A.spacing() ) );
1397 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -1.0,
1398 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float>: cblas_cgemv with alpha=-1+0i, beta=1+0i.
1419 template<
typename VT1
1422 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1423 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1425 using boost::numeric_cast;
1434 const int M ( numeric_cast<int>( A.rows() ) );
1435 const int N ( numeric_cast<int>( A.columns() ) );
1436 const int lda( numeric_cast<int>( A.spacing() ) );
1437 const complex<float> alpha( -1.0F, 0.0F );
1438 const complex<float> beta ( 1.0F, 0.0F );
1440 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1441 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double>: cblas_zgemv with alpha=-1+0i, beta=1+0i.
1462 template<
typename VT1
1465 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1466 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1468 using boost::numeric_cast;
1477 const int M ( numeric_cast<int>( A.rows() ) );
1478 const int N ( numeric_cast<int>( A.columns() ) );
1479 const int lda( numeric_cast<int>( A.spacing() ) );
1480 const complex<double> alpha( -1.0, 0.0 );
1481 const complex<double> beta ( 1.0, 0.0 );
1483 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1484 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// multAssign overload header (original ~1507); body not visible in this view.
1507 template<
typename VT1 >
// Specialization of DVecScalarMultExpr for scaled transpose dense matrix /
// dense vector multiplications: ( A * x ) * scalar. This restructuring allows
// the scalar to be folded into the kernels (e.g. as the gemv alpha).
// NOTE(review): extraction fragment — template parameter list and several
// original lines are missing.
1557 template<
typename MT
1561 :
public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
1562 ,
private VecScalarMultExpr
1563 ,
private Computation
// MVM: the wrapped matrix/vector multiplication expression; RES: its result.
1567 typedef TDMatDVecMultExpr<MT,VT> MVM;
1568 typedef typename MVM::ResultType RES;
1569 typedef typename MT::ResultType
MRT;
1570 typedef typename VT::ResultType
VRT;
1571 typedef typename MRT::ElementType
MET;
1572 typedef typename VRT::ElementType
VET;
1573 typedef typename MT::CompositeType
MCT;
1574 typedef typename VT::CompositeType
VCT;
// Compile-time flag: evaluate the matrix operand up front when it is a
// non-vectorizable computation with BLAS-compatible, matching element types.
1579 enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
1580 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// --- Kernel selection traits for the scaled variant ------------------------
// T4 is the scalar type; the real-valued BLAS kernels additionally require a
// non-complex scalar so it can be passed as the gemv alpha directly.
1588 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1589 struct UseSinglePrecisionKernel {
1590 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1591 IsFloat<typename T1::ElementType>::value &&
1592 IsFloat<typename T2::ElementType>::value &&
1593 IsFloat<typename T3::ElementType>::value &&
1594 !IsComplex<T4>::value };
// Double-precision analogue of the trait above.
1603 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1604 struct UseDoublePrecisionKernel {
1605 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1606 IsDouble<typename T1::ElementType>::value &&
1607 IsDouble<typename T2::ElementType>::value &&
1608 IsDouble<typename T3::ElementType>::value &&
1609 !IsComplex<T4>::value };
// complex<float> kernel selector (scalar type unconstrained here; it is
// converted to complex<float> alpha in the kernel).
1618 template<
typename T1,
typename T2,
typename T3 >
1619 struct UseSinglePrecisionComplexKernel {
1620 typedef complex<float> Type;
1621 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1622 IsSame<typename T1::ElementType,Type>::value &&
1623 IsSame<typename T2::ElementType,Type>::value &&
1624 IsSame<typename T3::ElementType,Type>::value };
// complex<double> kernel selector.
1633 template<
typename T1,
typename T2,
typename T3 >
1634 struct UseDoublePrecisionComplexKernel {
1635 typedef complex<double> Type;
1636 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1637 IsSame<typename T1::ElementType,Type>::value &&
1638 IsSame<typename T2::ElementType,Type>::value &&
1639 IsSame<typename T3::ElementType,Type>::value };
// Default kernel fallback: BLAS disabled or no specialized kernel matches.
1647 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1648 struct UseDefaultKernel {
1649 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1650 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1651 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1652 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Vectorized default kernel: all operands and the scalar share one element
// type that supports intrinsic addition and multiplication.
1661 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1662 struct UseVectorizedDefaultKernel {
1663 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1664 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1665 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1666 IsSame<typename T1::ElementType,T4>::value &&
1667 IntrinsicTrait<typename T1::ElementType>::addition &&
1668 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type definitions of the scaled expression.
1674 typedef DVecScalarMultExpr<MVM,ST,false>
This;
1675 typedef typename MultTrait<RES,ST>::Type
ResultType;
1677 typedef typename ResultType::ElementType
ElementType;
1678 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// The wrapped multiplication expression is stored by value (const).
1683 typedef const TDMatDVecMultExpr<MT,VT>
LeftOperand;
// LT/RT: how the matrix/vector operands are held inside the assignment
// functions — pre-evaluated result types when 'evaluate'/computation, else
// the lightweight composite types.
1689 typedef typename SelectType< evaluate, const MRT, MCT >::Type
LT;
1692 typedef typename SelectType< IsComputation<VT>::value,
const VRT,
VCT >::Type
RT;
// Not directly vectorizable; vectorization is done in the kernels.
1697 enum { vectorizable = 0 };
// Constructor: captures the multiplication expression and the scalar.
1706 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
// Subscript operator fragment: element = (A*x)[index] * scalar.
1720 return vector_[index] * scalar_;
// Size of the expression equals the size of the wrapped multiplication.
1729 inline size_t size()
const {
1730 return vector_.size();
// Aliasing queries are forwarded to the wrapped expression.
1760 template<
typename T >
1761 inline bool canAlias(
const T* alias )
const {
1762 return vector_.canAlias( alias );
1772 template<
typename T >
1773 inline bool isAliased(
const T* alias )
const {
1774 return vector_.isAliased( alias );
// assign( DenseVector, DVecScalarMultExpr ) dispatch: unwraps the inner
// multiplication's operands, handles empty matrices, then forwards to the
// default or BLAS kernel with the scalar. NOTE(review): lines 1798-1802,
// 1805, 1807-1808, 1810-1821, 1823, 1825 are missing from this extraction.
1796 template<
typename VT1 >
1797 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
1803 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
1804 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
1806 if( left.rows() == 0UL ) {
1809 else if( left.columns() == 0UL ) {
1822 if( ( IsComputation<MT>::value && !evaluate ) ||
1824 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, x, rhs.scalar_ );
1826 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Default scalar assign kernel for y = (A * x) * scalar: computes A*x exactly
// as the unscaled kernel, then (on the final loop, original ~1869-1870) scales
// each element — the scaling statement itself is on a missing line.
1844 template<
typename VT1
1848 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1849 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1851 const size_t M( A.rows() );
1852 const size_t N( A.columns() );
1855 const size_t iend( M &
size_t(-2) );
1857 for(
size_t i=0UL; i<M; ++i ) {
1858 y[i] = x[0UL] * A(i,0UL);
1860 for(
size_t j=1UL; j<N; ++j ) {
1861 for(
size_t i=0UL; i<iend; i+=2UL ) {
1862 y[i ] += x[j] * A(i ,j);
1863 y[i+1UL] += x[j] * A(i+1UL,j);
1866 y[iend] += x[j] * A(iend,j);
// Final scaling pass over all elements of y.
1869 for(
size_t i=0UL; i<M; ++i ) {
// Vectorized assign kernel: accumulates A*x in registers and multiplies by a
// broadcast 'factor' (presumably set( scalar ) on a missing line) at store
// time. Unrolled 8x/4x/3x/2x/1x over IT::size-row blocks.
1889 template<
typename VT1
1893 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1894 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1896 typedef IntrinsicTrait<ElementType> IT;
1898 const size_t M( A.rows() );
1899 const size_t N( A.columns() );
1905 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1906 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1907 for(
size_t j=0UL; j<N; ++j ) {
1909 xmm1 = xmm1 + A.get(i ,j) * x1;
1910 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
1911 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
1912 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
1913 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
1914 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
1915 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
1916 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
1918 store( &y[i ], xmm1*factor );
1919 store( &y[i+IT::size ], xmm2*factor );
1920 store( &y[i+IT::size*2UL], xmm3*factor );
1921 store( &y[i+IT::size*3UL], xmm4*factor );
1922 store( &y[i+IT::size*4UL], xmm5*factor );
1923 store( &y[i+IT::size*5UL], xmm6*factor );
1924 store( &y[i+IT::size*6UL], xmm7*factor );
1925 store( &y[i+IT::size*7UL], xmm8*factor );
1927 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1929 for(
size_t j=0UL; j<N; ++j ) {
1931 xmm1 = xmm1 + A.get(i ,j) * x1;
1932 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
1933 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
1934 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
1936 store( &y[i ], xmm1*factor );
1937 store( &y[i+IT::size ], xmm2*factor );
1938 store( &y[i+IT::size*2UL], xmm3*factor );
1939 store( &y[i+IT::size*3UL], xmm4*factor );
1941 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
1943 for(
size_t j=0UL; j<N; ++j ) {
1945 xmm1 = xmm1 + A.get(i ,j) * x1;
1946 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
1947 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
1949 store( &y[i ], xmm1*factor );
1950 store( &y[i+IT::size ], xmm2*factor );
1951 store( &y[i+IT::size*2UL], xmm3*factor );
1953 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1955 for(
size_t j=0UL; j<N; ++j ) {
1957 xmm1 = xmm1 + A.get(i ,j) * x1;
1958 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
1960 store( &y[i ], xmm1*factor );
1961 store( &y[i+IT::size], xmm2*factor );
1965 for(
size_t j=0UL; j<N; ++j ) {
1967 xmm1 = xmm1 + A.get(i,j) * x1;
1969 store( &y[i], xmm1*factor );
// --- BLAS assign kernels for y = (A * x) * scalar: scalar becomes alpha -----
// Fallback when no BLAS kernel applies.
1988 template<
typename VT1
1992 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1993 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1995 selectDefaultAssignKernel( y, A, x, scalar );
// Single-precision: cblas_sgemv with alpha=scalar, beta=0.
2014 template<
typename VT1
2018 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2019 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2021 using boost::numeric_cast;
2027 const int M ( numeric_cast<int>( A.rows() ) );
2028 const int N ( numeric_cast<int>( A.columns() ) );
2029 const int lda( numeric_cast<int>( A.spacing() ) );
2031 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2032 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision: cblas_dgemv with alpha=scalar, beta=0.
2052 template<
typename VT1
2056 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2057 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2059 using boost::numeric_cast;
2065 const int M ( numeric_cast<int>( A.rows() ) );
2066 const int N ( numeric_cast<int>( A.columns() ) );
2067 const int lda( numeric_cast<int>( A.spacing() ) );
2069 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2070 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float>: scalar converted to complex alpha; beta=0+0i.
2090 template<
typename VT1
2094 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2095 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2097 using boost::numeric_cast;
2106 const int M ( numeric_cast<int>( A.rows() ) );
2107 const int N ( numeric_cast<int>( A.columns() ) );
2108 const int lda( numeric_cast<int>( A.spacing() ) );
2109 const complex<float> alpha( scalar );
2110 const complex<float> beta ( 0.0F, 0.0F );
2112 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2113 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double>: scalar converted to complex alpha; beta=0+0i.
2133 template<
typename VT1
2137 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2138 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2140 using boost::numeric_cast;
2149 const int M ( numeric_cast<int>( A.rows() ) );
2150 const int N ( numeric_cast<int>( A.columns() ) );
2151 const int lda( numeric_cast<int>( A.spacing() ) );
2152 const complex<double> alpha( scalar );
2153 const complex<double> beta ( 0.0, 0.0 );
2155 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2156 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Sparse-vector assign overload header (original ~2173-2174); body not visible.
2173 template<
typename VT1 >
2174 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// addAssign( DenseVector, DVecScalarMultExpr ) dispatch: no-op for empty
// operands; otherwise forwards to the default or BLAS add-assign kernel with
// the scalar. NOTE(review): lines 2203-2207, 2210, 2212-2222, 2224, 2226 are
// missing from this extraction.
2201 template<
typename VT1 >
2202 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2208 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2209 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2211 if( left.rows() == 0UL || left.columns() == 0UL ) {
2223 if( ( IsComputation<MT>::value && !evaluate ) ||
2225 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
2227 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Default scalar add-assign kernel: delegates to the target's addAssign with
// the fully-formed scaled expression (no manual loop needed here).
2245 template<
typename VT1
2249 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2250 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2252 y.addAssign( A * x * scalar );
// Vectorized add-assign kernel for y += (A * x) * scalar: accumulates A*x in
// registers, then stores load(y) + accumulator*factor, where 'factor' is
// presumably a broadcast of 'scalar' (set on a missing line) — confirm.
// Unrolled 8x/4x/3x/2x/1x over IT::size-row blocks like the other kernels.
2270 template<
typename VT1
2274 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2275 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2277 typedef IntrinsicTrait<ElementType> IT;
2279 const size_t M( A.rows() );
2280 const size_t N( A.columns() );
2286 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2287 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2288 for(
size_t j=0UL; j<N; ++j ) {
2290 xmm1 = xmm1 + A.get(i ,j) * x1;
2291 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2292 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2293 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2294 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
2295 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
2296 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
2297 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
2299 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2300 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) + xmm2*factor );
2301 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) + xmm3*factor );
2302 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) + xmm4*factor );
2303 store( &y[i+IT::size*4UL],
load( &y[i+IT::size*4UL] ) + xmm5*factor );
2304 store( &y[i+IT::size*5UL],
load( &y[i+IT::size*5UL] ) + xmm6*factor );
2305 store( &y[i+IT::size*6UL],
load( &y[i+IT::size*6UL] ) + xmm7*factor );
2306 store( &y[i+IT::size*7UL],
load( &y[i+IT::size*7UL] ) + xmm8*factor );
2308 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2310 for(
size_t j=0UL; j<N; ++j ) {
2312 xmm1 = xmm1 + A.get(i ,j) * x1;
2313 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2314 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2315 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2317 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2318 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) + xmm2*factor );
2319 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) + xmm3*factor );
2320 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) + xmm4*factor );
2322 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
2324 for(
size_t j=0UL; j<N; ++j ) {
2326 xmm1 = xmm1 + A.get(i ,j) * x1;
2327 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2328 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2330 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2331 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) + xmm2*factor );
2332 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) + xmm3*factor );
2334 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2336 for(
size_t j=0UL; j<N; ++j ) {
2338 xmm1 = xmm1 + A.get(i ,j) * x1;
2339 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
2341 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2342 store( &y[i+IT::size],
load( &y[i+IT::size] ) + xmm2*factor );
2346 for(
size_t j=0UL; j<N; ++j ) {
2347 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
2349 store( &y[i],
load( &y[i] ) + xmm1*factor );
// --- BLAS add-assign kernels for y += (A * x) * scalar: alpha=scalar, beta=1
// Fallback when no BLAS kernel applies.
2368 template<
typename VT1
2372 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2373 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2375 selectDefaultAddAssignKernel( y, A, x, scalar );
// Single-precision: cblas_sgemv with alpha=scalar, beta=1.
2394 template<
typename VT1
2398 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2399 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2401 using boost::numeric_cast;
2407 const int M ( numeric_cast<int>( A.rows() ) );
2408 const int N ( numeric_cast<int>( A.columns() ) );
2409 const int lda( numeric_cast<int>( A.spacing() ) );
2411 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2412 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision: cblas_dgemv with alpha=scalar, beta=1.
2432 template<
typename VT1
2436 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2437 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2439 using boost::numeric_cast;
2445 const int M ( numeric_cast<int>( A.rows() ) );
2446 const int N ( numeric_cast<int>( A.columns() ) );
2447 const int lda( numeric_cast<int>( A.spacing() ) );
2449 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2450 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float>: scalar converted to complex alpha; beta=1+0i.
2470 template<
typename VT1
2474 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2475 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2477 using boost::numeric_cast;
2486 const int M ( numeric_cast<int>( A.rows() ) );
2487 const int N ( numeric_cast<int>( A.columns() ) );
2488 const int lda( numeric_cast<int>( A.spacing() ) );
2489 const complex<float> alpha( scalar );
2490 const complex<float> beta ( 1.0F, 0.0F );
2492 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2493 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> kernel — truncated: this chunk ends mid-function; the
// remainder (lda, alpha/beta, cblas_zgemv call) lies beyond this view.
2513 template<
typename VT1
2517 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2518 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2520 using boost::numeric_cast;
2529 const int M ( numeric_cast<int>( A.rows() ) );
2530 const int N ( numeric_cast<int>( A.columns() ) );
2531 const int lda( numeric_cast<int>( A.spacing() ) );
2532 const complex<double> alpha( scalar );
2533 const complex<double> beta ( 1.0, 0.0 );
2535 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2536 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): extraction-garbled fragment — original file line numbers (2557, 2558, ...)
// are fused into the code, and several statements are missing from this view (evaluation of
// the operands into A and x, the early-return body, the second half of the kernel-dispatch
// condition, constraint/assert lines). Kept byte-identical; restore from the full source.
/*!\brief Subtraction assignment of a scaled transpose dense matrix-dense vector
// multiplication to a dense vector — presumably \f$ \vec{y}-=s*A*\vec{x} \f$; confirm
// against the complete file.
*/
2557 template<
typename VT1 >
2558 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Split the inner matrix-vector multiplication into its matrix and vector operands.
2564 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2565 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
// Nothing to subtract for an empty matrix (return body not visible in this view).
2567 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Dispatch: default kernel for computation expressions that are not evaluated (condition
// is truncated here — the missing half presumably compares against a size threshold).
2579 if( ( IsComputation<MT>::value && !evaluate ) ||
2581 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
2583 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
2601 template<
typename VT1
2605 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2606 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2608 y.subAssign( A * x * scalar );
2626 template<
typename VT1
2630 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2631 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2633 typedef IntrinsicTrait<ElementType> IT;
2635 const size_t M( A.rows() );
2636 const size_t N( A.columns() );
2642 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2643 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2644 for(
size_t j=0UL; j<N; ++j ) {
2646 xmm1 = xmm1 + A.get(i ,j) * x1;
2647 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2648 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2649 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2650 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
2651 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
2652 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
2653 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
2655 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2656 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) - xmm2*factor );
2657 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) - xmm3*factor );
2658 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) - xmm4*factor );
2659 store( &y[i+IT::size*4UL],
load( &y[i+IT::size*4UL] ) - xmm5*factor );
2660 store( &y[i+IT::size*5UL],
load( &y[i+IT::size*5UL] ) - xmm6*factor );
2661 store( &y[i+IT::size*6UL],
load( &y[i+IT::size*6UL] ) - xmm7*factor );
2662 store( &y[i+IT::size*7UL],
load( &y[i+IT::size*7UL] ) - xmm8*factor );
2664 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2666 for(
size_t j=0UL; j<N; ++j ) {
2668 xmm1 = xmm1 + A.get(i ,j) * x1;
2669 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2670 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2671 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2673 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2674 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) - xmm2*factor );
2675 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) - xmm3*factor );
2676 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) - xmm4*factor );
2678 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
2680 for(
size_t j=0UL; j<N; ++j ) {
2682 xmm1 = xmm1 + A.get(i ,j) * x1;
2683 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2684 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2686 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2687 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) - xmm2*factor );
2688 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) - xmm3*factor );
2690 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2692 for(
size_t j=0UL; j<N; ++j ) {
2694 xmm1 = xmm1 + A.get(i ,j) * x1;
2695 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
2697 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2698 store( &y[i+IT::size],
load( &y[i+IT::size] ) - xmm2*factor );
2702 for(
size_t j=0UL; j<N; ++j ) {
2703 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
2705 store( &y[i],
load( &y[i] ) - xmm1*factor );
2724 template<
typename VT1
2728 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2729 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2731 selectDefaultSubAssignKernel( y, A, x, scalar );
2750 template<
typename VT1
2754 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2755 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2757 using boost::numeric_cast;
2763 const int M ( numeric_cast<int>( A.rows() ) );
2764 const int N ( numeric_cast<int>( A.columns() ) );
2765 const int lda( numeric_cast<int>( A.spacing() ) );
2767 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
2768 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
2788 template<
typename VT1
2792 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2793 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2795 using boost::numeric_cast;
2801 const int M ( numeric_cast<int>( A.rows() ) );
2802 const int N ( numeric_cast<int>( A.columns() ) );
2803 const int lda( numeric_cast<int>( A.spacing() ) );
2805 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
2806 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
2826 template<
typename VT1
2830 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2831 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2833 using boost::numeric_cast;
2842 const int M ( numeric_cast<int>( A.rows() ) );
2843 const int N ( numeric_cast<int>( A.columns() ) );
2844 const int lda( numeric_cast<int>( A.spacing() ) );
2845 const complex<float> alpha( -scalar );
2846 const complex<float> beta ( 1.0F, 0.0F );
2848 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2849 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2869 template<
typename VT1
2873 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2874 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2876 using boost::numeric_cast;
2885 const int M ( numeric_cast<int>( A.rows() ) );
2886 const int N ( numeric_cast<int>( A.columns() ) );
2887 const int lda( numeric_cast<int>( A.spacing() ) );
2888 const complex<double> alpha( -scalar );
2889 const complex<double> beta ( 1.0, 0.0 );
2891 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2892 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): extraction-garbled fragment — only the signature of this friend function is
// visible here; the entire body was dropped by the extraction. Kept byte-identical.
/*!\brief Multiplication assignment of a scaled transpose dense matrix-dense vector
// multiplication to a dense vector — presumably evaluates the expression into a temporary
// and multiplies it onto \a lhs; confirm against the complete file.
*/
2913 template<
typename VT1 >
2914 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2987 template<
typename T1
2989 inline const typename DisableIf< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >::Type
2995 throw std::invalid_argument(
"Matrix and vector sizes do not match" );