#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
#define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_

#include <boost/cast.hpp>
template< typename MT, typename VT >
class TDMatDVecMultExpr : public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
{
 private:
   typedef typename MT::ResultType     MRT;
   typedef typename VT::ResultType     VRT;
   typedef typename MRT::ElementType   MET;
   typedef typename VRT::ElementType   VET;
   typedef typename MT::CompositeType  MCT;
   typedef typename VT::CompositeType  VCT;
   template< typename T1, typename T2, typename T3 >
   struct UseSinglePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsFloat<typename T1::ElementType>::value &&
                     IsFloat<typename T2::ElementType>::value &&
                     IsFloat<typename T3::ElementType>::value };
   };
   template< typename T1, typename T2, typename T3 >
   struct UseDoublePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsDouble<typename T1::ElementType>::value &&
                     IsDouble<typename T2::ElementType>::value &&
                     IsDouble<typename T3::ElementType>::value };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseSinglePrecisionComplexKernel {
      typedef complex<float>  Type;
      enum { value = IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseDoublePrecisionComplexKernel {
      typedef complex<double>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };
   template< typename T1, typename T2, typename T3 >
   struct UseDefaultKernel {
      enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
                                           !UseDoublePrecisionKernel<T1,T2,T3>::value &&
                                           !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
                                           !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseVectorizedDefaultKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
                     IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
                     IntrinsicTrait<typename T1::ElementType>::addition &&
                     IntrinsicTrait<typename T1::ElementType>::multiplication };
   };
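   // Editorial sketch (not part of the original header): the trait classes above act as
   // compile-time switches for the kernel selection below. Assuming BLAS mode is enabled,
   // vectorizable float/double (or matching complex) operands are routed to the cblas_?gemv
   // kernels, while every other element type ends up in the fallback overload enabled by
   // UseDefaultKernel, which simply forwards to the (possibly vectorized) default kernel:
   //
   // \code
   // template< typename VT1, typename MT1, typename VT2 >
   // static typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
   //    selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   // {
   //    selectDefaultAssignKernel( y, A, x );  // scalar or intrinsic-based loop kernel
   // }
   // \endcode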
   enum { vectorizable = 0 };  // The expression itself offers no vectorized element access;
                               // vectorization happens inside the assignment kernels below.

      if( mat_.columns() != 0UL ) {
         for( size_t j=1UL; j<end_; j+=2UL ) {
         if( end_ < mat_.columns() ) {
   template< typename T >
   inline bool canAlias( const T* alias ) const {
      return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
   }

   template< typename T >
   inline bool isAliased( const T* alias ) const {
      return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
   }
   template< typename VT1 >
   friend inline void assign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )

      if( rhs.mat_.rows() == 0UL ) {
      else if( rhs.mat_.columns() == 0UL ) {

         TDMatDVecMultExpr::selectDefaultAssignKernel( ~lhs, A, x );

         TDMatDVecMultExpr::selectBlasAssignKernel( ~lhs, A, x );
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t i=0UL; i<M; ++i ) {
         y[i] = x[0UL] * A(i,0UL);
      }
      for( size_t j=1UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] += x[j] * A(i    ,j);
            y[i+1UL] += x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] += x[j] * A(iend,j);
         }
      }
   }
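   // Editorial note: this default kernel evaluates y = A*x for a column-major A by walking
   // one column per iteration of the outer j-loop and unrolling the row updates by two.
   // A rough scalar sketch of the same computation (illustrative only):
   //
   // \code
   // for( size_t i=0UL; i<M; ++i )
   //    y[i] = A(i,0UL) * x[0UL];
   // for( size_t j=1UL; j<N; ++j )
   //    for( size_t i=0UL; i<M; ++i )
   //       y[i] += A(i,j) * x[j];
   // \endcode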
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.spacing() );
      const size_t N( A.columns() );

      for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
         store( &y[i+IT::size*3UL], xmm4 );
         store( &y[i+IT::size*4UL], xmm5 );
         store( &y[i+IT::size*5UL], xmm6 );
         store( &y[i+IT::size*6UL], xmm7 );
         store( &y[i+IT::size*7UL], xmm8 );
      for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
         store( &y[i+IT::size*3UL], xmm4 );
      for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
      for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i         ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
         store( &y[i         ], xmm1 );
         store( &y[i+IT::size], xmm2 );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i,j) * set( x[j] );
         store( &y[i], xmm1 );
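   // Editorial note: the vectorized kernel above processes the rows of A in intrinsic-sized
   // blocks (8, 4, 3, 2 and finally 1 register per step), accumulating each block over all
   // columns before a single store into y. The loop bound is A.spacing(), i.e. the padded
   // column length of the column-major matrix. Here i is the running row index and x1 denotes
   // the broadcast vector element ( set( x[j] ) ) declared in the full kernel body.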
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      selectDefaultAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
                   A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
   }
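   // Editorial note: with alpha = 1.0F and beta = 0.0F this cblas_sgemv call evaluates
   // y = A * x directly; A.spacing() is passed as the leading dimension (lda) of the
   // column-major matrix. The compound-assignment BLAS kernels below only differ in the
   // alpha/beta arguments:
   //
   // \code
   // // gemv computes y = alpha*A*x + beta*y, hence (schematically):
   // //   assign    : alpha =  1, beta = 0
   // //   addAssign : alpha =  1, beta = 1
   // //   subAssign : alpha = -1, beta = 1
   // \endcode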
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
                   A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( 1.0F, 0.0F );
      const complex<float> beta ( 0.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( 1.0, 0.0 );
      const complex<double> beta ( 0.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >
   friend inline void assign( SparseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )

   template< typename VT1 >
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )

      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

      if( ( IsComputation<MT>::value && !evaluate ) ||
         TDMatDVecMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x );
         TDMatDVecMultExpr::selectBlasAddAssignKernel( ~lhs, A, x );
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t j=0UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] += x[j] * A(i    ,j);
            y[i+1UL] += x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] += x[j] * A(iend,j);
         }
      }
   }
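   // Editorial note: unlike the plain assignment kernel, this addition kernel starts at
   // column 0 and only accumulates into y (no initialization pass), since y already holds
   // the left-hand side values of the compound assignment.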
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.spacing() );
      const size_t N( A.columns() );

      for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
         store( &y[i+IT::size*3UL], xmm4 );
         store( &y[i+IT::size*4UL], xmm5 );
         store( &y[i+IT::size*5UL], xmm6 );
         store( &y[i+IT::size*6UL], xmm7 );
         store( &y[i+IT::size*7UL], xmm8 );
      for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
         store( &y[i+IT::size*3UL], xmm4 );
      for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
      for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i         ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
         store( &y[i         ], xmm1 );
         store( &y[i+IT::size], xmm2 );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i,j) * set( x[j] );
         store( &y[i], xmm1 );
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      selectDefaultAddAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }
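   // Editorial note: beta = 1.0F keeps the current contents of y, so this call performs
   // y += A * x in a single cblas_sgemv invocation.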
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( 1.0F, 0.0F );
      const complex<float> beta ( 1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( 1.0, 0.0 );
      const complex<double> beta ( 1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )

      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

      if( ( IsComputation<MT>::value && !evaluate ) ||
         TDMatDVecMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x );
         TDMatDVecMultExpr::selectBlasSubAssignKernel( ~lhs, A, x );
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t j=0UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] -= x[j] * A(i    ,j);
            y[i+1UL] -= x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] -= x[j] * A(iend,j);
         }
      }
   }
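   // Editorial note: the subtraction kernel is identical to the addition kernel above except
   // that each product is subtracted, i.e. it performs y -= A * x column by column.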
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.spacing() );
      const size_t N( A.columns() );

      for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 - A.get(i             ,j) * x1;
            xmm2 = xmm2 - A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 - A.get(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 - A.get(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 - A.get(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 - A.get(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 - A.get(i+IT::size*7UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
         store( &y[i+IT::size*3UL], xmm4 );
         store( &y[i+IT::size*4UL], xmm5 );
         store( &y[i+IT::size*5UL], xmm6 );
         store( &y[i+IT::size*6UL], xmm7 );
         store( &y[i+IT::size*7UL], xmm8 );
      for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 - A.get(i             ,j) * x1;
            xmm2 = xmm2 - A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 - A.get(i+IT::size*3UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
         store( &y[i+IT::size*3UL], xmm4 );
      for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 - A.get(i             ,j) * x1;
            xmm2 = xmm2 - A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
         store( &y[i             ], xmm1 );
         store( &y[i+IT::size    ], xmm2 );
         store( &y[i+IT::size*2UL], xmm3 );
      for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 - A.get(i         ,j) * x1;
            xmm2 = xmm2 - A.get(i+IT::size,j) * x1;
         store( &y[i         ], xmm1 );
         store( &y[i+IT::size], xmm2 );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 - A.get(i,j) * set( x[j] );
         store( &y[i], xmm1 );
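   // Editorial note: in this compound subtraction kernel the accumulator registers start from
   // the current values of y (their load-initialized declarations belong to the full kernel
   // body); each column contribution is then subtracted via xmm = xmm - A.get(...) * x1, and
   // the final stores write the updated values back to y.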
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      selectDefaultSubAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -1.0F,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -1.0,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( -1.0F, 0.0F );
      const complex<float> beta (  1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( -1.0, 0.0 );
      const complex<double> beta (  1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >
   friend inline void multAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
template< typename MT, typename VT, typename ST >
class DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >
   : public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
   , private Expression
   , private Computation
{
 private:
   typedef TDMatDVecMultExpr<MT,VT>  MVM;
   typedef typename MVM::ResultType  RES;
   typedef typename MT::ResultType     MRT;
   typedef typename VT::ResultType     VRT;
   typedef typename MRT::ElementType   MET;
   typedef typename VRT::ElementType   VET;
   typedef typename MT::CompositeType  MCT;
   typedef typename VT::CompositeType  VCT;

   enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
                     IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
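   // Editorial note: 'evaluate' decides whether the matrix operand is pre-evaluated into a
   // temporary of its ResultType before the kernel call. Roughly: a non-vectorizable matrix
   // computation with BLAS-compatible, matching element types is evaluated up front so that
   // the kernels can operate on plain storage (see the LT/RT SelectType typedefs below).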
   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseSinglePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsFloat<typename T1::ElementType>::value &&
                     IsFloat<typename T2::ElementType>::value &&
                     IsFloat<typename T3::ElementType>::value &&
                     !IsComplex<T4>::value };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseDoublePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsDouble<typename T1::ElementType>::value &&
                     IsDouble<typename T2::ElementType>::value &&
                     IsDouble<typename T3::ElementType>::value &&
                     !IsComplex<T4>::value };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseSinglePrecisionComplexKernel {
      typedef complex<float>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseDoublePrecisionComplexKernel {
      typedef complex<double>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseDefaultKernel {
      enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
                                           !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
                                           !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
                                           !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseVectorizedDefaultKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
                     IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
                     IsSame<typename T1::ElementType,T4>::value &&
                     IntrinsicTrait<typename T1::ElementType>::addition &&
                     IntrinsicTrait<typename T1::ElementType>::multiplication };
   };
   typedef DVecScalarMultExpr<MVM,ST,false>            This;
   typedef typename MultTrait<RES,ST>::Type            ResultType;
   typedef typename ResultType::ElementType            ElementType;
   typedef typename IntrinsicTrait<ElementType>::Type  IntrinsicType;

   typedef const TDMatDVecMultExpr<MT,VT>  LeftOperand;

   typedef typename SelectType< evaluate, const MRT, MCT >::Type  LT;
   typedef typename SelectType< IsComputation<VT>::value, const VRT, VCT >::Type  RT;

   enum { vectorizable = 0 };
   explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )

      return vector_[index] * scalar_;

   inline size_t size() const {
      return vector_.size();
   }

   template< typename T >
   inline bool canAlias( const T* alias ) const {
      return vector_.canAlias( alias );
   }

   template< typename T >
   inline bool isAliased( const T* alias ) const {
      return vector_.isAliased( alias );
   }
   template< typename VT1 >
   friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )

      typename MVM::LeftOperand  left ( rhs.vector_.leftOperand()  );
      typename MVM::RightOperand right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL ) {
      else if( left.columns() == 0UL ) {

      if( ( IsComputation<MT>::value && !evaluate ) ||
         DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, x, rhs.scalar_ );
         DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, A, x, rhs.scalar_ );
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t i=0UL; i<M; ++i ) {
         y[i] = x[0UL] * A(i,0UL);
      }
      for( size_t j=1UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] += x[j] * A(i    ,j);
            y[i+1UL] += x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] += x[j] * A(iend,j);
         }
      }
      for( size_t i=0UL; i<M; ++i ) {
         y[i] *= scalar;
      }
   }
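   // Editorial note: the scaled default kernel first computes y = A*x exactly like the
   // unscaled kernel and then scales the result in a separate pass; the vectorized and BLAS
   // kernels below fold the scalar directly into the computation instead.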
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.spacing() );
      const size_t N( A.columns() );

      for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
         store( &y[i             ], xmm1*factor );
         store( &y[i+IT::size    ], xmm2*factor );
         store( &y[i+IT::size*2UL], xmm3*factor );
         store( &y[i+IT::size*3UL], xmm4*factor );
         store( &y[i+IT::size*4UL], xmm5*factor );
         store( &y[i+IT::size*5UL], xmm6*factor );
         store( &y[i+IT::size*6UL], xmm7*factor );
         store( &y[i+IT::size*7UL], xmm8*factor );
      for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
         store( &y[i             ], xmm1*factor );
         store( &y[i+IT::size    ], xmm2*factor );
         store( &y[i+IT::size*2UL], xmm3*factor );
         store( &y[i+IT::size*3UL], xmm4*factor );
      for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
         store( &y[i             ], xmm1*factor );
         store( &y[i+IT::size    ], xmm2*factor );
         store( &y[i+IT::size*2UL], xmm3*factor );
      for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i         ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
         store( &y[i         ], xmm1*factor );
         store( &y[i+IT::size], xmm2*factor );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i,j) * x1;
         store( &y[i], xmm1*factor );
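   // Editorial note: 'factor' (declared from the scalar via set( scalar ) in the full kernel
   // body) broadcasts the scalar into an intrinsic register, so every accumulated block is
   // scaled exactly once on the final store:
   //
   // \code
   // // schematically, per block:
   // //   xmm = sum over j of A(block,j) * x[j]
   // //   y   = xmm * factor          // factor == set( scalar )
   // \endcode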
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      selectDefaultAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
   }
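   // Editorial note: for the BLAS path the scalar simply becomes the gemv alpha argument
   // (alpha = scalar, beta = 0), so no separate scaling pass is needed.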
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( scalar );
      const complex<float> beta ( 0.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( scalar );
      const complex<double> beta ( 0.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >
   friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )

   template< typename VT1 >
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )

      typename MVM::LeftOperand  left ( rhs.vector_.leftOperand()  );
      typename MVM::RightOperand right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL || left.columns() == 0UL ) {

      if( ( IsComputation<MT>::value && !evaluate ) ||
         DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
         DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      y.addAssign( A * x * scalar );
   }
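   // Editorial note: the non-vectorized scaled compound kernels simply re-dispatch the
   // expression ( y.addAssign( A * x * scalar ) and, further below, y.subAssign( A * x * scalar ) )
   // instead of hand-writing the loops; the scalar multiplication is then handled by the
   // nested expression objects.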
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.spacing() );
      const size_t N( A.columns() );

      for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
         store( &y[i             ], load( &y[i             ] ) + xmm1*factor );
         store( &y[i+IT::size    ], load( &y[i+IT::size    ] ) + xmm2*factor );
         store( &y[i+IT::size*2UL], load( &y[i+IT::size*2UL] ) + xmm3*factor );
         store( &y[i+IT::size*3UL], load( &y[i+IT::size*3UL] ) + xmm4*factor );
         store( &y[i+IT::size*4UL], load( &y[i+IT::size*4UL] ) + xmm5*factor );
         store( &y[i+IT::size*5UL], load( &y[i+IT::size*5UL] ) + xmm6*factor );
         store( &y[i+IT::size*6UL], load( &y[i+IT::size*6UL] ) + xmm7*factor );
         store( &y[i+IT::size*7UL], load( &y[i+IT::size*7UL] ) + xmm8*factor );
      for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
         store( &y[i             ], load( &y[i             ] ) + xmm1*factor );
         store( &y[i+IT::size    ], load( &y[i+IT::size    ] ) + xmm2*factor );
         store( &y[i+IT::size*2UL], load( &y[i+IT::size*2UL] ) + xmm3*factor );
         store( &y[i+IT::size*3UL], load( &y[i+IT::size*3UL] ) + xmm4*factor );
      for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
         store( &y[i             ], load( &y[i             ] ) + xmm1*factor );
         store( &y[i+IT::size    ], load( &y[i+IT::size    ] ) + xmm2*factor );
         store( &y[i+IT::size*2UL], load( &y[i+IT::size*2UL] ) + xmm3*factor );
      for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i         ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
         store( &y[i         ], load( &y[i         ] ) + xmm1*factor );
         store( &y[i+IT::size], load( &y[i+IT::size] ) + xmm2*factor );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i,j) * set( x[j] );
         store( &y[i], load( &y[i] ) + xmm1*factor );
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      selectDefaultAddAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( scalar );
      const complex<float> beta ( 1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( scalar );
      const complex<double> beta ( 1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )

      typename MVM::LeftOperand  left ( rhs.vector_.leftOperand()  );
      typename MVM::RightOperand right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL || left.columns() == 0UL ) {

      if( ( IsComputation<MT>::value && !evaluate ) ||
         DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
         DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      y.subAssign( A * x * scalar );
   }
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.spacing() );
      const size_t N( A.columns() );

      for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
         store( &y[i             ], load( &y[i             ] ) - xmm1*factor );
         store( &y[i+IT::size    ], load( &y[i+IT::size    ] ) - xmm2*factor );
         store( &y[i+IT::size*2UL], load( &y[i+IT::size*2UL] ) - xmm3*factor );
         store( &y[i+IT::size*3UL], load( &y[i+IT::size*3UL] ) - xmm4*factor );
         store( &y[i+IT::size*4UL], load( &y[i+IT::size*4UL] ) - xmm5*factor );
         store( &y[i+IT::size*5UL], load( &y[i+IT::size*5UL] ) - xmm6*factor );
         store( &y[i+IT::size*6UL], load( &y[i+IT::size*6UL] ) - xmm7*factor );
         store( &y[i+IT::size*7UL], load( &y[i+IT::size*7UL] ) - xmm8*factor );
      for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
         store( &y[i             ], load( &y[i             ] ) - xmm1*factor );
         store( &y[i+IT::size    ], load( &y[i+IT::size    ] ) - xmm2*factor );
         store( &y[i+IT::size*2UL], load( &y[i+IT::size*2UL] ) - xmm3*factor );
         store( &y[i+IT::size*3UL], load( &y[i+IT::size*3UL] ) - xmm4*factor );
      for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i             ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
         store( &y[i             ], load( &y[i             ] ) - xmm1*factor );
         store( &y[i+IT::size    ], load( &y[i+IT::size    ] ) - xmm2*factor );
         store( &y[i+IT::size*2UL], load( &y[i+IT::size*2UL] ) - xmm3*factor );
      for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i         ,j) * x1;
            xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
         store( &y[i         ], load( &y[i         ] ) - xmm1*factor );
         store( &y[i+IT::size], load( &y[i+IT::size] ) - xmm2*factor );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.get(i,j) * set( x[j] );
         store( &y[i], load( &y[i] ) - xmm1*factor );
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      selectDefaultSubAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( -scalar );
      const complex<float> beta ( 1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( -scalar );
      const complex<double> beta ( 1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >
   friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
template< typename T1, typename T2 >
inline const typename DisableIf< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >::Type
   operator*( const DenseMatrix<T1,true>& mat, const DenseVector<T2,false>& vec )
{
   if( (~mat).columns() != (~vec).size() )
      throw std::invalid_argument( "Matrix and vector sizes do not match" );