22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
31 #include <boost/cast.hpp>
// NOTE(review): this chunk is a mangled extraction — original file line numbers
// are embedded in the text and statements are split across lines. The comments
// below annotate the fragments without altering any code token.
// Shorthand typedefs for the operands of the transpose dense matrix / dense
// vector multiplication expression:
//   MRT/VRT — result types of the matrix resp. vector operand
//   MET/VET — their element types
//   MCT/VCT — their composite (evaluation) types
90 typedef typename MT::ResultType
MRT;
91 typedef typename VT::ResultType
VRT;
92 typedef typename MRT::ElementType
MET;
93 typedef typename VRT::ElementType
VET;
94 typedef typename MT::CompositeType
MCT;
95 typedef typename VT::CompositeType
VCT;
// Helper metafunction: selects the single-precision BLAS kernel (sgemv).
// Its 'value' member is not visible in this fragment — presumably it requires
// all three element types to be 'float' (mirroring UseDoublePrecisionKernel
// below) — TODO confirm against the full file.
110 template<
typename T1,
typename T2,
typename T3 >
111 struct UseSinglePrecisionKernel {
125 template<
typename T1,
typename T2,
typename T3 >
126 struct UseDoublePrecisionKernel {
127 enum { value = IsDouble<typename T1::ElementType>::value &&
128 IsDouble<typename T2::ElementType>::value &&
129 IsDouble<typename T3::ElementType>::value };
140 template<
typename T1,
typename T2,
typename T3 >
141 struct UseSinglePrecisionComplexKernel {
142 typedef complex<float> Type;
143 enum { value = IsSame<typename T1::ElementType,Type>::value &&
144 IsSame<typename T2::ElementType,Type>::value &&
145 IsSame<typename T3::ElementType,Type>::value };
156 template<
typename T1,
typename T2,
typename T3 >
157 struct UseDoublePrecisionComplexKernel {
158 typedef complex<double> Type;
159 enum { value = IsSame<typename T1::ElementType,Type>::value &&
160 IsSame<typename T2::ElementType,Type>::value &&
161 IsSame<typename T3::ElementType,Type>::value };
171 template<
typename T1,
typename T2,
typename T3 >
172 struct UseDefaultKernel {
173 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
174 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
175 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
176 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
187 template<
typename T1,
typename T2,
typename T3 >
188 struct UseVectorizedDefaultKernel {
189 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
190 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
191 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
192 IntrinsicTrait<typename T1::ElementType>::addition &&
193 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compile-time flag: the expression itself offers no intrinsic (vectorized)
// element access to its callers.
223 enum { vectorizable = 0 };
// Fragment of the subscript evaluation: pairwise-unrolled dot product of a
// matrix row with the vector (j advances by 2 with a tail fix-up below).
257 if(
mat_.columns() != 0UL ) {
259 for(
size_t j=1UL; j<
end_; j+=2UL ) {
262 if( end_ <
mat_.columns() ) {
310 template<
typename T >
// Fragment of assign( DenseVector&, TDMatDVecMultExpr& ): handles degenerate
// operand sizes, then dispatches to the default or the BLAS kernel.
337 template<
typename VT1 >
342 if( rhs.
mat_.rows() == 0UL ) {
345 else if( rhs.
mat_.columns() == 0UL ) {
360 TDMatDVecMultExpr::selectDefaultAssignKernel( ~lhs, A, x );
362 TDMatDVecMultExpr::selectBlasAssignKernel( ~lhs, A, x );
// Scalar default kernel: y = A * x, unrolled two rows at a time.
381 template<
typename VT1
385 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
387 const size_t M( A.rows() );
388 const size_t N( A.columns() );
// Round M down to an even row count for the 2x-unrolled inner loop.
391 const size_t iend( M &
size_t(-2) );
// Initialize y with the first column, then accumulate the remaining columns.
393 for(
size_t i=0UL; i<M; ++i ) {
394 y[i] = x[0UL] * A(i,0UL);
396 for(
size_t j=1UL; j<N; ++j ) {
397 for(
size_t i=0UL; i<iend; i+=2UL ) {
398 y[i ] += x[j] * A(i ,j);
399 y[i+1UL] += x[j] * A(i+1UL,j);
// Odd-row tail.
402 y[iend] += x[j] * A(iend,j);
// Vectorized default kernel: accumulates 8/4/3/2/1 intrinsic-width row panels
// of y across all columns j, then stores each panel once.
423 template<
typename VT1
426 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
427 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
429 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is A.spacing(), i.e. the padded row count — the intrinsic
// loops appear to rely on padded storage; confirm against the full file.
431 const size_t M( A.spacing() );
432 const size_t N( A.columns() );
// 8-panel main loop.
436 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
437 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
438 for(
size_t j=0UL; j<N; ++j ) {
// x1 is presumably the broadcast value set( x[j] ); its declaration is not
// visible in this fragment.
440 xmm1 = xmm1 + A.get(i ,j) * x1;
441 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
442 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
443 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
444 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
445 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
446 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
447 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
449 store( &y[i ], xmm1 );
450 store( &y[i+IT::size ], xmm2 );
451 store( &y[i+IT::size*2UL], xmm3 );
452 store( &y[i+IT::size*3UL], xmm4 );
453 store( &y[i+IT::size*4UL], xmm5 );
454 store( &y[i+IT::size*5UL], xmm6 );
455 store( &y[i+IT::size*6UL], xmm7 );
456 store( &y[i+IT::size*7UL], xmm8 );
// 4-panel remainder loop.
458 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
460 for(
size_t j=0UL; j<N; ++j ) {
462 xmm1 = xmm1 + A.get(i ,j) * x1;
463 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
464 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
465 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
467 store( &y[i ], xmm1 );
468 store( &y[i+IT::size ], xmm2 );
469 store( &y[i+IT::size*2UL], xmm3 );
470 store( &y[i+IT::size*3UL], xmm4 );
// 3-panel remainder loop.
472 for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
474 for(
size_t j=0UL; j<N; ++j ) {
476 xmm1 = xmm1 + A.get(i ,j) * x1;
477 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
478 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
480 store( &y[i ], xmm1 );
481 store( &y[i+IT::size ], xmm2 );
482 store( &y[i+IT::size*2UL], xmm3 );
// 2-panel remainder loop.
484 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
486 for(
size_t j=0UL; j<N; ++j ) {
488 xmm1 = xmm1 + A.get(i ,j) * x1;
489 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
491 store( &y[i ], xmm1 );
492 store( &y[i+IT::size], xmm2 );
// Final single-panel tail.
496 for(
size_t j=0UL; j<N; ++j ) {
497 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
499 store( &y[i], xmm1 );
519 template<
typename VT1
522 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
523 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
525 selectDefaultAssignKernel( y, A, x );
// BLAS kernel (single precision): y = A * x via cblas_sgemv. CblasColMajor
// matches the column-major layout of the transpose dense matrix operand;
// lda is the padded row spacing. alpha = 1, beta = 0 (plain assignment).
545 template<
typename VT1
548 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
549 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
551 using boost::numeric_cast;
// numeric_cast guards against size_t -> int overflow in the BLAS interface.
557 const int M ( numeric_cast<int>( A.rows() ) );
558 const int N ( numeric_cast<int>( A.columns() ) );
559 const int lda( numeric_cast<int>( A.spacing() ) );
561 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
562 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS kernel (double precision): y = A * x via cblas_dgemv.
583 template<
typename VT1
586 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
587 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
589 using boost::numeric_cast;
595 const int M ( numeric_cast<int>( A.rows() ) );
596 const int N ( numeric_cast<int>( A.columns() ) );
597 const int lda( numeric_cast<int>( A.spacing() ) );
599 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
600 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS kernel (single-precision complex): y = A * x via cblas_cgemv.
// Complex BLAS takes alpha/beta by pointer.
621 template<
typename VT1
624 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
625 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
627 using boost::numeric_cast;
636 const int M ( numeric_cast<int>( A.rows() ) );
637 const int N ( numeric_cast<int>( A.columns() ) );
638 const int lda( numeric_cast<int>( A.spacing() ) );
639 const complex<float> alpha( 1.0F, 0.0F );
640 const complex<float> beta ( 0.0F, 0.0F );
642 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
643 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS kernel (double-precision complex): y = A * x via cblas_zgemv.
664 template<
typename VT1
667 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
668 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
670 using boost::numeric_cast;
679 const int M ( numeric_cast<int>( A.rows() ) );
680 const int N ( numeric_cast<int>( A.columns() ) );
681 const int lda( numeric_cast<int>( A.spacing() ) );
682 const complex<double> alpha( 1.0, 0.0 );
683 const complex<double> beta ( 0.0, 0.0 );
685 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
686 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
704 template<
typename VT1 >
// Fragment of addAssign( DenseVector&, TDMatDVecMultExpr& ): nothing to do
// for empty operands, then dispatch to default or BLAS kernel.
732 template<
typename VT1 >
737 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
739 if( ( IsComputation<MT>::value && !evaluate ) ||
751 TDMatDVecMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x );
753 TDMatDVecMultExpr::selectBlasAddAssignKernel( ~lhs, A, x );
// Scalar default kernel: y += A * x, unrolled two rows at a time.
772 template<
typename VT1
775 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
776 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
778 const size_t M( A.rows() );
779 const size_t N( A.columns() );
// Round M down to an even row count for the 2x-unrolled loop.
782 const size_t iend( M &
size_t(-2) );
784 for(
size_t j=0UL; j<N; ++j ) {
785 for(
size_t i=0UL; i<iend; i+=2UL ) {
786 y[i ] += x[j] * A(i ,j);
787 y[i+1UL] += x[j] * A(i+1UL,j);
// Odd-row tail.
790 y[iend] += x[j] * A(iend,j);
// Vectorized default kernel: y += A * x using 8/4/3/2/1 intrinsic panels.
// Accumulation is additive because the xmm registers start from the values
// implied by the (not visible) loads/initialization of this fragment.
811 template<
typename VT1
814 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
815 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
817 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is the padded spacing, as in the assign kernel.
819 const size_t M( A.spacing() );
820 const size_t N( A.columns() );
// 8-panel main loop.
824 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
833 for(
size_t j=0UL; j<N; ++j ) {
835 xmm1 = xmm1 + A.get(i ,j) * x1;
836 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
837 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
838 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
839 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
840 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
841 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
842 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
844 store( &y[i ], xmm1 );
845 store( &y[i+IT::size ], xmm2 );
846 store( &y[i+IT::size*2UL], xmm3 );
847 store( &y[i+IT::size*3UL], xmm4 );
848 store( &y[i+IT::size*4UL], xmm5 );
849 store( &y[i+IT::size*5UL], xmm6 );
850 store( &y[i+IT::size*6UL], xmm7 );
851 store( &y[i+IT::size*7UL], xmm8 );
// 4-panel remainder loop.
853 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
858 for(
size_t j=0UL; j<N; ++j ) {
860 xmm1 = xmm1 + A.get(i ,j) * x1;
861 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
862 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
863 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
865 store( &y[i ], xmm1 );
866 store( &y[i+IT::size ], xmm2 );
867 store( &y[i+IT::size*2UL], xmm3 );
868 store( &y[i+IT::size*3UL], xmm4 );
// 3-panel remainder loop.
870 for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
874 for(
size_t j=0UL; j<N; ++j ) {
876 xmm1 = xmm1 + A.get(i ,j) * x1;
877 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
878 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
880 store( &y[i ], xmm1 );
881 store( &y[i+IT::size ], xmm2 );
882 store( &y[i+IT::size*2UL], xmm3 );
// 2-panel remainder loop.
884 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
887 for(
size_t j=0UL; j<N; ++j ) {
889 xmm1 = xmm1 + A.get(i ,j) * x1;
890 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
892 store( &y[i ], xmm1 );
893 store( &y[i+IT::size], xmm2 );
// Final single-panel tail.
897 for(
size_t j=0UL; j<N; ++j ) {
898 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
900 store( &y[i], xmm1 );
920 template<
typename VT1
923 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
924 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
926 selectDefaultAddAssignKernel( y, A, x );
// BLAS kernel (single precision): y += A * x via cblas_sgemv
// (alpha = 1, beta = 1 makes gemv accumulate into y).
946 template<
typename VT1
949 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
950 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
952 using boost::numeric_cast;
958 const int M ( numeric_cast<int>( A.rows() ) );
959 const int N ( numeric_cast<int>( A.columns() ) );
960 const int lda( numeric_cast<int>( A.spacing() ) );
962 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
963 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS kernel (double precision): y += A * x via cblas_dgemv.
984 template<
typename VT1
987 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
988 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
990 using boost::numeric_cast;
996 const int M ( numeric_cast<int>( A.rows() ) );
997 const int N ( numeric_cast<int>( A.columns() ) );
998 const int lda( numeric_cast<int>( A.spacing() ) );
1000 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
1001 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS kernel (single-precision complex): y += A * x via cblas_cgemv.
1022 template<
typename VT1
1025 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1026 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1028 using boost::numeric_cast;
1037 const int M ( numeric_cast<int>( A.rows() ) );
1038 const int N ( numeric_cast<int>( A.columns() ) );
1039 const int lda( numeric_cast<int>( A.spacing() ) );
1040 const complex<float> alpha( 1.0F, 0.0F );
1041 const complex<float> beta ( 1.0F, 0.0F );
1043 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1044 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS kernel (double-precision complex): y += A * x via cblas_zgemv.
1065 template<
typename VT1
1068 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1069 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1071 using boost::numeric_cast;
1080 const int M ( numeric_cast<int>( A.rows() ) );
1081 const int N ( numeric_cast<int>( A.columns() ) );
1082 const int lda( numeric_cast<int>( A.spacing() ) );
1083 const complex<double> alpha( 1.0, 0.0 );
1084 const complex<double> beta ( 1.0, 0.0 );
1086 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1087 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment of subAssign( DenseVector&, TDMatDVecMultExpr& ): nothing to do
// for empty operands, then dispatch to default or BLAS kernel.
1110 template<
typename VT1 >
1115 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1127 if( ( IsComputation<MT>::value && !evaluate ) ||
1129 TDMatDVecMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x );
1131 TDMatDVecMultExpr::selectBlasSubAssignKernel( ~lhs, A, x );
// Scalar default kernel: y -= A * x, unrolled two rows at a time.
1150 template<
typename VT1
1153 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1154 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1156 const size_t M( A.rows() );
1157 const size_t N( A.columns() );
// Round M down to an even row count for the 2x-unrolled loop.
1160 const size_t iend( M &
size_t(-2) );
1162 for(
size_t j=0UL; j<N; ++j ) {
1163 for(
size_t i=0UL; i<iend; i+=2UL ) {
1164 y[i ] -= x[j] * A(i ,j);
1165 y[i+1UL] -= x[j] * A(i+1UL,j);
// Odd-row tail.
1168 y[iend] -= x[j] * A(iend,j);
// Vectorized default kernel: y -= A * x — same panel structure as the
// assign/addAssign kernels, but the accumulators subtract each product.
1189 template<
typename VT1
1192 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1193 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1195 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is the padded spacing, as in the other intrinsic kernels.
1197 const size_t M( A.spacing() );
1198 const size_t N( A.columns() );
// 8-panel main loop.
1202 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1211 for(
size_t j=0UL; j<N; ++j ) {
1213 xmm1 = xmm1 - A.get(i ,j) * x1;
1214 xmm2 = xmm2 - A.get(i+IT::size ,j) * x1;
1215 xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
1216 xmm4 = xmm4 - A.get(i+IT::size*3UL,j) * x1;
1217 xmm5 = xmm5 - A.get(i+IT::size*4UL,j) * x1;
1218 xmm6 = xmm6 - A.get(i+IT::size*5UL,j) * x1;
1219 xmm7 = xmm7 - A.get(i+IT::size*6UL,j) * x1;
1220 xmm8 = xmm8 - A.get(i+IT::size*7UL,j) * x1;
1222 store( &y[i ], xmm1 );
1223 store( &y[i+IT::size ], xmm2 );
1224 store( &y[i+IT::size*2UL], xmm3 );
1225 store( &y[i+IT::size*3UL], xmm4 );
1226 store( &y[i+IT::size*4UL], xmm5 );
1227 store( &y[i+IT::size*5UL], xmm6 );
1228 store( &y[i+IT::size*6UL], xmm7 );
1229 store( &y[i+IT::size*7UL], xmm8 );
// 4-panel remainder loop.
1231 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1236 for(
size_t j=0UL; j<N; ++j ) {
1238 xmm1 = xmm1 - A.get(i ,j) * x1;
1239 xmm2 = xmm2 - A.get(i+IT::size ,j) * x1;
1240 xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
1241 xmm4 = xmm4 - A.get(i+IT::size*3UL,j) * x1;
1243 store( &y[i ], xmm1 );
1244 store( &y[i+IT::size ], xmm2 );
1245 store( &y[i+IT::size*2UL], xmm3 );
1246 store( &y[i+IT::size*3UL], xmm4 );
// 3-panel remainder loop.
1248 for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
1252 for(
size_t j=0UL; j<N; ++j ) {
1254 xmm1 = xmm1 - A.get(i ,j) * x1;
1255 xmm2 = xmm2 - A.get(i+IT::size ,j) * x1;
1256 xmm3 = xmm3 - A.get(i+IT::size*2UL,j) * x1;
1258 store( &y[i ], xmm1 );
1259 store( &y[i+IT::size ], xmm2 );
1260 store( &y[i+IT::size*2UL], xmm3 );
// 2-panel remainder loop.
1262 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1265 for(
size_t j=0UL; j<N; ++j ) {
1267 xmm1 = xmm1 - A.get(i ,j) * x1;
1268 xmm2 = xmm2 - A.get(i+IT::size,j) * x1;
1270 store( &y[i ], xmm1 );
1271 store( &y[i+IT::size], xmm2 );
// Final single-panel tail.
1275 for(
size_t j=0UL; j<N; ++j ) {
1276 xmm1 = xmm1 - A.get(i,j) *
set( x[j] );
1278 store( &y[i], xmm1 );
1298 template<
typename VT1
1301 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1302 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1304 selectDefaultSubAssignKernel( y, A, x );
// BLAS kernel (single precision): y -= A * x via cblas_sgemv
// (alpha = -1, beta = 1 turns gemv into a subtraction from y).
1324 template<
typename VT1
1327 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1328 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1330 using boost::numeric_cast;
1336 const int M ( numeric_cast<int>( A.rows() ) );
1337 const int N ( numeric_cast<int>( A.columns() ) );
1338 const int lda( numeric_cast<int>( A.spacing() ) );
1340 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -1.0F,
1341 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS kernel (double precision): y -= A * x via cblas_dgemv.
1362 template<
typename VT1
1365 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1366 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1368 using boost::numeric_cast;
1374 const int M ( numeric_cast<int>( A.rows() ) );
1375 const int N ( numeric_cast<int>( A.columns() ) );
1376 const int lda( numeric_cast<int>( A.spacing() ) );
1378 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -1.0,
1379 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS kernel (single-precision complex): y -= A * x via cblas_cgemv.
1400 template<
typename VT1
1403 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1404 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1406 using boost::numeric_cast;
1415 const int M ( numeric_cast<int>( A.rows() ) );
1416 const int N ( numeric_cast<int>( A.columns() ) );
1417 const int lda( numeric_cast<int>( A.spacing() ) );
1418 const complex<float> alpha( -1.0F, 0.0F );
1419 const complex<float> beta ( 1.0F, 0.0F );
1421 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1422 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS kernel (double-precision complex): y -= A * x via cblas_zgemv.
1443 template<
typename VT1
1446 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1447 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1449 using boost::numeric_cast;
1458 const int M ( numeric_cast<int>( A.rows() ) );
1459 const int N ( numeric_cast<int>( A.columns() ) );
1460 const int lda( numeric_cast<int>( A.spacing() ) );
1461 const complex<double> alpha( -1.0, 0.0 );
1462 const complex<double> beta ( 1.0, 0.0 );
1464 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1465 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1488 template<
typename VT1 >
// Specialization of DVecScalarMultExpr for a scaled transpose dense matrix /
// dense vector multiplication: (A * x) * s. It derives from DenseVector and
// is tagged as an Expression/Computation for the expression-template machinery.
1536 template<
typename MT
1540 :
public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
1541 ,
private Expression
1542 ,
private Computation
// Shorthand typedefs (MVM is the wrapped matrix/vector multiplication):
//   RES      — result type of the inner multiplication
//   MRT/VRT  — result types of the matrix resp. vector operand
//   MET/VET  — their element types; MCT/VCT — their composite types
1547 typedef typename MVM::ResultType RES;
1548 typedef typename MT::ResultType
MRT;
1549 typedef typename VT::ResultType
VRT;
1550 typedef typename MRT::ElementType
MET;
1551 typedef typename VRT::ElementType
VET;
1552 typedef typename MT::CompositeType
MCT;
1553 typedef typename VT::CompositeType
VCT;
// Compile-time flag: evaluate the matrix operand up front when it is a
// non-vectorizable computation whose element type is BLAS compatible.
1558 enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
1559 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
1567 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1568 struct UseSinglePrecisionKernel {
1569 enum { value = IsFloat<typename T1::ElementType>::value &&
1570 IsFloat<typename T2::ElementType>::value &&
1571 IsFloat<typename T3::ElementType>::value &&
1572 !IsComplex<T4>::value };
1581 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1582 struct UseDoublePrecisionKernel {
1583 enum { value = IsDouble<typename T1::ElementType>::value &&
1584 IsDouble<typename T2::ElementType>::value &&
1585 IsDouble<typename T3::ElementType>::value &&
1586 !IsComplex<T4>::value };
1595 template<
typename T1,
typename T2,
typename T3 >
1596 struct UseSinglePrecisionComplexKernel {
1597 typedef complex<float> Type;
1598 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1599 IsSame<typename T2::ElementType,Type>::value &&
1600 IsSame<typename T3::ElementType,Type>::value };
1609 template<
typename T1,
typename T2,
typename T3 >
1610 struct UseDoublePrecisionComplexKernel {
1611 typedef complex<double> Type;
1612 enum { value = IsSame<typename T1::ElementType,Type>::value &&
1613 IsSame<typename T2::ElementType,Type>::value &&
1614 IsSame<typename T3::ElementType,Type>::value };
1622 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1623 struct UseDefaultKernel {
1624 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1625 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1626 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1627 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
1636 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1637 struct UseVectorizedDefaultKernel {
1638 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1639 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1640 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1641 IsSame<typename T1::ElementType,T4>::value &&
1642 IntrinsicTrait<typename T1::ElementType>::addition &&
1643 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type definitions of the scaled-expression specialization:
//   This/ResultType/ElementType/IntrinsicType follow the usual expression-
//   template conventions; LT/RT are the composite types used to evaluate the
//   matrix resp. vector operand (forcing evaluation when flagged above).
1649 typedef DVecScalarMultExpr<MVM,ST,false>
This;
1650 typedef typename MultTrait<RES,ST>::Type
ResultType;
1652 typedef typename ResultType::ElementType
ElementType;
1653 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1664 typedef typename SelectType< evaluate, const MRT, MCT >::Type
LT;
1667 typedef typename SelectType< IsComputation<VT>::value,
const VRT,
VCT >::Type
RT;
// The expression itself offers no intrinsic element access.
1672 enum { vectorizable = 0 };
// Aliasing detection is delegated to the wrapped multiplication expression.
1675 enum { canAlias = CanAlias<MVM>::value };
// Constructor: stores the wrapped multiplication and the scalar factor.
1684 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
// Subscript operator fragment: scales the wrapped expression's element.
1698 return vector_[index] * scalar_;
// Returns the size of the wrapped vector expression.
1707 inline size_t size()
const {
1708 return vector_.size();
1738 template<
typename T >
1739 inline bool isAliased(
const T* alias )
const {
1740 return CanAlias<MVM>::value && vector_.isAliased( alias );
// Fragment of assign( DenseVector&, DVecScalarMultExpr& ): unwraps the inner
// multiplication's operands, handles degenerate sizes, then dispatches to the
// default or the BLAS kernel with the scalar factor.
1762 template<
typename VT1 >
1763 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
1767 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
1768 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
1770 if( left.rows() == 0UL ) {
1773 else if( left.columns() == 0UL ) {
1786 if( ( IsComputation<MT>::value && !evaluate ) ||
1788 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, x, rhs.scalar_ );
1790 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Scalar default kernel: y = (A * x) * scalar. First computes y = A * x with
// 2x row unrolling, then (final loop fragment) scales y in place.
1808 template<
typename VT1
1812 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1813 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1815 const size_t M( A.rows() );
1816 const size_t N( A.columns() );
// Round M down to an even row count for the 2x-unrolled loop.
1819 const size_t iend( M &
size_t(-2) );
1821 for(
size_t i=0UL; i<M; ++i ) {
1822 y[i] = x[0UL] * A(i,0UL);
1824 for(
size_t j=1UL; j<N; ++j ) {
1825 for(
size_t i=0UL; i<iend; i+=2UL ) {
1826 y[i ] += x[j] * A(i ,j);
1827 y[i+1UL] += x[j] * A(i+1UL,j);
// Odd-row tail.
1830 y[iend] += x[j] * A(iend,j);
// Final scaling pass over y (body not visible in this fragment).
1833 for(
size_t i=0UL; i<M; ++i ) {
// Vectorized default kernel: y = (A * x) * scalar. Same 8/4/3/2/1 panel
// structure as the unscaled kernel; 'factor' is presumably the broadcast
// scalar (its declaration is not visible here) applied at store time.
1853 template<
typename VT1
1857 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1858 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1860 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is the padded spacing, as in the unscaled kernels.
1862 const size_t M( A.spacing() );
1863 const size_t N( A.columns() );
// 8-panel main loop.
1869 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
1870 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1871 for(
size_t j=0UL; j<N; ++j ) {
1873 xmm1 = xmm1 + A.get(i ,j) * x1;
1874 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
1875 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
1876 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
1877 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
1878 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
1879 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
1880 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
1882 store( &y[i ], xmm1*factor );
1883 store( &y[i+IT::size ], xmm2*factor );
1884 store( &y[i+IT::size*2UL], xmm3*factor );
1885 store( &y[i+IT::size*3UL], xmm4*factor );
1886 store( &y[i+IT::size*4UL], xmm5*factor );
1887 store( &y[i+IT::size*5UL], xmm6*factor );
1888 store( &y[i+IT::size*6UL], xmm7*factor );
1889 store( &y[i+IT::size*7UL], xmm8*factor );
// 4-panel remainder loop.
1891 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
1893 for(
size_t j=0UL; j<N; ++j ) {
1895 xmm1 = xmm1 + A.get(i ,j) * x1;
1896 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
1897 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
1898 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
1900 store( &y[i ], xmm1*factor );
1901 store( &y[i+IT::size ], xmm2*factor );
1902 store( &y[i+IT::size*2UL], xmm3*factor );
1903 store( &y[i+IT::size*3UL], xmm4*factor );
// 3-panel remainder loop.
1905 for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
1907 for(
size_t j=0UL; j<N; ++j ) {
1909 xmm1 = xmm1 + A.get(i ,j) * x1;
1910 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
1911 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
1913 store( &y[i ], xmm1*factor );
1914 store( &y[i+IT::size ], xmm2*factor );
1915 store( &y[i+IT::size*2UL], xmm3*factor );
// 2-panel remainder loop.
1917 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
1919 for(
size_t j=0UL; j<N; ++j ) {
1921 xmm1 = xmm1 + A.get(i ,j) * x1;
1922 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
1924 store( &y[i ], xmm1*factor );
1925 store( &y[i+IT::size], xmm2*factor );
// Final single-panel tail.
1929 for(
size_t j=0UL; j<N; ++j ) {
1931 xmm1 = xmm1 + A.get(i,j) * x1;
1933 store( &y[i], xmm1*factor );
1952 template<
typename VT1
1956 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1957 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1959 selectDefaultAssignKernel( y, A, x, scalar );
// BLAS kernel (single precision): y = (A * x) * scalar via cblas_sgemv,
// passing the scalar directly as alpha (beta = 0 for plain assignment).
1978 template<
typename VT1
1982 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
1983 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1985 using boost::numeric_cast;
1991 const int M ( numeric_cast<int>( A.rows() ) );
1992 const int N ( numeric_cast<int>( A.columns() ) );
1993 const int lda( numeric_cast<int>( A.spacing() ) );
1995 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
1996 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS kernel (double precision): y = (A * x) * scalar via cblas_dgemv.
2016 template<
typename VT1
2020 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2021 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2023 using boost::numeric_cast;
2029 const int M ( numeric_cast<int>( A.rows() ) );
2030 const int N ( numeric_cast<int>( A.columns() ) );
2031 const int lda( numeric_cast<int>( A.spacing() ) );
2033 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2034 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS kernel (single-precision complex): the scalar is promoted to a
// complex alpha for cblas_cgemv.
2054 template<
typename VT1
2058 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2059 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2061 using boost::numeric_cast;
2071 const int M ( numeric_cast<int>( A.rows() ) );
2072 const int N ( numeric_cast<int>( A.columns() ) );
2073 const int lda( numeric_cast<int>( A.spacing() ) );
2074 const complex<float> alpha( scalar );
2075 const complex<float> beta ( 0.0F, 0.0F );
2077 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2078 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS kernel (double-precision complex): cblas_zgemv counterpart.
2098 template<
typename VT1
2102 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2103 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2105 using boost::numeric_cast;
2115 const int M ( numeric_cast<int>( A.rows() ) );
2116 const int N ( numeric_cast<int>( A.columns() ) );
2117 const int lda( numeric_cast<int>( A.spacing() ) );
2118 const complex<double> alpha( scalar );
2119 const complex<double> beta ( 0.0, 0.0 );
2121 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2122 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2139 template<
typename VT1 >
// Fragment of addAssign( DenseVector&, DVecScalarMultExpr& ): unwraps the
// inner multiplication's operands, skips empty operands, then dispatches.
2165 template<
typename VT1 >
2166 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2170 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2171 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2173 if( left.rows() == 0UL || left.columns() == 0UL ) {
2185 if( ( IsComputation<MT>::value && !evaluate ) ||
2187 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
2189 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
2207 template<
typename VT1
2211 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2212 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2214 y.addAssign( A * x * scalar );
// Vectorized addition-assignment kernel: y += (A * x) * scalar. Panels are
// accumulated as in the plain kernel, and each store adds the scaled panel
// onto the previously loaded contents of y ('factor' is presumably the
// broadcast scalar; its declaration is not visible in this fragment).
2232 template<
typename VT1
2236 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2237 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2239 typedef IntrinsicTrait<ElementType> IT;
// NOTE(review): M is the padded spacing, as in the other intrinsic kernels.
2241 const size_t M( A.spacing() );
2242 const size_t N( A.columns() );
// 8-panel main loop.
2248 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2249 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2250 for(
size_t j=0UL; j<N; ++j ) {
2252 xmm1 = xmm1 + A.get(i ,j) * x1;
2253 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2254 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2255 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2256 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
2257 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
2258 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
2259 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
2261 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2262 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) + xmm2*factor );
2263 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) + xmm3*factor );
2264 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) + xmm4*factor );
2265 store( &y[i+IT::size*4UL],
load( &y[i+IT::size*4UL] ) + xmm5*factor );
2266 store( &y[i+IT::size*5UL],
load( &y[i+IT::size*5UL] ) + xmm6*factor );
2267 store( &y[i+IT::size*6UL],
load( &y[i+IT::size*6UL] ) + xmm7*factor );
2268 store( &y[i+IT::size*7UL],
load( &y[i+IT::size*7UL] ) + xmm8*factor );
// 4-panel remainder loop.
2270 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2272 for(
size_t j=0UL; j<N; ++j ) {
2274 xmm1 = xmm1 + A.get(i ,j) * x1;
2275 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2276 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2277 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2279 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2280 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) + xmm2*factor );
2281 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) + xmm3*factor );
2282 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) + xmm4*factor );
// 3-panel remainder loop.
2284 for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
2286 for(
size_t j=0UL; j<N; ++j ) {
2288 xmm1 = xmm1 + A.get(i ,j) * x1;
2289 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2290 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2292 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2293 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) + xmm2*factor );
2294 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) + xmm3*factor );
// 2-panel remainder loop.
2296 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2298 for(
size_t j=0UL; j<N; ++j ) {
2300 xmm1 = xmm1 + A.get(i ,j) * x1;
2301 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
2303 store( &y[i ],
load( &y[i ] ) + xmm1*factor );
2304 store( &y[i+IT::size],
load( &y[i+IT::size] ) + xmm2*factor );
// Final single-panel tail.
2308 for(
size_t j=0UL; j<N; ++j ) {
2309 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
2311 store( &y[i],
load( &y[i] ) + xmm1*factor );
2330 template<
typename VT1
2334 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2335 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2337 selectDefaultAddAssignKernel( y, A, x, scalar );
// BLAS kernel (single precision): y += (A * x) * scalar via cblas_sgemv
// (alpha = scalar, beta = 1 accumulates into y).
2356 template<
typename VT1
2360 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2361 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2363 using boost::numeric_cast;
2369 const int M ( numeric_cast<int>( A.rows() ) );
2370 const int N ( numeric_cast<int>( A.columns() ) );
2371 const int lda( numeric_cast<int>( A.spacing() ) );
2373 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2374 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS kernel (double precision): y += (A * x) * scalar via cblas_dgemv.
2394 template<
typename VT1
2398 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2399 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2401 using boost::numeric_cast;
2407 const int M ( numeric_cast<int>( A.rows() ) );
2408 const int N ( numeric_cast<int>( A.columns() ) );
2409 const int lda( numeric_cast<int>( A.spacing() ) );
2411 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2412 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS kernel (single-precision complex): the scalar becomes a complex alpha
// for cblas_cgemv; beta = 1 accumulates into y.
2432 template<
typename VT1
2436 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2437 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2439 using boost::numeric_cast;
2449 const int M ( numeric_cast<int>( A.rows() ) );
2450 const int N ( numeric_cast<int>( A.columns() ) );
2451 const int lda( numeric_cast<int>( A.spacing() ) );
2452 const complex<float> alpha( scalar );
2453 const complex<float> beta ( 1.0F, 0.0F );
2455 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2456 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS kernel (double-precision complex): cblas_zgemv counterpart.
2476 template<
typename VT1
2480 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2481 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2483 using boost::numeric_cast;
2493 const int M ( numeric_cast<int>( A.rows() ) );
2494 const int N ( numeric_cast<int>( A.columns() ) );
2495 const int lda( numeric_cast<int>( A.spacing() ) );
2496 const complex<double> alpha( scalar );
2497 const complex<double> beta ( 1.0, 0.0 );
2499 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2500 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2521 template<
typename VT1 >
2522 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2526 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2527 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2529 if( left.rows() == 0UL || left.columns() == 0UL ) {
2541 if( ( IsComputation<MT>::value && !evaluate ) ||
2543 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
2545 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
2563 template<
typename VT1
2567 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2568 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2570 y.subAssign( A * x * scalar );
2588 template<
typename VT1
2592 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2593 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2595 typedef IntrinsicTrait<ElementType> IT;
2597 const size_t M( A.spacing() );
2598 const size_t N( A.columns() );
2604 for( ; (i+IT::size*8UL) <= M; i+=IT::size*8UL ) {
2605 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2606 for(
size_t j=0UL; j<N; ++j ) {
2608 xmm1 = xmm1 + A.get(i ,j) * x1;
2609 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2610 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2611 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2612 xmm5 = xmm5 + A.get(i+IT::size*4UL,j) * x1;
2613 xmm6 = xmm6 + A.get(i+IT::size*5UL,j) * x1;
2614 xmm7 = xmm7 + A.get(i+IT::size*6UL,j) * x1;
2615 xmm8 = xmm8 + A.get(i+IT::size*7UL,j) * x1;
2617 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2618 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) - xmm2*factor );
2619 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) - xmm3*factor );
2620 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) - xmm4*factor );
2621 store( &y[i+IT::size*4UL],
load( &y[i+IT::size*4UL] ) - xmm5*factor );
2622 store( &y[i+IT::size*5UL],
load( &y[i+IT::size*5UL] ) - xmm6*factor );
2623 store( &y[i+IT::size*6UL],
load( &y[i+IT::size*6UL] ) - xmm7*factor );
2624 store( &y[i+IT::size*7UL],
load( &y[i+IT::size*7UL] ) - xmm8*factor );
2626 for( ; (i+IT::size*4UL) <= M; i+=IT::size*4UL ) {
2628 for(
size_t j=0UL; j<N; ++j ) {
2630 xmm1 = xmm1 + A.get(i ,j) * x1;
2631 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2632 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2633 xmm4 = xmm4 + A.get(i+IT::size*3UL,j) * x1;
2635 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2636 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) - xmm2*factor );
2637 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) - xmm3*factor );
2638 store( &y[i+IT::size*3UL],
load( &y[i+IT::size*3UL] ) - xmm4*factor );
2640 for( ; (i+IT::size*3UL) <= M; i+=IT::size*3UL ) {
2642 for(
size_t j=0UL; j<N; ++j ) {
2644 xmm1 = xmm1 + A.get(i ,j) * x1;
2645 xmm2 = xmm2 + A.get(i+IT::size ,j) * x1;
2646 xmm3 = xmm3 + A.get(i+IT::size*2UL,j) * x1;
2648 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2649 store( &y[i+IT::size ],
load( &y[i+IT::size ] ) - xmm2*factor );
2650 store( &y[i+IT::size*2UL],
load( &y[i+IT::size*2UL] ) - xmm3*factor );
2652 for( ; (i+IT::size*2UL) <= M; i+=IT::size*2UL ) {
2654 for(
size_t j=0UL; j<N; ++j ) {
2656 xmm1 = xmm1 + A.get(i ,j) * x1;
2657 xmm2 = xmm2 + A.get(i+IT::size,j) * x1;
2659 store( &y[i ],
load( &y[i ] ) - xmm1*factor );
2660 store( &y[i+IT::size],
load( &y[i+IT::size] ) - xmm2*factor );
2664 for(
size_t j=0UL; j<N; ++j ) {
2665 xmm1 = xmm1 + A.get(i,j) *
set( x[j] );
2667 store( &y[i],
load( &y[i] ) - xmm1*factor );
2686 template<
typename VT1
2690 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2691 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2693 selectDefaultSubAssignKernel( y, A, x, scalar );
2712 template<
typename VT1
2716 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2717 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2719 using boost::numeric_cast;
2725 const int M ( numeric_cast<int>( A.rows() ) );
2726 const int N ( numeric_cast<int>( A.columns() ) );
2727 const int lda( numeric_cast<int>( A.spacing() ) );
2729 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
2730 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
2750 template<
typename VT1
2754 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2755 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2757 using boost::numeric_cast;
2763 const int M ( numeric_cast<int>( A.rows() ) );
2764 const int N ( numeric_cast<int>( A.columns() ) );
2765 const int lda( numeric_cast<int>( A.spacing() ) );
2767 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
2768 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
2788 template<
typename VT1
2792 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2793 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2795 using boost::numeric_cast;
2805 const int M ( numeric_cast<int>( A.rows() ) );
2806 const int N ( numeric_cast<int>( A.columns() ) );
2807 const int lda( numeric_cast<int>( A.spacing() ) );
2808 const complex<float> alpha( -scalar );
2809 const complex<float> beta ( 1.0F, 0.0F );
2811 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2812 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2832 template<
typename VT1
2836 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2837 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2839 using boost::numeric_cast;
2849 const int M ( numeric_cast<int>( A.rows() ) );
2850 const int N ( numeric_cast<int>( A.columns() ) );
2851 const int lda( numeric_cast<int>( A.spacing() ) );
2852 const complex<double> alpha( -scalar );
2853 const complex<double> beta ( 1.0, 0.0 );
2855 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2856 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2877 template<
typename VT1 >
2878 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2948 template<
typename T1
2954 throw std::invalid_argument(
"Matrix and vector sizes do not match" );