22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDVECDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
// NOTE(review): this is an excerpted listing — the original file's line numbers are
// embedded in the text and many intermediate lines are missing; comments are hedged
// where the evidence is not visible in this excerpt.
// Expression class for the multiplication of a transpose dense vector with a
// row-major dense matrix (x^T * A); derives from DenseVector via CRTP and is
// tagged (privately) as a transpose-vector/matrix multiplication expression.
88 class TDVecDMatMultExpr :
public DenseVector< TDVecDMatMultExpr<VT,MT>, true >
,
private TVecMatMultExpr
// Shorthand aliases for the operand types.
94 typedef typename VT::ResultType
VRT;  // result type of the left-hand side dense vector
95 typedef typename MT::ResultType
MRT;  // result type of the right-hand side dense matrix
96 typedef typename VRT::ElementType
VET;  // element type of the vector result
97 typedef typename MRT::ElementType
MET;  // element type of the matrix result
98 typedef typename VT::CompositeType
VCT;  // composite type of the vector operand
99 typedef typename MT::CompositeType
MCT;  // composite type of the matrix operand
// Compile-time kernel selection trait: value is 1 when the single-precision BLAS
// kernel is applicable (all three operand types vectorizable; the rest of the enum
// is missing from this excerpt — presumably IsFloat checks on the element types,
// mirroring the double-precision trait below — TODO confirm).
114 template<
typename T1,
typename T2,
typename T3 >
115 struct UseSinglePrecisionKernel {
116 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Trait: the double-precision BLAS kernel applies when all three operand types are
// vectorizable and all element types are double.
130 template<
typename T1,
typename T2,
typename T3 >
131 struct UseDoublePrecisionKernel {
132 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
133 IsDouble<typename T1::ElementType>::value &&
134 IsDouble<typename T2::ElementType>::value &&
135 IsDouble<typename T3::ElementType>::value };
// Trait: the single-precision complex BLAS kernel applies when all element types
// are exactly complex<float>.
146 template<
typename T1,
typename T2,
typename T3 >
147 struct UseSinglePrecisionComplexKernel {
148 typedef complex<float> Type;
149 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
150 IsSame<typename T1::ElementType,Type>::value &&
151 IsSame<typename T2::ElementType,Type>::value &&
152 IsSame<typename T3::ElementType,Type>::value };
// Trait: the double-precision complex BLAS kernel applies when all element types
// are exactly complex<double>.
163 template<
typename T1,
typename T2,
typename T3 >
164 struct UseDoublePrecisionComplexKernel {
165 typedef complex<double> Type;
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
167 IsSame<typename T1::ElementType,Type>::value &&
168 IsSame<typename T2::ElementType,Type>::value &&
169 IsSame<typename T3::ElementType,Type>::value };
// Trait: fall back to the default (non-BLAS) kernel when BLAS mode is disabled or
// none of the four specialized BLAS kernels above is applicable.
179 template<
typename T1,
typename T2,
typename T3 >
180 struct UseDefaultKernel {
181 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
182 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
183 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
184 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Trait: the intrinsic-vectorized default kernel applies when all operand types are
// vectorizable, share one element type, and that type supports intrinsic addition
// and multiplication.
195 template<
typename T1,
typename T2,
typename T3 >
196 struct UseVectorizedDefaultKernel {
197 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
198 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
199 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
200 IntrinsicTrait<typename T1::ElementType>::addition &&
201 IntrinsicTrait<typename T1::ElementType>::multiplication };
// The expression itself is not vectorizable as an operand; evaluation goes through
// the dedicated kernels instead.
231 enum { vectorizable = 0 };
// Fragment (original lines 260-265): part of an element-evaluation routine that
// pairwise-accumulates products over the matrix rows — most of the body is missing
// from this excerpt; TODO confirm against the full source.
260 if(
mat_.rows() != 0UL ) {
262 for(
size_t j=1UL; j<
end_; j+=2UL ) {
265 if( end_ < mat_.rows() ) {
// The size of the result vector equals the number of matrix columns.
283 return mat_.columns();
// Aliasing queries: the expression aliases a target when either operand does.
// (Both functions visible here forward to the operands' isAliased(); the first is
// presumably canAlias() — its signature line is missing — TODO confirm.)
313 template<
typename T >
315 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
325 template<
typename T >
327 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Dense-vector assignment: early exits when the matrix is empty, then dispatches
// to the default or the BLAS assignment kernel.
351 template<
typename VT1 >
358 if( rhs.mat_.rows() == 0UL ) {
362 else if( rhs.mat_.columns() == 0UL ) {
376 TDVecDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
378 TDVecDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
// Default (non-vectorized) assignment kernel: y = x^T * A, traversing A row by
// row with the column loop unrolled by two.
397 template<
typename VT1
401 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
403 const size_t M( A.rows() );
404 const size_t N( A.columns() );
// Largest even column count: N & ~1. The unrolled loop covers [0,jend); the
// single trailing column for odd N is handled separately below (its guard is not
// visible in this excerpt).
407 const size_t jend( N &
size_t(-2) );
// Initialize y with the contribution of the first matrix row ...
409 for(
size_t j=0UL; j<N; ++j ) {
410 y[j] = x[0UL] * A(0UL,j);
// ... then accumulate the remaining rows, two columns per iteration.
412 for(
size_t i=1UL; i<M; ++i ) {
413 for(
size_t j=0UL; j<jend; j+=2UL ) {
414 y[j ] += x[i] * A(i,j );
415 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column for odd N.
418 y[jend] += x[i] * A(i,jend);
// Intrinsic-vectorized assignment kernel: processes column blocks of width
// 8/4/3/2/1 times the intrinsic vector size, accumulating x[i]*A(i,j..) per block.
// x1 is presumably the broadcast set(x[i]) — its definition line is missing from
// this excerpt; TODO confirm.
439 template<
typename VT1
442 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
443 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
445 typedef IntrinsicTrait<ElementType> IT;
447 const size_t M( A.rows() );
448 const size_t N( A.columns() );
// 8-wide block loop.
452 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
453 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
454 for(
size_t i=0UL; i<M; ++i ) {
456 xmm1 = xmm1 + x1 * A.get(i,j );
457 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
458 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
459 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
460 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
461 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
462 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
463 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
465 store( &y[j ], xmm1 );
466 store( &y[j+IT::size ], xmm2 );
467 store( &y[j+IT::size*2UL], xmm3 );
468 store( &y[j+IT::size*3UL], xmm4 );
469 store( &y[j+IT::size*4UL], xmm5 );
470 store( &y[j+IT::size*5UL], xmm6 );
471 store( &y[j+IT::size*6UL], xmm7 );
472 store( &y[j+IT::size*7UL], xmm8 );
// 4-wide block loop.
474 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
476 for(
size_t i=0UL; i<M; ++i ) {
478 xmm1 = xmm1 + x1 * A.get(i,j );
479 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
480 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
481 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
483 store( &y[j ], xmm1 );
484 store( &y[j+IT::size ], xmm2 );
485 store( &y[j+IT::size*2UL], xmm3 );
486 store( &y[j+IT::size*3UL], xmm4 );
// 3-wide block loop.
488 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
490 for(
size_t i=0UL; i<M; ++i ) {
492 xmm1 = xmm1 + x1 * A.get(i,j );
493 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
494 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
496 store( &y[j ], xmm1 );
497 store( &y[j+IT::size ], xmm2 );
498 store( &y[j+IT::size*2UL], xmm3 );
// 2-wide block loop.
500 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
502 for(
size_t i=0UL; i<M; ++i ) {
504 xmm1 = xmm1 + x1 * A.get(i,j );
505 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
507 store( &y[j ], xmm1 );
508 store( &y[j+IT::size], xmm2 );
// Single-width tail block.
512 for(
size_t i=0UL; i<M; ++i ) {
513 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
515 store( &y[j], xmm1 );
// BLAS assignment kernels. The RowMajor/Trans combination makes cblas_?gemv
// compute y = alpha * A^T * x + beta * y, i.e. the transpose-vector product
// x^T * A; here alpha = 1 and beta = 0 (plain assignment).
// Fallback: no BLAS kernel applicable — delegate to the default kernel.
535 template<
typename VT1
538 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
539 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
541 selectDefaultAssignKernel( y, x, A );
// Single-precision kernel (cblas_sgemv).
561 template<
typename VT1
564 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
565 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
567 using boost::numeric_cast;
// Checked narrowing of the size_t dimensions to the int arguments CBLAS expects.
573 const int M ( numeric_cast<int>( A.rows() ) );
574 const int N ( numeric_cast<int>( A.columns() ) );
575 const int lda( numeric_cast<int>( A.spacing() ) );
577 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
578 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision kernel (cblas_dgemv).
599 template<
typename VT1
602 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
603 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
605 using boost::numeric_cast;
611 const int M ( numeric_cast<int>( A.rows() ) );
612 const int N ( numeric_cast<int>( A.columns() ) );
613 const int lda( numeric_cast<int>( A.spacing() ) );
615 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
616 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single-precision complex kernel (cblas_cgemv); alpha/beta passed by address.
637 template<
typename VT1
640 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
641 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
643 using boost::numeric_cast;
652 const int M ( numeric_cast<int>( A.rows() ) );
653 const int N ( numeric_cast<int>( A.columns() ) );
654 const int lda( numeric_cast<int>( A.spacing() ) );
655 const complex<float> alpha( 1.0F, 0.0F );
656 const complex<float> beta ( 0.0F, 0.0F );
658 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
659 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex kernel (cblas_zgemv).
680 template<
typename VT1
683 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
684 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
686 using boost::numeric_cast;
695 const int M ( numeric_cast<int>( A.rows() ) );
696 const int N ( numeric_cast<int>( A.columns() ) );
697 const int lda( numeric_cast<int>( A.spacing() ) );
698 const complex<double> alpha( 1.0, 0.0 );
699 const complex<double> beta ( 0.0, 0.0 );
701 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
702 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment (original line 721): start of another assignment overload — its body is
// missing from this excerpt.
721 template<
typename VT1 >
// Dense-vector addition assignment (y += x^T * A): early exit on an empty matrix,
// then dispatch to the default or the BLAS add-assignment kernel.
751 template<
typename VT1 >
758 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
770 if( ( IsComputation<MT>::value && !evaluate ) ||
772 TDVecDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
774 TDVecDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
// Default (non-vectorized) add-assignment kernel: accumulates into the existing y
// values, column loop unrolled by two; jend = N & ~1, tail column handled below.
793 template<
typename VT1
796 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
797 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
799 const size_t M( A.rows() );
800 const size_t N( A.columns() );
803 const size_t jend( N &
size_t(-2) );
805 for(
size_t i=0UL; i<M; ++i ) {
806 for(
size_t j=0UL; j<jend; j+=2UL ) {
807 y[j ] += x[i] * A(i,j );
808 y[j+1UL] += x[i] * A(i,j+1UL);
// Tail column for odd N.
811 y[jend] += x[i] * A(i,jend);
// Intrinsic-vectorized add-assignment kernel, same 8/4/3/2/1 block structure as
// the assignment kernel. The accumulators are stored back directly, so they are
// presumably pre-loaded from y on the lines missing from this excerpt (original
// lines 846-853 etc.) — TODO confirm against the full source.
832 template<
typename VT1
835 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
836 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
838 typedef IntrinsicTrait<ElementType> IT;
840 const size_t M( A.rows() );
841 const size_t N( A.columns() );
845 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
854 for(
size_t i=0UL; i<M; ++i ) {
856 xmm1 = xmm1 + x1 * A.get(i,j );
857 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
858 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
859 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
860 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
861 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
862 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
863 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
865 store( &y[j ], xmm1 );
866 store( &y[j+IT::size ], xmm2 );
867 store( &y[j+IT::size*2UL], xmm3 );
868 store( &y[j+IT::size*3UL], xmm4 );
869 store( &y[j+IT::size*4UL], xmm5 );
870 store( &y[j+IT::size*5UL], xmm6 );
871 store( &y[j+IT::size*6UL], xmm7 );
872 store( &y[j+IT::size*7UL], xmm8 );
874 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
879 for(
size_t i=0UL; i<M; ++i ) {
881 xmm1 = xmm1 + x1 * A.get(i,j );
882 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
883 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
884 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
886 store( &y[j ], xmm1 );
887 store( &y[j+IT::size ], xmm2 );
888 store( &y[j+IT::size*2UL], xmm3 );
889 store( &y[j+IT::size*3UL], xmm4 );
891 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
895 for(
size_t i=0UL; i<M; ++i ) {
897 xmm1 = xmm1 + x1 * A.get(i,j );
898 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
899 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
901 store( &y[j ], xmm1 );
902 store( &y[j+IT::size ], xmm2 );
903 store( &y[j+IT::size*2UL], xmm3 );
905 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
908 for(
size_t i=0UL; i<M; ++i ) {
910 xmm1 = xmm1 + x1 * A.get(i,j );
911 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
913 store( &y[j ], xmm1 );
914 store( &y[j+IT::size], xmm2 );
918 for(
size_t i=0UL; i<M; ++i ) {
919 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
921 store( &y[j], xmm1 );
// BLAS add-assignment kernels: same cblas_?gemv dispatch as the assignment
// kernels, but with beta = 1 so the product is accumulated onto y
// (y += A^T * x).
// Fallback to the default kernel when no BLAS kernel applies.
941 template<
typename VT1
944 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
945 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
947 selectDefaultAddAssignKernel( y, x, A );
// Single-precision kernel: alpha = 1, beta = 1.
967 template<
typename VT1
970 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
971 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
973 using boost::numeric_cast;
979 const int M ( numeric_cast<int>( A.rows() ) );
980 const int N ( numeric_cast<int>( A.columns() ) );
981 const int lda( numeric_cast<int>( A.spacing() ) );
983 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, 1.0F,
984 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision kernel: alpha = 1, beta = 1.
1005 template<
typename VT1
1008 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1009 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1011 using boost::numeric_cast;
1017 const int M ( numeric_cast<int>( A.rows() ) );
1018 const int N ( numeric_cast<int>( A.columns() ) );
1019 const int lda( numeric_cast<int>( A.spacing() ) );
1021 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, 1.0,
1022 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex kernel: alpha = (1,0), beta = (1,0).
1043 template<
typename VT1
1046 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1047 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1049 using boost::numeric_cast;
1058 const int M ( numeric_cast<int>( A.rows() ) );
1059 const int N ( numeric_cast<int>( A.columns() ) );
1060 const int lda( numeric_cast<int>( A.spacing() ) );
1061 const complex<float> alpha( 1.0F, 0.0F );
1062 const complex<float> beta ( 1.0F, 0.0F );
1064 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1065 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex kernel: alpha = (1,0), beta = (1,0).
1086 template<
typename VT1
1089 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1090 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1092 using boost::numeric_cast;
1101 const int M ( numeric_cast<int>( A.rows() ) );
1102 const int N ( numeric_cast<int>( A.columns() ) );
1103 const int lda( numeric_cast<int>( A.spacing() ) );
1104 const complex<double> alpha( 1.0, 0.0 );
1105 const complex<double> beta ( 1.0, 0.0 );
1107 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1108 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Dense-vector subtraction assignment (y -= x^T * A): early exit on an empty
// matrix, then dispatch to the default or the BLAS sub-assignment kernel.
1131 template<
typename VT1 >
1138 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1150 if( ( IsComputation<MT>::value && !evaluate ) ||
1152 TDVecDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
1154 TDVecDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
// Default (non-vectorized) sub-assignment kernel: same structure as the add
// kernel but subtracting each product term; jend = N & ~1, tail column below.
1173 template<
typename VT1
1176 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1177 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1179 const size_t M( A.rows() );
1180 const size_t N( A.columns() );
1183 const size_t jend( N &
size_t(-2) );
1185 for(
size_t i=0UL; i<M; ++i ) {
1186 for(
size_t j=0UL; j<jend; j+=2UL ) {
1187 y[j ] -= x[i] * A(i,j );
1188 y[j+1UL] -= x[i] * A(i,j+1UL);
// Tail column for odd N.
1191 y[jend] -= x[i] * A(i,jend);
// Intrinsic-vectorized sub-assignment kernel: accumulators subtract each product
// term and are stored back directly — presumably pre-loaded from y on the lines
// missing from this excerpt (original lines 1226-1233 etc.) — TODO confirm.
1212 template<
typename VT1
1215 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1216 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1218 typedef IntrinsicTrait<ElementType> IT;
1220 const size_t M( A.rows() );
1221 const size_t N( A.columns() );
1225 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1234 for(
size_t i=0UL; i<M; ++i ) {
1236 xmm1 = xmm1 - x1 * A.get(i,j );
1237 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1238 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1239 xmm4 = xmm4 - x1 * A.get(i,j+IT::size*3UL);
1240 xmm5 = xmm5 - x1 * A.get(i,j+IT::size*4UL);
1241 xmm6 = xmm6 - x1 * A.get(i,j+IT::size*5UL);
1242 xmm7 = xmm7 - x1 * A.get(i,j+IT::size*6UL);
1243 xmm8 = xmm8 - x1 * A.get(i,j+IT::size*7UL);
1245 store( &y[j ], xmm1 );
1246 store( &y[j+IT::size ], xmm2 );
1247 store( &y[j+IT::size*2UL], xmm3 );
1248 store( &y[j+IT::size*3UL], xmm4 );
1249 store( &y[j+IT::size*4UL], xmm5 );
1250 store( &y[j+IT::size*5UL], xmm6 );
1251 store( &y[j+IT::size*6UL], xmm7 );
1252 store( &y[j+IT::size*7UL], xmm8 );
1254 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1259 for(
size_t i=0UL; i<M; ++i ) {
1261 xmm1 = xmm1 - x1 * A.get(i,j );
1262 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1263 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1264 xmm4 = xmm4 - x1 * A.get(i,j+IT::size*3UL);
1266 store( &y[j ], xmm1 );
1267 store( &y[j+IT::size ], xmm2 );
1268 store( &y[j+IT::size*2UL], xmm3 );
1269 store( &y[j+IT::size*3UL], xmm4 );
1271 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
1275 for(
size_t i=0UL; i<M; ++i ) {
1277 xmm1 = xmm1 - x1 * A.get(i,j );
1278 xmm2 = xmm2 - x1 * A.get(i,j+IT::size );
1279 xmm3 = xmm3 - x1 * A.get(i,j+IT::size*2UL);
1281 store( &y[j ], xmm1 );
1282 store( &y[j+IT::size ], xmm2 );
1283 store( &y[j+IT::size*2UL], xmm3 );
1285 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1288 for(
size_t i=0UL; i<M; ++i ) {
1290 xmm1 = xmm1 - x1 * A.get(i,j );
1291 xmm2 = xmm2 - x1 * A.get(i,j+IT::size);
1293 store( &y[j ], xmm1 );
1294 store( &y[j+IT::size], xmm2 );
1298 for(
size_t i=0UL; i<M; ++i ) {
1299 xmm1 = xmm1 -
set( x[i] ) * A.get(i,j);
1301 store( &y[j], xmm1 );
// BLAS sub-assignment kernels: cblas_?gemv with alpha = -1 and beta = 1, i.e.
// y -= A^T * x.
// Fallback to the default kernel when no BLAS kernel applies.
1321 template<
typename VT1
1324 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1325 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1327 selectDefaultSubAssignKernel( y, x, A );
// Single-precision kernel: alpha = -1, beta = 1.
1347 template<
typename VT1
1350 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1351 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1353 using boost::numeric_cast;
1359 const int M ( numeric_cast<int>( A.rows() ) );
1360 const int N ( numeric_cast<int>( A.columns() ) );
1361 const int lda( numeric_cast<int>( A.spacing() ) );
1363 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -1.0F,
1364 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision kernel: alpha = -1, beta = 1.
1385 template<
typename VT1
1388 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1389 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1391 using boost::numeric_cast;
1397 const int M ( numeric_cast<int>( A.rows() ) );
1398 const int N ( numeric_cast<int>( A.columns() ) );
1399 const int lda( numeric_cast<int>( A.spacing() ) );
1401 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -1.0,
1402 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex kernel: alpha = (-1,0), beta = (1,0).
1423 template<
typename VT1
1426 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1427 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1429 using boost::numeric_cast;
1438 const int M ( numeric_cast<int>( A.rows() ) );
1439 const int N ( numeric_cast<int>( A.columns() ) );
1440 const int lda( numeric_cast<int>( A.spacing() ) );
1441 const complex<float> alpha( -1.0F, 0.0F );
1442 const complex<float> beta ( 1.0F, 0.0F );
1444 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1445 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex kernel: alpha = (-1,0), beta = (1,0).
1466 template<
typename VT1
1469 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1470 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1472 using boost::numeric_cast;
1481 const int M ( numeric_cast<int>( A.rows() ) );
1482 const int N ( numeric_cast<int>( A.columns() ) );
1483 const int lda( numeric_cast<int>( A.spacing() ) );
1484 const complex<double> alpha( -1.0, 0.0 );
1485 const complex<double> beta ( 1.0, 0.0 );
1487 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
1488 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment (original line 1511): start of another assignment overload — body not
// visible in this excerpt.
1511 template<
typename VT1 >
// Specialization of DVecScalarMultExpr for a scaled transpose-vector/matrix
// product: (x^T * A) * s. Restructures the expression so the scalar can be
// folded into the multiplication kernels.
1560 template<
typename VT
1564 :
public DenseVector< DVecScalarMultExpr< TDVecDMatMultExpr<VT,MT>, ST, true >, true >
1565 ,
private VecScalarMultExpr
1566 ,
private Computation
// Operand type aliases (VMM = the wrapped vector/matrix product expression).
1570 typedef TDVecDMatMultExpr<VT,MT> VMM;
1571 typedef typename VMM::ResultType RES;
1572 typedef typename VT::ResultType
VRT;
1573 typedef typename MT::ResultType
MRT;
1574 typedef typename VRT::ElementType
VET;
1575 typedef typename MRT::ElementType
MET;
1576 typedef typename VT::CompositeType
VCT;
1577 typedef typename MT::CompositeType
MCT;
// Compile-time flag: pre-evaluate the matrix operand when it is a non-vectorizable
// computation whose element type matches the vector's and is BLAS compatible.
1582 enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
1583 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// Kernel-selection traits — same scheme as in TDVecDMatMultExpr, but with a fourth
// parameter T4 (the scalar type); the real-valued BLAS kernels additionally
// require the scalar to be non-complex.
1591 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1592 struct UseSinglePrecisionKernel {
1593 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1594 IsFloat<typename T1::ElementType>::value &&
1595 IsFloat<typename T2::ElementType>::value &&
1596 IsFloat<typename T3::ElementType>::value &&
1597 !IsComplex<T4>::value };
1606 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1607 struct UseDoublePrecisionKernel {
1608 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1609 IsDouble<typename T1::ElementType>::value &&
1610 IsDouble<typename T2::ElementType>::value &&
1611 IsDouble<typename T3::ElementType>::value &&
1612 !IsComplex<T4>::value };
1621 template<
typename T1,
typename T2,
typename T3 >
1622 struct UseSinglePrecisionComplexKernel {
1623 typedef complex<float> Type;
1624 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1625 IsSame<typename T1::ElementType,Type>::value &&
1626 IsSame<typename T2::ElementType,Type>::value &&
1627 IsSame<typename T3::ElementType,Type>::value };
1636 template<
typename T1,
typename T2,
typename T3 >
1637 struct UseDoublePrecisionComplexKernel {
1638 typedef complex<double> Type;
1639 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1640 IsSame<typename T1::ElementType,Type>::value &&
1641 IsSame<typename T2::ElementType,Type>::value &&
1642 IsSame<typename T3::ElementType,Type>::value };
1650 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1651 struct UseDefaultKernel {
1652 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1653 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1654 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1655 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Vectorized default kernel additionally requires the scalar type T4 to equal the
// common element type.
1664 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1665 struct UseVectorizedDefaultKernel {
1666 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1667 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1668 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1669 IsSame<typename T1::ElementType,T4>::value &&
1670 IntrinsicTrait<typename T1::ElementType>::addition &&
1671 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Public type definitions for the expression.
1677 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1678 typedef typename MultTrait<RES,ST>::Type
ResultType;
1680 typedef typename ResultType::ElementType
ElementType;
1681 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1686 typedef const TDVecDMatMultExpr<VT,MT>
LeftOperand;
// LT/RT: evaluated operand types — computations are materialized, plain operands
// are passed through as composites.
1692 typedef typename SelectType< IsComputation<VT>::value,
const VRT,
VCT >::Type
LT;
1695 typedef typename SelectType< evaluate, const MRT, MCT >::Type
RT;
// The scaled expression itself is not vectorizable as an operand.
1700 enum { vectorizable = 0 };
// Constructor: stores the vector/matrix product expression and the scalar.
1709 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Subscript: evaluates one element of the product and scales it.
1723 return vector_[index] * scalar_;
// Size of the result vector (delegates to the wrapped product expression).
1732 inline size_t size()
const {
1733 return vector_.size();
// Aliasing queries forward to the wrapped expression.
1763 template<
typename T >
1764 inline bool canAlias(
const T* alias )
const {
1765 return vector_.canAlias( alias );
1775 template<
typename T >
1776 inline bool isAliased(
const T* alias )
const {
1777 return vector_.isAliased( alias );
// Dense-vector assignment of the scaled product: unwrap the operands, early exit
// on an empty matrix, then dispatch to the default or BLAS kernel with scalar_.
1799 template<
typename VT1 >
1800 friend inline void assign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
1806 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1807 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
1809 if( right.rows() == 0UL ) {
1813 else if( right.columns() == 0UL ) {
1825 if( ( IsComputation<MT>::value && !evaluate ) ||
1827 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
1829 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (non-vectorized) kernel for the scaled assignment y = (x^T * A) * s:
// same unrolled-by-two traversal as the unscaled kernel, followed by a trailing
// loop over y (original line 1872) whose body — presumably y[j] *= scalar — is
// missing from this excerpt; TODO confirm.
1847 template<
typename VT1
1851 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1852 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1854 const size_t M( A.rows() );
1855 const size_t N( A.columns() );
1858 const size_t jend( N &
size_t(-2) );
1860 for(
size_t j=0UL; j<N; ++j ) {
1861 y[j] = x[0UL] * A(0UL,j);
1863 for(
size_t i=1UL; i<M; ++i ) {
1864 for(
size_t j=0UL; j<jend; j+=2UL ) {
1865 y[j ] += x[i] * A(i,j );
1866 y[j+1UL] += x[i] * A(i,j+1UL);
1869 y[jend] += x[i] * A(i,jend);
// Final scaling pass over the result vector.
1872 for(
size_t j=0UL; j<N; ++j ) {
// Intrinsic-vectorized scaled-assignment kernel: same 8/4/3/2/1 block structure,
// but each accumulator is multiplied by `factor` before the store. `factor` is
// presumably set(scalar) — its definition line is missing from this excerpt.
1892 template<
typename VT1
1896 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1897 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1899 typedef IntrinsicTrait<ElementType> IT;
1901 const size_t M( A.rows() );
1902 const size_t N( A.columns() );
1908 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
1909 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1910 for(
size_t i=0UL; i<M; ++i ) {
1912 xmm1 = xmm1 + x1 * A.get(i,j );
1913 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1914 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1915 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
1916 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
1917 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
1918 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
1919 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
1921 store( &y[j ], xmm1*factor );
1922 store( &y[j+IT::size ], xmm2*factor );
1923 store( &y[j+IT::size*2UL], xmm3*factor );
1924 store( &y[j+IT::size*3UL], xmm4*factor );
1925 store( &y[j+IT::size*4UL], xmm5*factor );
1926 store( &y[j+IT::size*5UL], xmm6*factor );
1927 store( &y[j+IT::size*6UL], xmm7*factor );
1928 store( &y[j+IT::size*7UL], xmm8*factor );
1930 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
1932 for(
size_t i=0UL; i<M; ++i ) {
1934 xmm1 = xmm1 + x1 * A.get(i,j );
1935 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1936 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1937 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
1939 store( &y[j ], xmm1*factor );
1940 store( &y[j+IT::size ], xmm2*factor );
1941 store( &y[j+IT::size*2UL], xmm3*factor );
1942 store( &y[j+IT::size*3UL], xmm4*factor );
1944 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
1946 for(
size_t i=0UL; i<M; ++i ) {
1948 xmm1 = xmm1 + x1 * A.get(i,j );
1949 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
1950 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
1952 store( &y[j ], xmm1*factor );
1953 store( &y[j+IT::size ], xmm2*factor );
1954 store( &y[j+IT::size*2UL], xmm3*factor );
1956 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
1958 for(
size_t i=0UL; i<M; ++i ) {
1960 xmm1 = xmm1 + x1 * A.get(i,j );
1961 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
1963 store( &y[j ], xmm1*factor );
1964 store( &y[j+IT::size], xmm2*factor );
1968 for(
size_t i=0UL; i<M; ++i ) {
1969 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
1971 store( &y[j], xmm1*factor );
// BLAS kernels for the scaled assignment: cblas_?gemv with alpha = scalar and
// beta = 0, i.e. y = scalar * A^T * x.
// Fallback to the default kernel when no BLAS kernel applies.
1989 template<
typename VT1
1993 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1994 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1996 selectDefaultAssignKernel( y, x, A, scalar );
// Single-precision kernel: alpha = scalar, beta = 0.
2015 template<
typename VT1
2019 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2020 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2022 using boost::numeric_cast;
2028 const int M ( numeric_cast<int>( A.rows() ) );
2029 const int N ( numeric_cast<int>( A.columns() ) );
2030 const int lda( numeric_cast<int>( A.spacing() ) );
2032 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2033 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision kernel: alpha = scalar, beta = 0.
2053 template<
typename VT1
2057 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2058 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2060 using boost::numeric_cast;
2066 const int M ( numeric_cast<int>( A.rows() ) );
2067 const int N ( numeric_cast<int>( A.columns() ) );
2068 const int lda( numeric_cast<int>( A.spacing() ) );
2070 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2071 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single-precision complex kernel: alpha = complex(scalar), beta = (0,0).
2091 template<
typename VT1
2095 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2096 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2098 using boost::numeric_cast;
2107 const int M ( numeric_cast<int>( A.rows() ) );
2108 const int N ( numeric_cast<int>( A.columns() ) );
2109 const int lda( numeric_cast<int>( A.spacing() ) );
2110 const complex<float> alpha( scalar );
2111 const complex<float> beta ( 0.0F, 0.0F );
2113 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2114 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex kernel: alpha = complex(scalar), beta = (0,0).
2134 template<
typename VT1
2138 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2139 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2141 using boost::numeric_cast;
2150 const int M ( numeric_cast<int>( A.rows() ) );
2151 const int N ( numeric_cast<int>( A.columns() ) );
2152 const int lda( numeric_cast<int>( A.spacing() ) );
2153 const complex<double> alpha( scalar );
2154 const complex<double> beta ( 0.0, 0.0 );
2156 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2157 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Sparse-vector assignment of the scaled product (body missing from this excerpt).
2174 template<
typename VT1 >
2175 friend inline void assign( SparseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Dense-vector addition assignment: unwrap operands, early exit on an empty
// matrix, then dispatch to the default or BLAS add kernel with scalar_.
2202 template<
typename VT1 >
2203 friend inline void addAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2209 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2210 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
2212 if( right.rows() == 0UL || right.columns() == 0UL ) {
2224 if( ( IsComputation<MT>::value && !evaluate ) ||
2226 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2228 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (non-vectorized) scaled add-assignment kernel: delegates to the target
// vector's addAssign with the fully formed expression x * A * scalar.
2246 template<
typename VT1
2250 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2251 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2253 y.addAssign( x * A * scalar );
// Intrinsic-vectorized scaled add-assignment kernel: accumulates per column block,
// then loads the current y values, adds accumulator*factor and stores back.
// `factor` is presumably set(scalar) — its definition line is missing from this
// excerpt; TODO confirm.
2271 template<
typename VT1
2275 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2276 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2278 typedef IntrinsicTrait<ElementType> IT;
2280 const size_t M( A.rows() );
2281 const size_t N( A.columns() );
2287 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2288 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2289 for(
size_t i=0UL; i<M; ++i ) {
2291 xmm1 = xmm1 + x1 * A.get(i,j );
2292 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2293 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2294 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2295 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
2296 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
2297 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
2298 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
2300 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2301 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2302 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2303 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) + xmm4*factor );
2304 store( &y[j+IT::size*4UL],
load( &y[j+IT::size*4UL] ) + xmm5*factor );
2305 store( &y[j+IT::size*5UL],
load( &y[j+IT::size*5UL] ) + xmm6*factor );
2306 store( &y[j+IT::size*6UL],
load( &y[j+IT::size*6UL] ) + xmm7*factor );
2307 store( &y[j+IT::size*7UL],
load( &y[j+IT::size*7UL] ) + xmm8*factor );
2309 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2311 for(
size_t i=0UL; i<M; ++i ) {
2313 xmm1 = xmm1 + x1 * A.get(i,j );
2314 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2315 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2316 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2318 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2319 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2320 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2321 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) + xmm4*factor );
2323 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
2325 for(
size_t i=0UL; i<M; ++i ) {
2327 xmm1 = xmm1 + x1 * A.get(i,j );
2328 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2329 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2331 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2332 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) + xmm2*factor );
2333 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) + xmm3*factor );
2335 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2337 for(
size_t i=0UL; i<M; ++i ) {
2339 xmm1 = xmm1 + x1 * A.get(i,j );
2340 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
2342 store( &y[j ],
load( &y[j ] ) + xmm1*factor );
2343 store( &y[j+IT::size],
load( &y[j+IT::size] ) + xmm2*factor );
2347 for(
size_t i=0UL; i<M; ++i ) {
2348 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
2350 store( &y[j],
load( &y[j] ) + xmm1*factor );
// Fallback "BLAS" add-assign kernel, selected when BLAZE_BLAS_MODE is off or no
// precision-specific BLAS kernel matches (see UseDefaultKernel in the class head).
// Simply forwards to the default (non-BLAS) add-assign kernel.
// NOTE(review): this listing is a partial extraction — the remaining template
// parameters (VT2, MT1, ST2) and the function braces are not visible here.
2369 template<
typename VT1
2373 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2374 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Delegate: y += scalar * ( x^T * A ) via the default kernel.
2376 selectDefaultAddAssignKernel( y, x, A, scalar );
// BLAS-accelerated add-assign kernel for single-precision elements.
// Effect: y += scalar * ( x^T * A ), realized by cblas_sgemv with alpha = scalar
// and beta = 1.0F (beta = 1 makes the gemv accumulate into y).
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2395 template<
typename VT1
2399 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2400 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2402 using boost::numeric_cast;
// boost::numeric_cast checks the size_t -> int narrowing required by the
// 32-bit CBLAS interface (throws on overflow instead of truncating).
2408 const int M ( numeric_cast<int>( A.rows() ) );
2409 const int N ( numeric_cast<int>( A.columns() ) );
2410 const int lda( numeric_cast<int>( A.spacing() ) );
// CblasTrans: computes A^T * x, i.e. the row-vector/matrix product x^T * A.
2412 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2413 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS-accelerated add-assign kernel for double-precision elements.
// Effect: y += scalar * ( x^T * A ), realized by cblas_dgemv with alpha = scalar
// and beta = 1.0 (accumulating into y).
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2433 template<
typename VT1
2437 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2438 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2440 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2446 const int M ( numeric_cast<int>( A.rows() ) );
2447 const int N ( numeric_cast<int>( A.columns() ) );
2448 const int lda( numeric_cast<int>( A.spacing() ) );
// CblasTrans turns the left-vector product x^T * A into A^T * x.
2450 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, scalar,
2451 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS-accelerated add-assign kernel for complex<float> elements.
// Effect: y += scalar * ( x^T * A ) via cblas_cgemv. Complex CBLAS routines take
// alpha/beta by pointer, hence the local alpha/beta temporaries; beta = (1,0)
// makes the call accumulate into y.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2471 template<
typename VT1
2475 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2476 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2478 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2487 const int M ( numeric_cast<int>( A.rows() ) );
2488 const int N ( numeric_cast<int>( A.columns() ) );
2489 const int lda( numeric_cast<int>( A.spacing() ) );
2490 const complex<float> alpha( scalar );
2491 const complex<float> beta ( 1.0F, 0.0F );
2493 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2494 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS-accelerated add-assign kernel for complex<double> elements.
// Effect: y += scalar * ( x^T * A ) via cblas_zgemv; alpha/beta passed by
// pointer as the complex CBLAS interface requires, beta = (1,0) to accumulate.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2514 template<
typename VT1
2518 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2519 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2521 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2530 const int M ( numeric_cast<int>( A.rows() ) );
2531 const int N ( numeric_cast<int>( A.columns() ) );
2532 const int lda( numeric_cast<int>( A.spacing() ) );
2533 const complex<double> alpha( scalar );
2534 const complex<double> beta ( 1.0, 0.0 );
2536 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2537 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Friend subtraction-assignment dispatcher: lhs -= ( x^T * A ) * scalar.
// Extracts the vector/matrix operands of the wrapped multiplication expression,
// then routes to either the default kernel or the BLAS kernel.
// NOTE(review): heavily truncated extraction — the early-return body for the
// empty-matrix case, the full kernel-selection condition, and the evaluation of
// x/A from left/right are not visible here; the comments below only describe
// what the visible lines establish.
2558 template<
typename VT1 >
2559 friend inline void subAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
2565 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2566 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Nothing to subtract if the matrix operand is empty (presumably an early
// return — TODO confirm, the branch body is cut from this listing).
2568 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel selection: computation expressions that are not pre-evaluated go to
// the default kernel; otherwise the BLAS kernel is used.
2580 if( ( IsComputation<MT>::value && !evaluate ) ||
2582 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2584 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (non-vectorized) subtraction-assignment kernel, selected when the
// vectorized default kernel is not applicable for the given types.
// Effect: y -= ( x^T * A ) * scalar, delegated to the target's own subAssign.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2602 template<
typename VT1
2606 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2607 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2609 y.subAssign( x * A * scalar );
// Vectorized default subtraction-assignment kernel: y -= ( x^T * A ) * scalar.
// Strip-mines the column range into chunks of 8/4/3/2/1 intrinsic vectors: for
// each chunk, SIMD accumulators (xmm1..xmm8) gather the column partial sums over
// all rows i, then each result is scaled and subtracted from y in one
// load/store pair. The subtraction in the store lines is what distinguishes
// this kernel from its add-assign twin.
// NOTE(review): heavily truncated extraction — the declaration/initialization
// of j, x1 (the broadcast of x[i]), factor (presumably set(scalar) — TODO
// confirm), and the per-chunk accumulator declarations after the first chunk
// are cut from this listing.
2627 template<
typename VT1
2631 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2632 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2634 typedef IntrinsicTrait<ElementType> IT;
2636 const size_t M( A.rows() );
2637 const size_t N( A.columns() );
// Chunk of 8 intrinsic vectors (widest unrolling).
2643 for( ; (j+IT::size*7UL) < N; j+=IT::size*8UL ) {
2644 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2645 for(
size_t i=0UL; i<M; ++i ) {
2647 xmm1 = xmm1 + x1 * A.get(i,j );
2648 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2649 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2650 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2651 xmm5 = xmm5 + x1 * A.get(i,j+IT::size*4UL);
2652 xmm6 = xmm6 + x1 * A.get(i,j+IT::size*5UL);
2653 xmm7 = xmm7 + x1 * A.get(i,j+IT::size*6UL);
2654 xmm8 = xmm8 + x1 * A.get(i,j+IT::size*7UL);
// Scale each accumulator and subtract it from the corresponding y segment.
2656 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2657 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2658 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
2659 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) - xmm4*factor );
2660 store( &y[j+IT::size*4UL],
load( &y[j+IT::size*4UL] ) - xmm5*factor );
2661 store( &y[j+IT::size*5UL],
load( &y[j+IT::size*5UL] ) - xmm6*factor );
2662 store( &y[j+IT::size*6UL],
load( &y[j+IT::size*6UL] ) - xmm7*factor );
2663 store( &y[j+IT::size*7UL],
load( &y[j+IT::size*7UL] ) - xmm8*factor );
// Remainder chunk of 4 intrinsic vectors.
2665 for( ; (j+IT::size*3UL) < N; j+=IT::size*4UL ) {
2667 for(
size_t i=0UL; i<M; ++i ) {
2669 xmm1 = xmm1 + x1 * A.get(i,j );
2670 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2671 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2672 xmm4 = xmm4 + x1 * A.get(i,j+IT::size*3UL);
2674 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2675 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2676 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
2677 store( &y[j+IT::size*3UL],
load( &y[j+IT::size*3UL] ) - xmm4*factor );
// Remainder chunk of 3 intrinsic vectors.
2679 for( ; (j+IT::size*2UL) < N; j+=IT::size*3UL ) {
2681 for(
size_t i=0UL; i<M; ++i ) {
2683 xmm1 = xmm1 + x1 * A.get(i,j );
2684 xmm2 = xmm2 + x1 * A.get(i,j+IT::size );
2685 xmm3 = xmm3 + x1 * A.get(i,j+IT::size*2UL);
2687 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2688 store( &y[j+IT::size ],
load( &y[j+IT::size ] ) - xmm2*factor );
2689 store( &y[j+IT::size*2UL],
load( &y[j+IT::size*2UL] ) - xmm3*factor );
// Remainder chunk of 2 intrinsic vectors.
2691 for( ; (j+IT::size) < N; j+=IT::size*2UL ) {
2693 for(
size_t i=0UL; i<M; ++i ) {
2695 xmm1 = xmm1 + x1 * A.get(i,j );
2696 xmm2 = xmm2 + x1 * A.get(i,j+IT::size);
2698 store( &y[j ],
load( &y[j ] ) - xmm1*factor );
2699 store( &y[j+IT::size],
load( &y[j+IT::size] ) - xmm2*factor );
// Final single-vector remainder; here x[i] is broadcast inline via set().
2703 for(
size_t i=0UL; i<M; ++i ) {
2704 xmm1 = xmm1 +
set( x[i] ) * A.get(i,j);
2706 store( &y[j],
load( &y[j] ) - xmm1*factor );
// Fallback "BLAS" subtraction-assignment kernel, selected when BLAZE_BLAS_MODE
// is off or no precision-specific BLAS kernel matches. Forwards to the default
// (non-BLAS) subtraction kernel.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2725 template<
typename VT1
2729 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2730 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Delegate: y -= scalar * ( x^T * A ) via the default kernel.
2732 selectDefaultSubAssignKernel( y, x, A, scalar );
// BLAS-accelerated subtraction-assignment kernel for single-precision elements.
// Effect: y -= scalar * ( x^T * A ), realized by cblas_sgemv with the alpha
// argument NEGATED (-scalar) and beta = 1.0F, so the gemv accumulation performs
// the subtraction.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2751 template<
typename VT1
2755 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2756 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2758 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2764 const int M ( numeric_cast<int>( A.rows() ) );
2765 const int N ( numeric_cast<int>( A.columns() ) );
2766 const int lda( numeric_cast<int>( A.spacing() ) );
2768 cblas_sgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2769 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS-accelerated subtraction-assignment kernel for double-precision elements.
// Effect: y -= scalar * ( x^T * A ), realized by cblas_dgemv with negated alpha
// (-scalar) and beta = 1.0.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2789 template<
typename VT1
2793 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2794 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2796 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2802 const int M ( numeric_cast<int>( A.rows() ) );
2803 const int N ( numeric_cast<int>( A.columns() ) );
2804 const int lda( numeric_cast<int>( A.spacing() ) );
2806 cblas_dgemv( CblasRowMajor, CblasTrans, M, N, -scalar,
2807 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS-accelerated subtraction-assignment kernel for complex<float> elements.
// Effect: y -= scalar * ( x^T * A ) via cblas_cgemv; alpha is constructed from
// the NEGATED scalar, beta = (1,0), so the accumulation subtracts.
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2827 template<
typename VT1
2831 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2832 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2834 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2843 const int M ( numeric_cast<int>( A.rows() ) );
2844 const int N ( numeric_cast<int>( A.columns() ) );
2845 const int lda( numeric_cast<int>( A.spacing() ) );
2846 const complex<float> alpha( -scalar );
2847 const complex<float> beta ( 1.0F, 0.0F );
2849 cblas_cgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2850 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS-accelerated subtraction-assignment kernel for complex<double> elements.
// Effect: y -= scalar * ( x^T * A ) via cblas_zgemv with alpha = -scalar and
// beta = (1,0).
// NOTE(review): partial extraction — template parameters VT2/MT1/ST2 and the
// function braces are not visible in this listing.
2870 template<
typename VT1
2874 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2875 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2877 using boost::numeric_cast;
// Checked size_t -> int narrowing for the CBLAS interface.
2886 const int M ( numeric_cast<int>( A.rows() ) );
2887 const int N ( numeric_cast<int>( A.columns() ) );
2888 const int lda( numeric_cast<int>( A.spacing() ) );
2889 const complex<double> alpha( -scalar );
2890 const complex<double> beta ( 1.0, 0.0 );
2892 cblas_zgemv( CblasRowMajor, CblasTrans, M, N, &alpha,
2893 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Friend multiplication-assignment overload for this scaled expression type.
// NOTE(review): only the signature survives in this extraction — the entire
// function body was dropped, so nothing about its behavior can be stated here.
2914 template<
typename VT1 >
2915 friend inline void multAssign( DenseVector<VT1,true>& lhs,
const DVecScalarMultExpr& rhs )
// Global operator* for a transpose dense vector times a dense matrix, producing
// a TDVecDMatMultExpr<T1,T2>. Disabled via SFINAE when the right operand is a
// matrix/matrix multiplication expression (that case is handled by the
// restructuring overload that follows in the original file).
// Throws std::invalid_argument when the vector size does not match the matrix
// row count.
// NOTE(review): partial extraction — the second template parameter, the
// parameter list (vec, mat), and the return statement are not visible here.
2988 template<
typename T1
2990 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecDMatMultExpr<T1,T2> >::Type
// Dimension check: a 1 x M row vector can only multiply an M x N matrix.
2995 if( (~vec).
size() != (~mat).
rows() )
2996 throw std::invalid_argument(
"Vector and matrix sizes do not match" );
3024 template<
typename T1
3027 inline const typename EnableIf< IsMatMatMultExpr<T2>, MultExprTrait<T1,T2> >::Type::Type