#ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
#define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_

#include <boost/cast.hpp>
class TDVecTDMatMultExpr : public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
typedef typename VT::ResultType     VRT;  // Result type of the left-hand side dense vector expression
typedef typename MT::ResultType     MRT;  // Result type of the right-hand side dense matrix expression
typedef typename VRT::ElementType   VET;  // Element type of the left-hand side dense vector expression
typedef typename MRT::ElementType   MET;  // Element type of the right-hand side dense matrix expression
typedef typename VT::CompositeType  VCT;  // Composite type of the left-hand side dense vector expression
typedef typename MT::CompositeType  MCT;  // Composite type of the right-hand side dense matrix expression
template< typename T1, typename T2, typename T3 >
struct UseSinglePrecisionKernel {
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsFloat<typename T1::ElementType>::value &&
                  IsFloat<typename T2::ElementType>::value &&
                  IsFloat<typename T3::ElementType>::value };
};
template< typename T1, typename T2, typename T3 >
struct UseDoublePrecisionKernel {
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsDouble<typename T1::ElementType>::value &&
                  IsDouble<typename T2::ElementType>::value &&
                  IsDouble<typename T3::ElementType>::value };
};
template< typename T1, typename T2, typename T3 >
struct UseSinglePrecisionComplexKernel {
   typedef complex<float>  Type;
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsSame<typename T1::ElementType,Type>::value &&
                  IsSame<typename T2::ElementType,Type>::value &&
                  IsSame<typename T3::ElementType,Type>::value };
};
template< typename T1, typename T2, typename T3 >
struct UseDoublePrecisionComplexKernel {
   typedef complex<double>  Type;
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsSame<typename T1::ElementType,Type>::value &&
                  IsSame<typename T2::ElementType,Type>::value &&
                  IsSame<typename T3::ElementType,Type>::value };
};
template< typename T1, typename T2, typename T3 >
struct UseDefaultKernel {
   enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
                                        !UseDoublePrecisionKernel<T1,T2,T3>::value &&
                                        !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
                                        !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
};
template< typename T1, typename T2, typename T3 >
struct UseVectorizedDefaultKernel {
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
                  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
                  IntrinsicTrait<typename T1::ElementType>::addition &&
                  IntrinsicTrait<typename T1::ElementType>::multiplication };
};
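// The helper traits above gate the kernel selection: the BLAS kernels are enabled via the
// precision traits, whereas UseVectorizedDefaultKernel enables the intrinsic-based default
// kernel. A minimal standalone sketch of this SFINAE dispatch pattern (illustrative only; the
// names in this sketch are not part of this header):
//
//    template< typename T >
//    struct UseFastKernel { enum { value = IsDouble<typename T::ElementType>::value }; };
//
//    template< typename T >
//    static typename EnableIf< UseFastKernel<T> >::Type  select( T& t ) { /* fast path    */ }
//
//    template< typename T >
//    static typename DisableIf< UseFastKernel<T> >::Type select( T& t ) { /* default path */ }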
enum { vectorizable = 0 };

if( mat_.rows() != 0UL ) {
for( size_t j=1UL; j<end_; j+=2UL ) {
if( end_ < mat_.rows() ) {

return mat_.columns();
template< typename T >
return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );

template< typename T >
return ( vec_.isAliased( alias ) || mat_.isAliased( alias ) );
template< typename VT1 >

if( rhs.mat_.rows() == 0UL ) {
else if( rhs.mat_.columns() == 0UL ) {

TDVecTDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
TDVecTDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
template< typename VT1, typename VT2, typename MT1 >
selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
   selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         // x1 denotes the intrinsic-sized chunk of the left-hand side vector x loaded at position i
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
         xmm5 = xmm5 + x1 * A.get(i,j+4UL);
         xmm6 = xmm6 + x1 * A.get(i,j+5UL);
         xmm7 = xmm7 + x1 * A.get(i,j+6UL);
         xmm8 = xmm8 + x1 * A.get(i,j+7UL);
      }
      y[j    ] = sum( xmm1 );
      y[j+1UL] = sum( xmm2 );
      y[j+2UL] = sum( xmm3 );
      y[j+3UL] = sum( xmm4 );
      y[j+4UL] = sum( xmm5 );
      y[j+5UL] = sum( xmm6 );
      y[j+6UL] = sum( xmm7 );
      y[j+7UL] = sum( xmm8 );
   }
   for( ; (j+4UL) <= N; j+=4UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
      }
      y[j    ] = sum( xmm1 );
      y[j+1UL] = sum( xmm2 );
      y[j+2UL] = sum( xmm3 );
      y[j+3UL] = sum( xmm4 );
   }
   for( ; (j+3UL) <= N; j+=3UL ) {
      IntrinsicType xmm1, xmm2, xmm3;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
      }
      y[j    ] = sum( xmm1 );
      y[j+1UL] = sum( xmm2 );
      y[j+2UL] = sum( xmm3 );
   }
   for( ; (j+2UL) <= N; j+=2UL ) {
      IntrinsicType xmm1, xmm2;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
      }
      y[j    ] = sum( xmm1 );
      y[j+1UL] = sum( xmm2 );
   }
   if( j < N ) {
      IntrinsicType xmm1;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + A.get(i,j) * x.get(i);
      }
      y[j] = sum( xmm1 );
   }
}
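// Scalar equivalent of the vectorized kernel above (a sketch, not part of the header): each
// column j of A is reduced against x, i.e.
//
//    for( size_t j=0UL; j<N; ++j ) {
//       ElementType tmp = ElementType();
//       for( size_t i=0UL; i<M; ++i )
//          tmp += x[i] * A(i,j);
//       y[j] = tmp;
//    }
//
// The unrolling over up to eight columns merely amortizes the loads of x across several columns.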
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   selectDefaultAssignKernel( y, x, A );
}
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
                A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
}
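// cblas_sgemv computes y := alpha * op(A) * x + beta * y. With CblasColMajor and CblasTrans,
// op(A) = A^T, so alpha = 1.0F and beta = 0.0F overwrite y with the row vector-matrix product
// x * A handled by this expression class.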
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
                A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<float> alpha( 1.0F, 0.0F );
   const complex<float> beta ( 0.0F, 0.0F );

   cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<double> alpha( 1.0, 0.0 );
   const complex<double> beta ( 0.0, 0.0 );

   cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}
template< typename VT1 >
template< typename VT1 >

if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

if( ( IsComputation<MT>::value && !evaluate ) ||
   TDVecTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
else
   TDVecTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
template< typename VT1, typename VT2, typename MT1 >
static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
   selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   y.addAssign( x * A );
}
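// The non-vectorized default kernel simply forwards to the generic dense vector addAssign of the
// expression x * A, i.e. it relies on the element-wise evaluation path of the expression template.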
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
   selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
         xmm5 = xmm5 + x1 * A.get(i,j+4UL);
         xmm6 = xmm6 + x1 * A.get(i,j+5UL);
         xmm7 = xmm7 + x1 * A.get(i,j+6UL);
         xmm8 = xmm8 + x1 * A.get(i,j+7UL);
      }
      y[j    ] += sum( xmm1 );
      y[j+1UL] += sum( xmm2 );
      y[j+2UL] += sum( xmm3 );
      y[j+3UL] += sum( xmm4 );
      y[j+4UL] += sum( xmm5 );
      y[j+5UL] += sum( xmm6 );
      y[j+6UL] += sum( xmm7 );
      y[j+7UL] += sum( xmm8 );
   }
   for( ; (j+4UL) <= N; j+=4UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
      }
      y[j    ] += sum( xmm1 );
      y[j+1UL] += sum( xmm2 );
      y[j+2UL] += sum( xmm3 );
      y[j+3UL] += sum( xmm4 );
   }
   for( ; (j+3UL) <= N; j+=3UL ) {
      IntrinsicType xmm1, xmm2, xmm3;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
      }
      y[j    ] += sum( xmm1 );
      y[j+1UL] += sum( xmm2 );
      y[j+2UL] += sum( xmm3 );
   }
   for( ; (j+2UL) <= N; j+=2UL ) {
      IntrinsicType xmm1, xmm2;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
      }
      y[j    ] += sum( xmm1 );
      y[j+1UL] += sum( xmm2 );
   }
   if( j < N ) {
      IntrinsicType xmm1;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + A.get(i,j) * x.get(i);
      }
      y[j] += sum( xmm1 );
   }
}
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   selectDefaultAddAssignKernel( y, x, A );
}
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
                A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
}
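// In contrast to the plain assignment kernel, beta = 1.0F makes the BLAS call accumulate into y,
// i.e. y := 1.0F * A^T * x + y, which realizes the addition assignment in a single gemv call.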
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
                A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<float> alpha( 1.0F, 0.0F );
   const complex<float> beta ( 1.0F, 0.0F );

   cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<double> alpha( 1.0, 0.0 );
   const complex<double> beta ( 1.0, 0.0 );

   cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}
template< typename VT1 >

if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {

if( ( IsComputation<MT>::value && !evaluate ) ||
   TDVecTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
else
   TDVecTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
template< typename VT1, typename VT2, typename MT1 >
static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
   selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   y.subAssign( x * A );
}
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
   selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
         xmm5 = xmm5 + x1 * A.get(i,j+4UL);
         xmm6 = xmm6 + x1 * A.get(i,j+5UL);
         xmm7 = xmm7 + x1 * A.get(i,j+6UL);
         xmm8 = xmm8 + x1 * A.get(i,j+7UL);
      }
      y[j    ] -= sum( xmm1 );
      y[j+1UL] -= sum( xmm2 );
      y[j+2UL] -= sum( xmm3 );
      y[j+3UL] -= sum( xmm4 );
      y[j+4UL] -= sum( xmm5 );
      y[j+5UL] -= sum( xmm6 );
      y[j+6UL] -= sum( xmm7 );
      y[j+7UL] -= sum( xmm8 );
   }
   for( ; (j+4UL) <= N; j+=4UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
      }
      y[j    ] -= sum( xmm1 );
      y[j+1UL] -= sum( xmm2 );
      y[j+2UL] -= sum( xmm3 );
      y[j+3UL] -= sum( xmm4 );
   }
   for( ; (j+3UL) <= N; j+=3UL ) {
      IntrinsicType xmm1, xmm2, xmm3;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
      }
      y[j    ] -= sum( xmm1 );
      y[j+1UL] -= sum( xmm2 );
      y[j+2UL] -= sum( xmm3 );
   }
   for( ; (j+2UL) <= N; j+=2UL ) {
      IntrinsicType xmm1, xmm2;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
      }
      y[j    ] -= sum( xmm1 );
      y[j+1UL] -= sum( xmm2 );
   }
   if( j < N ) {
      IntrinsicType xmm1;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + A.get(i,j) * x.get(i);
      }
      y[j] -= sum( xmm1 );
   }
}
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   selectDefaultSubAssignKernel( y, x, A );
}
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_sgemv( CblasColMajor, CblasTrans, M, N, -1.0F,
                A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
}
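// For the subtraction assignment the sign is folded into alpha: with alpha = -1.0F and
// beta = 1.0F the call computes y := y - A^T * x.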
template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_dgemv( CblasColMajor, CblasTrans, M, N, -1.0,
                A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<float> alpha( -1.0F, 0.0F );
   const complex<float> beta (  1.0F, 0.0F );

   cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1 >
static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<double> alpha( -1.0, 0.0 );
   const complex<double> beta (  1.0, 0.0 );

   cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}
template< typename VT1 >
template< typename VT, typename MT, typename ST >
class DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >
   : public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
   , private Expression
   , private Computation
typedef TDVecTDMatMultExpr<VT,MT>   VMM;
typedef typename VMM::ResultType    RES;
typedef typename VT::ResultType     VRT;
typedef typename MT::ResultType     MRT;
typedef typename VRT::ElementType   VET;
typedef typename MRT::ElementType   MET;
typedef typename VT::CompositeType  VCT;
typedef typename MT::CompositeType  MCT;

enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
                  IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
template< typename T1, typename T2, typename T3, typename T4 >
struct UseSinglePrecisionKernel {
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsFloat<typename T1::ElementType>::value &&
                  IsFloat<typename T2::ElementType>::value &&
                  IsFloat<typename T3::ElementType>::value &&
                  !IsComplex<T4>::value };
};
template< typename T1, typename T2, typename T3, typename T4 >
struct UseDoublePrecisionKernel {
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsDouble<typename T1::ElementType>::value &&
                  IsDouble<typename T2::ElementType>::value &&
                  IsDouble<typename T3::ElementType>::value &&
                  !IsComplex<T4>::value };
};
template< typename T1, typename T2, typename T3 >
struct UseSinglePrecisionComplexKernel {
   typedef complex<float>  Type;
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsSame<typename T1::ElementType,Type>::value &&
                  IsSame<typename T2::ElementType,Type>::value &&
                  IsSame<typename T3::ElementType,Type>::value };
};
template< typename T1, typename T2, typename T3 >
struct UseDoublePrecisionComplexKernel {
   typedef complex<double>  Type;
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsSame<typename T1::ElementType,Type>::value &&
                  IsSame<typename T2::ElementType,Type>::value &&
                  IsSame<typename T3::ElementType,Type>::value };
};
template< typename T1, typename T2, typename T3, typename T4 >
struct UseDefaultKernel {
   enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
                                        !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
                                        !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
                                        !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
};
template< typename T1, typename T2, typename T3, typename T4 >
struct UseVectorizedDefaultKernel {
   enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                  IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
                  IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
                  IsSame<typename T1::ElementType,T4>::value &&
                  IntrinsicTrait<typename T1::ElementType>::addition &&
                  IntrinsicTrait<typename T1::ElementType>::multiplication };
};
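// Compared to the unscaled expression, the vectorized kernel additionally requires the scalar
// type T4 to match the element type of the operands, e.g. a double scalar together with
// double-precision vector and matrix elements.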
typedef DVecScalarMultExpr<VMM,ST,true>             This;
typedef typename MultTrait<RES,ST>::Type            ResultType;
typedef typename ResultType::ElementType            ElementType;
typedef typename IntrinsicTrait<ElementType>::Type  IntrinsicType;

typedef const TDVecTDMatMultExpr<VT,MT>  LeftOperand;

typedef typename SelectType< IsComputation<VT>::value, const VRT, VCT >::Type  LT;
typedef typename SelectType< evaluate, const MRT, MCT >::Type                  RT;
enum { vectorizable = 0 };

explicit inline DVecScalarMultExpr( const VMM& vector, ST scalar )

return vector_[index] * scalar_;

inline size_t size() const {
   return vector_.size();
}
template< typename T >
inline bool canAlias( const T* alias ) const {
   return vector_.canAlias( alias );
}

template< typename T >
inline bool isAliased( const T* alias ) const {
   return vector_.isAliased( alias );
}
template< typename VT1 >
friend inline void assign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )

typename VMM::LeftOperand  left ( rhs.vector_.leftOperand()  );
typename VMM::RightOperand right( rhs.vector_.rightOperand() );

if( right.rows() == 0UL ) {
else if( right.columns() == 0UL ) {

if( ( IsComputation<MT>::value && !evaluate ) ||
   DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
else
   DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.assign( x * A * scalar );
}
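// Fallback path: the scaled multiplication is evaluated through the generic expression
// x * A * scalar. The vectorized kernel below instead applies the scalar once per reduced
// column, after the intrinsic summation.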
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectDefaultAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
         xmm5 = xmm5 + x1 * A.get(i,j+4UL);
         xmm6 = xmm6 + x1 * A.get(i,j+5UL);
         xmm7 = xmm7 + x1 * A.get(i,j+6UL);
         xmm8 = xmm8 + x1 * A.get(i,j+7UL);
      }
      y[j    ] = sum( xmm1 ) * scalar;
      y[j+1UL] = sum( xmm2 ) * scalar;
      y[j+2UL] = sum( xmm3 ) * scalar;
      y[j+3UL] = sum( xmm4 ) * scalar;
      y[j+4UL] = sum( xmm5 ) * scalar;
      y[j+5UL] = sum( xmm6 ) * scalar;
      y[j+6UL] = sum( xmm7 ) * scalar;
      y[j+7UL] = sum( xmm8 ) * scalar;
   }
   for( ; (j+4UL) <= N; j+=4UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
      }
      y[j    ] = sum( xmm1 ) * scalar;
      y[j+1UL] = sum( xmm2 ) * scalar;
      y[j+2UL] = sum( xmm3 ) * scalar;
      y[j+3UL] = sum( xmm4 ) * scalar;
   }
   for( ; (j+3UL) <= N; j+=3UL ) {
      IntrinsicType xmm1, xmm2, xmm3;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
      }
      y[j    ] = sum( xmm1 ) * scalar;
      y[j+1UL] = sum( xmm2 ) * scalar;
      y[j+2UL] = sum( xmm3 ) * scalar;
   }
   for( ; (j+2UL) <= N; j+=2UL ) {
      IntrinsicType xmm1, xmm2;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
      }
      y[j    ] = sum( xmm1 ) * scalar;
      y[j+1UL] = sum( xmm2 ) * scalar;
   }
   if( j < N ) {
      IntrinsicType xmm1;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + A.get(i,j) * x.get(i);
      }
      y[j] = sum( xmm1 ) * scalar;
   }
}
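// Scalar equivalent of the scaled kernel above (a sketch, not part of the header):
//
//    for( size_t j=0UL; j<N; ++j ) {
//       ElementType tmp = ElementType();
//       for( size_t i=0UL; i<M; ++i )
//          tmp += x[i] * A(i,j);
//       y[j] = tmp * scalar;
//    }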
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   selectDefaultAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
                A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
}
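// The scalar factor is passed directly as the gemv alpha argument; with beta = 0.0F the call
// overwrites y with scalar * A^T * x.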
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
                A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<float> alpha( scalar );
   const complex<float> beta ( 0.0F, 0.0F );

   cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<double> alpha( scalar );
   const complex<double> beta ( 0.0, 0.0 );

   cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}
template< typename VT1 >
friend inline void assign( SparseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
template< typename VT1 >
friend inline void addAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )

typename VMM::LeftOperand  left ( rhs.vector_.leftOperand()  );
typename VMM::RightOperand right( rhs.vector_.rightOperand() );

if( right.rows() == 0UL || right.columns() == 0UL ) {

if( ( IsComputation<MT>::value && !evaluate ) ||
   DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
else
   DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.addAssign( x * A * scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectDefaultAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
         xmm5 = xmm5 + x1 * A.get(i,j+4UL);
         xmm6 = xmm6 + x1 * A.get(i,j+5UL);
         xmm7 = xmm7 + x1 * A.get(i,j+6UL);
         xmm8 = xmm8 + x1 * A.get(i,j+7UL);
      }
      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
      y[j+2UL] += sum( xmm3 ) * scalar;
      y[j+3UL] += sum( xmm4 ) * scalar;
      y[j+4UL] += sum( xmm5 ) * scalar;
      y[j+5UL] += sum( xmm6 ) * scalar;
      y[j+6UL] += sum( xmm7 ) * scalar;
      y[j+7UL] += sum( xmm8 ) * scalar;
   }
   for( ; (j+4UL) <= N; j+=4UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
      }
      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
      y[j+2UL] += sum( xmm3 ) * scalar;
      y[j+3UL] += sum( xmm4 ) * scalar;
   }
   for( ; (j+3UL) <= N; j+=3UL ) {
      IntrinsicType xmm1, xmm2, xmm3;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
      }
      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
      y[j+2UL] += sum( xmm3 ) * scalar;
   }
   for( ; (j+2UL) <= N; j+=2UL ) {
      IntrinsicType xmm1, xmm2;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
      }
      y[j    ] += sum( xmm1 ) * scalar;
      y[j+1UL] += sum( xmm2 ) * scalar;
   }
   if( j < N ) {
      IntrinsicType xmm1;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + A.get(i,j) * x.get(i);
      }
      y[j] += sum( xmm1 ) * scalar;
   }
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   selectDefaultAddAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
                A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
                A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<float> alpha( scalar );
   const complex<float> beta ( 1.0F, 0.0F );

   cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasAddAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<double> alpha( scalar );
   const complex<double> beta ( 1.0, 0.0 );

   cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}
template< typename VT1 >
friend inline void subAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )

typename VMM::LeftOperand  left ( rhs.vector_.leftOperand()  );
typename VMM::RightOperand right( rhs.vector_.rightOperand() );

if( right.rows() == 0UL || right.columns() == 0UL ) {

if( ( IsComputation<MT>::value && !evaluate ) ||
   DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
else
   DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   y.subAssign( x * A * scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectDefaultSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   typedef IntrinsicTrait<ElementType>  IT;

   const size_t M( A.rows()    );
   const size_t N( A.columns() );

   size_t j( 0UL );

   for( ; (j+8UL) <= N; j+=8UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
         xmm5 = xmm5 + x1 * A.get(i,j+4UL);
         xmm6 = xmm6 + x1 * A.get(i,j+5UL);
         xmm7 = xmm7 + x1 * A.get(i,j+6UL);
         xmm8 = xmm8 + x1 * A.get(i,j+7UL);
      }
      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
      y[j+2UL] -= sum( xmm3 ) * scalar;
      y[j+3UL] -= sum( xmm4 ) * scalar;
      y[j+4UL] -= sum( xmm5 ) * scalar;
      y[j+5UL] -= sum( xmm6 ) * scalar;
      y[j+6UL] -= sum( xmm7 ) * scalar;
      y[j+7UL] -= sum( xmm8 ) * scalar;
   }
   for( ; (j+4UL) <= N; j+=4UL ) {
      IntrinsicType xmm1, xmm2, xmm3, xmm4;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
         xmm4 = xmm4 + x1 * A.get(i,j+3UL);
      }
      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
      y[j+2UL] -= sum( xmm3 ) * scalar;
      y[j+3UL] -= sum( xmm4 ) * scalar;
   }
   for( ; (j+3UL) <= N; j+=3UL ) {
      IntrinsicType xmm1, xmm2, xmm3;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
         xmm3 = xmm3 + x1 * A.get(i,j+2UL);
      }
      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
      y[j+2UL] -= sum( xmm3 ) * scalar;
   }
   for( ; (j+2UL) <= N; j+=2UL ) {
      IntrinsicType xmm1, xmm2;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + x1 * A.get(i,j    );
         xmm2 = xmm2 + x1 * A.get(i,j+1UL);
      }
      y[j    ] -= sum( xmm1 ) * scalar;
      y[j+1UL] -= sum( xmm2 ) * scalar;
   }
   if( j < N ) {
      IntrinsicType xmm1;
      for( size_t i=0UL; i<M; i+=IT::size ) {
         xmm1 = xmm1 + A.get(i,j) * x.get(i);
      }
      y[j] -= sum( xmm1 ) * scalar;
   }
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   selectDefaultSubAssignKernel( y, x, A, scalar );
}
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_sgemv( CblasColMajor, CblasTrans, M, N, -scalar,
                A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
}
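// As in the unscaled subtraction kernel, the sign is folded into alpha: -scalar together with
// beta = 1.0F yields y := y - scalar * A^T * x.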
template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );

   cblas_dgemv( CblasColMajor, CblasTrans, M, N, -scalar,
                A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<float> alpha( -scalar );
   const complex<float> beta ( 1.0F, 0.0F );

   cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}

template< typename VT1, typename VT2, typename MT1, typename ST2 >
static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
   selectBlasSubAssignKernel( VT1& y, const VT2& x, const MT1& A, ST2 scalar )
{
   using boost::numeric_cast;

   const int M  ( numeric_cast<int>( A.rows() )    );
   const int N  ( numeric_cast<int>( A.columns() ) );
   const int lda( numeric_cast<int>( A.spacing() ) );
   const complex<double> alpha( -scalar );
   const complex<double> beta ( 1.0, 0.0 );

   cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
                A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
}
template< typename VT1 >
friend inline void multAssign( DenseVector<VT1,TF>& lhs, const DVecScalarMultExpr& rhs )
template< typename T1, typename T2 >
inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type

if( (~vec).size() != (~mat).rows() )
   throw std::invalid_argument( "Vector and matrix sizes do not match" );