22 #ifndef _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
23 #define _BLAZE_MATH_EXPRESSIONS_TDVECTDMATMULTEXPR_H_
31 #include <boost/cast.hpp>
87 class TDVecTDMatMultExpr :
public DenseVector< TDVecTDMatMultExpr<VT,MT>, true >
88 ,
private TVecMatMultExpr
// Convenience aliases for the two operand types VT and MT:
// VRT/MRT are their result types, VET/MET the element types of those result
// types, and VCT/MCT their composite types.
93 typedef typename VT::ResultType
VRT;
94 typedef typename MT::ResultType
MRT;
95 typedef typename VRT::ElementType
VET;
96 typedef typename MRT::ElementType
MET;
97 typedef typename VT::CompositeType
VCT;
98 typedef typename MT::CompositeType
MCT;
// Compile-time switch over three types T1, T2, T3. Kernels guarded by this
// switch in this file call cblas_sgemv.
// NOTE(review): only the first part of the condition is visible in this
// excerpt (all three types must be vectorizable); the remaining terms and the
// end of the struct are not shown here.
113 template<
typename T1,
typename T2,
typename T3 >
114 struct UseSinglePrecisionKernel {
115 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Compile-time switch for the double precision BLAS kernel (cblas_dgemv):
// value is non-zero only if T1, T2 and T3 are all vectorizable and each of
// their element types satisfies IsDouble.
129 template<
typename T1,
typename T2,
typename T3 >
130 struct UseDoublePrecisionKernel {
131 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
132 IsDouble<typename T1::ElementType>::value &&
133 IsDouble<typename T2::ElementType>::value &&
134 IsDouble<typename T3::ElementType>::value };
// Compile-time switch for the single precision complex BLAS kernel
// (cblas_cgemv): value is non-zero only if T1, T2 and T3 are all vectorizable
// and each of their element types is exactly complex<float>.
145 template<
typename T1,
typename T2,
typename T3 >
146 struct UseSinglePrecisionComplexKernel {
147 typedef complex<float> Type;
148 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
149 IsSame<typename T1::ElementType,Type>::value &&
150 IsSame<typename T2::ElementType,Type>::value &&
151 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the double precision complex BLAS kernel
// (cblas_zgemv): value is non-zero only if T1, T2 and T3 are all vectorizable
// and each of their element types is exactly complex<double>.
162 template<
typename T1,
typename T2,
typename T3 >
163 struct UseDoublePrecisionComplexKernel {
164 typedef complex<double> Type;
165 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
166 IsSame<typename T1::ElementType,Type>::value &&
167 IsSame<typename T2::ElementType,Type>::value &&
168 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS fallback: value is non-zero if
// BLAZE_BLAS_MODE is off, or if none of the four BLAS kernel switches
// (single/double precision, real/complex) applies to T1, T2, T3.
178 template<
typename T1,
typename T2,
typename T3 >
179 struct UseDefaultKernel {
180 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
181 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
182 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
183 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch for the hand-vectorized default kernel: value is
// non-zero only if T1, T2 and T3 are all vectorizable, share one element type,
// and IntrinsicTrait reports both addition and multiplication for that type.
194 template<
typename T1,
typename T2,
typename T3 >
195 struct UseVectorizedDefaultKernel {
196 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
197 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
198 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
199 IntrinsicTrait<typename T1::ElementType>::addition &&
200 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compilation flag: this expression type reports itself as not vectorizable.
230 enum { vectorizable = 0 };
// Fragment of an element-access routine (presumably operator[]; the signature
// and the statements inside the branches are not visible in this excerpt).
// Work is only done when the matrix operand has at least one row.
259 if(
mat_.rows() != 0UL ) {
// Rows are visited in steps of two starting at 1, up to (excluding) end_.
261 for(
size_t j=1UL; j<
end_; j+=2UL ) {
// Handles the case where end_ does not yet cover all rows of the matrix.
264 if( end_ < mat_.rows() ) {
// Returns the number of columns of the matrix operand (enclosing function
// header not visible here; presumably size() of the resulting vector).
282 return mat_.columns();
// Alias check against the given address (function header not visible in this
// excerpt; presumably canAlias()). Yields true if either the vector operand or
// the matrix operand reports being aliased with 'alias'.
312 template<
typename T >
314 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Alias check against the given address (function header not visible in this
// excerpt; presumably isAliased()). Yields true if either the vector operand
// or the matrix operand reports being aliased with 'alias'.
324 template<
typename T >
326 return (
vec_.isAliased( alias ) ||
mat_.isAliased( alias ) );
// Fragment of the assignment of this expression to a target vector
// (presumably the dense-vector assign(); header and most of the body are not
// visible in this excerpt).
350 template<
typename VT1 >
// Special cases: a matrix without rows, then a matrix without columns
// (the statements inside these branches are not shown).
357 if( rhs.mat_.rows() == 0UL ) {
361 else if( rhs.mat_.columns() == 0UL ) {
// Dispatch to either the default kernel or the BLAS kernel; the condition
// choosing between the two is not visible here.
375 TDVecTDMatMultExpr::selectDefaultAssignKernel( ~lhs, x, A );
377 TDVecTDMatMultExpr::selectBlasAssignKernel( ~lhs, x, A );
// First overload of the default assignment kernel taking the target y, the
// vector operand x and the matrix operand A.
// NOTE(review): return type, enabling condition and body are not visible in
// this excerpt; presumably the non-vectorized counterpart of the overload
// guarded by UseVectorizedDefaultKernel — confirm.
396 template<
typename VT1
400 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
// Vectorized default kernel for the assignment y = x * A, selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// Columns of A are processed in blocks of 8, 4, 3 and 2, followed by a
// single-column pass. For each block the products are accumulated in
// IntrinsicType values over the rows (step IT::size) and each accumulator is
// reduced with sum() into the matching element of y.
// NOTE(review): several original lines are missing from this excerpt (e.g. the
// declarations of j and x1, some accumulator declarations and the store to
// y[j]); x1 is presumably the intrinsic load of x at row i — confirm.
// NOTE(review): the accumulators are default-constructed and then accumulated
// into, which presumably relies on IntrinsicType zero-initializing — confirm.
421 template<
typename VT1
424 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
425 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
427 typedef IntrinsicTrait<ElementType> IT;
429 const size_t M( A.rows() );
430 const size_t N( A.columns() );
// Blocks of eight columns.
434 for( ; (j+8UL) <= N; j+=8UL ) {
435 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
436 for(
size_t i=0UL; i<M; i+=IT::size ) {
438 xmm1 = xmm1 + x1 * A.get(i,j );
439 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
440 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
441 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
442 xmm5 = xmm5 + x1 * A.get(i,j+4UL);
443 xmm6 = xmm6 + x1 * A.get(i,j+5UL);
444 xmm7 = xmm7 + x1 * A.get(i,j+6UL);
445 xmm8 = xmm8 + x1 * A.get(i,j+7UL);
448 y[j+1UL] =
sum( xmm2 );
449 y[j+2UL] =
sum( xmm3 );
450 y[j+3UL] =
sum( xmm4 );
451 y[j+4UL] =
sum( xmm5 );
452 y[j+5UL] =
sum( xmm6 );
453 y[j+6UL] =
sum( xmm7 );
454 y[j+7UL] =
sum( xmm8 );
// Blocks of four columns.
456 for( ; (j+4UL) <= N; j+=4UL ) {
458 for(
size_t i=0UL; i<M; i+=IT::size ) {
460 xmm1 = xmm1 + x1 * A.get(i,j );
461 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
462 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
463 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
466 y[j+1UL] =
sum( xmm2 );
467 y[j+2UL] =
sum( xmm3 );
468 y[j+3UL] =
sum( xmm4 );
// Blocks of three columns.
470 for( ; (j+3UL) <= N; j+=3UL ) {
472 for(
size_t i=0UL; i<M; i+=IT::size ) {
474 xmm1 = xmm1 + x1 * A.get(i,j );
475 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
476 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
479 y[j+1UL] =
sum( xmm2 );
480 y[j+2UL] =
sum( xmm3 );
// Blocks of two columns.
482 for( ; (j+2UL) <= N; j+=2UL ) {
484 for(
size_t i=0UL; i<M; i+=IT::size ) {
486 xmm1 = xmm1 + x1 * A.get(i,j );
487 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
490 y[j+1UL] =
sum( xmm2 );
// Single remaining column (the enclosing condition is not visible here).
494 for(
size_t i=0UL; i<M; i+=IT::size ) {
495 xmm1 = xmm1 + A.get(i,j) * x.get(i);
// BLAS assignment kernel fallback: enabled when UseDefaultKernel<VT1,VT2,MT1>
// holds, in which case the work is forwarded to selectDefaultAssignKernel().
517 template<
typename VT1
520 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
521 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
523 selectDefaultAssignKernel( y, x, A );
// Single precision BLAS kernel for the assignment y = x * A, enabled when
// UseSinglePrecisionKernel<VT1,VT2,MT1> holds.
// Row count, column count and A.spacing() (passed as leading dimension) are
// converted to int via boost::numeric_cast, which range-checks the conversion.
// cblas_sgemv is invoked for a column-major matrix in transposed mode with
// alpha = 1 and beta = 0, so the previous contents of y do not contribute.
543 template<
typename VT1
546 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
547 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
549 using boost::numeric_cast;
555 const int M ( numeric_cast<int>( A.rows() ) );
556 const int N ( numeric_cast<int>( A.columns() ) );
557 const int lda( numeric_cast<int>( A.spacing() ) );
559 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
560 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double precision BLAS kernel for the assignment y = x * A, enabled when
// UseDoublePrecisionKernel<VT1,VT2,MT1> holds.
// Row count, column count and A.spacing() (passed as leading dimension) are
// converted to int via boost::numeric_cast, which range-checks the conversion.
// cblas_dgemv is invoked for a column-major matrix in transposed mode with
// alpha = 1 and beta = 0, so the previous contents of y do not contribute.
581 template<
typename VT1
584 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
585 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
587 using boost::numeric_cast;
593 const int M ( numeric_cast<int>( A.rows() ) );
594 const int N ( numeric_cast<int>( A.columns() ) );
595 const int lda( numeric_cast<int>( A.spacing() ) );
597 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
598 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single precision complex BLAS kernel for the assignment y = x * A, enabled
// when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_cgemv takes its scaling factors by address, hence
// the local alpha = (1,0) and beta = (0,0); with beta zero the previous
// contents of y do not contribute. CblasTrans is the plain (non-conjugating)
// transpose mode.
619 template<
typename VT1
622 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
623 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
625 using boost::numeric_cast;
634 const int M ( numeric_cast<int>( A.rows() ) );
635 const int N ( numeric_cast<int>( A.columns() ) );
636 const int lda( numeric_cast<int>( A.spacing() ) );
637 const complex<float> alpha( 1.0F, 0.0F );
638 const complex<float> beta ( 0.0F, 0.0F );
640 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
641 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS kernel for the assignment y = x * A, enabled
// when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_zgemv takes its scaling factors by address, hence
// the local alpha = (1,0) and beta = (0,0); with beta zero the previous
// contents of y do not contribute. CblasTrans is the plain (non-conjugating)
// transpose mode.
662 template<
typename VT1
665 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
666 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
668 using boost::numeric_cast;
677 const int M ( numeric_cast<int>( A.rows() ) );
678 const int N ( numeric_cast<int>( A.columns() ) );
679 const int lda( numeric_cast<int>( A.spacing() ) );
680 const complex<double> alpha( 1.0, 0.0 );
681 const complex<double> beta ( 0.0, 0.0 );
683 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
684 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
703 template<
typename VT1 >
// Fragment of the addition assignment of this expression to a target vector
// (presumably addAssign(); header and most of the body are not visible here).
733 template<
typename VT1 >
// Degenerate case: the matrix operand has no rows or no columns
// (the statement inside this branch is not shown).
740 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Kernel selection; only the first term of the condition is visible. One of
// the two calls below is taken depending on it.
752 if( ( IsComputation<MT>::value && !evaluate ) ||
754 TDVecTDMatMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A );
756 TDVecTDMatMultExpr::selectBlasAddAssignKernel( ~lhs, x, A );
// Non-vectorized default kernel for y += x * A: selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> does NOT hold, and implemented by
// handing the product expression to the target's own addAssign().
775 template<
typename VT1
778 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
779 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
781 y.addAssign( x * A );
// Vectorized default kernel for the addition assignment y += x * A, selected
// when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// Columns of A are processed in blocks of 8, 4, 3 and 2, followed by a
// single-column pass. For each block the products are accumulated in
// IntrinsicType values over the rows (step IT::size) and each reduced sum()
// is added to the matching element of y.
// NOTE(review): several original lines are missing from this excerpt (e.g. the
// declarations of j and x1 and some accumulator declarations/stores); x1 is
// presumably the intrinsic load of x at row i — confirm.
// NOTE(review): the accumulators are default-constructed and then accumulated
// into, which presumably relies on IntrinsicType zero-initializing — confirm.
800 template<
typename VT1
803 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
804 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
806 typedef IntrinsicTrait<ElementType> IT;
808 const size_t M( A.rows() );
809 const size_t N( A.columns() );
// Blocks of eight columns.
813 for( ; (j+8UL) <= N; j+=8UL ) {
814 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
815 for(
size_t i=0UL; i<M; i+=IT::size ) {
817 xmm1 = xmm1 + x1 * A.get(i,j );
818 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
819 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
820 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
821 xmm5 = xmm5 + x1 * A.get(i,j+4UL);
822 xmm6 = xmm6 + x1 * A.get(i,j+5UL);
823 xmm7 = xmm7 + x1 * A.get(i,j+6UL);
824 xmm8 = xmm8 + x1 * A.get(i,j+7UL);
826 y[j ] +=
sum( xmm1 );
827 y[j+1UL] +=
sum( xmm2 );
828 y[j+2UL] +=
sum( xmm3 );
829 y[j+3UL] +=
sum( xmm4 );
830 y[j+4UL] +=
sum( xmm5 );
831 y[j+5UL] +=
sum( xmm6 );
832 y[j+6UL] +=
sum( xmm7 );
833 y[j+7UL] +=
sum( xmm8 );
// Blocks of four columns.
835 for( ; (j+4UL) <= N; j+=4UL ) {
837 for(
size_t i=0UL; i<M; i+=IT::size ) {
839 xmm1 = xmm1 + x1 * A.get(i,j );
840 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
841 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
842 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
844 y[j ] +=
sum( xmm1 );
845 y[j+1UL] +=
sum( xmm2 );
846 y[j+2UL] +=
sum( xmm3 );
847 y[j+3UL] +=
sum( xmm4 );
// Blocks of three columns.
849 for( ; (j+3UL) <= N; j+=3UL ) {
851 for(
size_t i=0UL; i<M; i+=IT::size ) {
853 xmm1 = xmm1 + x1 * A.get(i,j );
854 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
855 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
857 y[j ] +=
sum( xmm1 );
858 y[j+1UL] +=
sum( xmm2 );
859 y[j+2UL] +=
sum( xmm3 );
// Blocks of two columns.
861 for( ; (j+2UL) <= N; j+=2UL ) {
863 for(
size_t i=0UL; i<M; i+=IT::size ) {
865 xmm1 = xmm1 + x1 * A.get(i,j );
866 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
868 y[j ] +=
sum( xmm1 );
869 y[j+1UL] +=
sum( xmm2 );
// Single remaining column (the enclosing condition is not visible here).
873 for(
size_t i=0UL; i<M; i+=IT::size ) {
874 xmm1 = xmm1 + A.get(i,j) * x.get(i);
// BLAS addition assignment kernel fallback: enabled when
// UseDefaultKernel<VT1,VT2,MT1> holds, in which case the work is forwarded to
// selectDefaultAddAssignKernel().
896 template<
typename VT1
899 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
900 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
902 selectDefaultAddAssignKernel( y, x, A );
// Single precision BLAS kernel for the addition assignment y += x * A, enabled
// when UseSinglePrecisionKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_sgemv is invoked for a column-major matrix in
// transposed mode with alpha = 1 and beta = 1, so the product is added to the
// existing contents of y.
922 template<
typename VT1
925 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
926 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
928 using boost::numeric_cast;
934 const int M ( numeric_cast<int>( A.rows() ) );
935 const int N ( numeric_cast<int>( A.columns() ) );
936 const int lda( numeric_cast<int>( A.spacing() ) );
938 cblas_sgemv( CblasColMajor, CblasTrans, M, N, 1.0F,
939 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double precision BLAS kernel for the addition assignment y += x * A, enabled
// when UseDoublePrecisionKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_dgemv is invoked for a column-major matrix in
// transposed mode with alpha = 1 and beta = 1, so the product is added to the
// existing contents of y.
960 template<
typename VT1
963 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
964 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
966 using boost::numeric_cast;
972 const int M ( numeric_cast<int>( A.rows() ) );
973 const int N ( numeric_cast<int>( A.columns() ) );
974 const int lda( numeric_cast<int>( A.spacing() ) );
976 cblas_dgemv( CblasColMajor, CblasTrans, M, N, 1.0,
977 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single precision complex BLAS kernel for the addition assignment
// y += x * A, enabled when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_cgemv takes its scaling factors by address, hence
// the local alpha = (1,0) and beta = (1,0); with beta one the product is added
// to the existing contents of y.
998 template<
typename VT1
1001 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1002 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1004 using boost::numeric_cast;
1013 const int M ( numeric_cast<int>( A.rows() ) );
1014 const int N ( numeric_cast<int>( A.columns() ) );
1015 const int lda( numeric_cast<int>( A.spacing() ) );
1016 const complex<float> alpha( 1.0F, 0.0F );
1017 const complex<float> beta ( 1.0F, 0.0F );
1019 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1020 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS kernel for the addition assignment
// y += x * A, enabled when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_zgemv takes its scaling factors by address, hence
// the local alpha = (1,0) and beta = (1,0); with beta one the product is added
// to the existing contents of y.
1041 template<
typename VT1
1044 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1045 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1047 using boost::numeric_cast;
1056 const int M ( numeric_cast<int>( A.rows() ) );
1057 const int N ( numeric_cast<int>( A.columns() ) );
1058 const int lda( numeric_cast<int>( A.spacing() ) );
1059 const complex<double> alpha( 1.0, 0.0 );
1060 const complex<double> beta ( 1.0, 0.0 );
1062 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1063 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment of the subtraction assignment of this expression to a target
// vector (presumably subAssign(); header and most of the body are not visible
// in this excerpt).
1086 template<
typename VT1 >
// Degenerate case: the matrix operand has no rows or no columns
// (the statement inside this branch is not shown).
1093 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
// Kernel selection; only the first term of the condition is visible. One of
// the two calls below is taken depending on it.
1105 if( ( IsComputation<MT>::value && !evaluate ) ||
1107 TDVecTDMatMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A );
1109 TDVecTDMatMultExpr::selectBlasSubAssignKernel( ~lhs, x, A );
// Non-vectorized default kernel for y -= x * A: selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1> does NOT hold, and implemented by
// handing the product expression to the target's own subAssign().
1128 template<
typename VT1
1131 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1132 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1134 y.subAssign( x * A );
// Vectorized default kernel for the subtraction assignment y -= x * A,
// selected when UseVectorizedDefaultKernel<VT1,VT2,MT1> holds.
// Columns of A are processed in blocks of 8, 4, 3 and 2, followed by a
// single-column pass. For each block the products are accumulated in
// IntrinsicType values over the rows (step IT::size) and each reduced sum()
// is subtracted from the matching element of y.
// NOTE(review): several original lines are missing from this excerpt (e.g. the
// declarations of j and x1 and some accumulator declarations); x1 is
// presumably the intrinsic load of x at row i — confirm.
// NOTE(review): the accumulators are default-constructed and then accumulated
// into, which presumably relies on IntrinsicType zero-initializing — confirm.
1153 template<
typename VT1
1156 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1> >::Type
1157 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1159 typedef IntrinsicTrait<ElementType> IT;
1161 const size_t M( A.rows() );
1162 const size_t N( A.columns() );
// Blocks of eight columns.
1166 for( ; (j+8UL) <= N; j+=8UL ) {
1167 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1168 for(
size_t i=0UL; i<M; i+=IT::size ) {
1170 xmm1 = xmm1 + x1 * A.get(i,j );
1171 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1172 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
1173 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
1174 xmm5 = xmm5 + x1 * A.get(i,j+4UL);
1175 xmm6 = xmm6 + x1 * A.get(i,j+5UL);
1176 xmm7 = xmm7 + x1 * A.get(i,j+6UL);
1177 xmm8 = xmm8 + x1 * A.get(i,j+7UL);
1179 y[j ] -=
sum( xmm1 );
1180 y[j+1UL] -=
sum( xmm2 );
1181 y[j+2UL] -=
sum( xmm3 );
1182 y[j+3UL] -=
sum( xmm4 );
1183 y[j+4UL] -=
sum( xmm5 );
1184 y[j+5UL] -=
sum( xmm6 );
1185 y[j+6UL] -=
sum( xmm7 );
1186 y[j+7UL] -=
sum( xmm8 );
// Blocks of four columns.
1188 for( ; (j+4UL) <= N; j+=4UL ) {
1190 for(
size_t i=0UL; i<M; i+=IT::size ) {
1192 xmm1 = xmm1 + x1 * A.get(i,j );
1193 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1194 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
1195 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
1197 y[j ] -=
sum( xmm1 );
1198 y[j+1UL] -=
sum( xmm2 );
1199 y[j+2UL] -=
sum( xmm3 );
1200 y[j+3UL] -=
sum( xmm4 );
// Blocks of three columns.
1202 for( ; (j+3UL) <= N; j+=3UL ) {
1204 for(
size_t i=0UL; i<M; i+=IT::size ) {
1206 xmm1 = xmm1 + x1 * A.get(i,j );
1207 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1208 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
1210 y[j ] -=
sum( xmm1 );
1211 y[j+1UL] -=
sum( xmm2 );
1212 y[j+2UL] -=
sum( xmm3 );
// Blocks of two columns.
1214 for( ; (j+2UL) <= N; j+=2UL ) {
1216 for(
size_t i=0UL; i<M; i+=IT::size ) {
1218 xmm1 = xmm1 + x1 * A.get(i,j );
1219 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1221 y[j ] -=
sum( xmm1 );
1222 y[j+1UL] -=
sum( xmm2 );
// Single remaining column (the enclosing condition is not visible here).
1226 for(
size_t i=0UL; i<M; i+=IT::size ) {
1227 xmm1 = xmm1 + A.get(i,j) * x.get(i);
1229 y[j] -=
sum( xmm1 );
// BLAS subtraction assignment kernel fallback: enabled when
// UseDefaultKernel<VT1,VT2,MT1> holds, in which case the work is forwarded to
// selectDefaultSubAssignKernel().
1249 template<
typename VT1
1252 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1> >::Type
1253 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1255 selectDefaultSubAssignKernel( y, x, A );
// Single precision BLAS kernel for the subtraction assignment y -= x * A,
// enabled when UseSinglePrecisionKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_sgemv is invoked for a column-major matrix in
// transposed mode with alpha = -1 and beta = 1, so the product is subtracted
// from the existing contents of y.
1275 template<
typename VT1
1278 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1> >::Type
1279 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1281 using boost::numeric_cast;
1287 const int M ( numeric_cast<int>( A.rows() ) );
1288 const int N ( numeric_cast<int>( A.columns() ) );
1289 const int lda( numeric_cast<int>( A.spacing() ) );
1291 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -1.0F,
1292 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double precision BLAS kernel for the subtraction assignment y -= x * A,
// enabled when UseDoublePrecisionKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_dgemv is invoked for a column-major matrix in
// transposed mode with alpha = -1 and beta = 1, so the product is subtracted
// from the existing contents of y.
1313 template<
typename VT1
1316 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1> >::Type
1317 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1319 using boost::numeric_cast;
1325 const int M ( numeric_cast<int>( A.rows() ) );
1326 const int N ( numeric_cast<int>( A.columns() ) );
1327 const int lda( numeric_cast<int>( A.spacing() ) );
1329 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -1.0,
1330 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single precision complex BLAS kernel for the subtraction assignment
// y -= x * A, enabled when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_cgemv takes its scaling factors by address, hence
// the local alpha = (-1,0) and beta = (1,0); the product is subtracted from
// the existing contents of y.
1351 template<
typename VT1
1354 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1355 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1357 using boost::numeric_cast;
1366 const int M ( numeric_cast<int>( A.rows() ) );
1367 const int N ( numeric_cast<int>( A.columns() ) );
1368 const int lda( numeric_cast<int>( A.spacing() ) );
1369 const complex<float> alpha( -1.0F, 0.0F );
1370 const complex<float> beta ( 1.0F, 0.0F );
1372 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1373 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS kernel for the subtraction assignment
// y -= x * A, enabled when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_zgemv takes its scaling factors by address, hence
// the local alpha = (-1,0) and beta = (1,0); the product is subtracted from
// the existing contents of y.
1394 template<
typename VT1
1397 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
1398 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A )
1400 using boost::numeric_cast;
1409 const int M ( numeric_cast<int>( A.rows() ) );
1410 const int N ( numeric_cast<int>( A.columns() ) );
1411 const int lda( numeric_cast<int>( A.spacing() ) );
1412 const complex<double> alpha( -1.0, 0.0 );
1413 const complex<double> beta ( 1.0, 0.0 );
1415 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
1416 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1439 template<
typename VT1 >
1488 template<
typename VT
1492 :
public DenseVector< DVecScalarMultExpr< TDVecTDMatMultExpr<VT,MT>, ST, true >, true >
1493 ,
private VecScalarMultExpr
1494 ,
private Computation
// Convenience aliases for the scaled expression: VMM is the wrapped
// vector/matrix multiplication expression and RES its result type; VRT/MRT,
// VET/MET and VCT/MCT are the result, element and composite types of the
// vector operand type VT and the matrix operand type MT.
1498 typedef TDVecTDMatMultExpr<VT,MT> VMM;
1499 typedef typename VMM::ResultType RES;
1500 typedef typename VT::ResultType
VRT;
1501 typedef typename MT::ResultType
MRT;
1502 typedef typename VRT::ElementType
VET;
1503 typedef typename MRT::ElementType
MET;
1504 typedef typename VT::CompositeType
VCT;
1505 typedef typename MT::CompositeType
MCT;
// Compile-time flag: set when the matrix operand type is a computation, is not
// vectorizable, vector and matrix element types are identical, and that
// element type is BLAS compatible. It is used below to choose between
// 'const MRT' and MCT for the matrix operand type RT — presumably to evaluate
// the matrix into a temporary so a BLAS kernel can be used; confirm.
1510 enum { evaluate = IsComputation<MT>::value && !MT::vectorizable &&
1511 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// Compile-time switch for the single precision BLAS kernel (cblas_sgemv) of
// the scaled expression: value is non-zero only if T1, T2 and T3 are all
// vectorizable, each of their element types satisfies IsFloat, and the
// scalar type T4 is not complex.
1519 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1520 struct UseSinglePrecisionKernel {
1521 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1522 IsFloat<typename T1::ElementType>::value &&
1523 IsFloat<typename T2::ElementType>::value &&
1524 IsFloat<typename T3::ElementType>::value &&
1525 !IsComplex<T4>::value };
// Compile-time switch for the double precision BLAS kernel (cblas_dgemv) of
// the scaled expression: value is non-zero only if T1, T2 and T3 are all
// vectorizable, each of their element types satisfies IsDouble, and the
// scalar type T4 is not complex.
1534 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1535 struct UseDoublePrecisionKernel {
1536 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1537 IsDouble<typename T1::ElementType>::value &&
1538 IsDouble<typename T2::ElementType>::value &&
1539 IsDouble<typename T3::ElementType>::value &&
1540 !IsComplex<T4>::value };
// Compile-time switch for the single precision complex BLAS kernel
// (cblas_cgemv) of the scaled expression: value is non-zero only if T1, T2 and
// T3 are all vectorizable and each element type is exactly complex<float>.
// Unlike the real-valued switches, the scalar type is not part of the test.
1549 template<
typename T1,
typename T2,
typename T3 >
1550 struct UseSinglePrecisionComplexKernel {
1551 typedef complex<float> Type;
1552 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1553 IsSame<typename T1::ElementType,Type>::value &&
1554 IsSame<typename T2::ElementType,Type>::value &&
1555 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the double precision complex BLAS kernel
// (cblas_zgemv) of the scaled expression: value is non-zero only if T1, T2 and
// T3 are all vectorizable and each element type is exactly complex<double>.
// Unlike the real-valued switches, the scalar type is not part of the test.
1564 template<
typename T1,
typename T2,
typename T3 >
1565 struct UseDoublePrecisionComplexKernel {
1566 typedef complex<double> Type;
1567 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1568 IsSame<typename T1::ElementType,Type>::value &&
1569 IsSame<typename T2::ElementType,Type>::value &&
1570 IsSame<typename T3::ElementType,Type>::value };
// Compile-time switch for the non-BLAS fallback of the scaled expression:
// value is non-zero if BLAZE_BLAS_MODE is off, or if none of the four BLAS
// kernel switches applies. The scalar type T4 is only forwarded to the two
// real-valued switches.
1578 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1579 struct UseDefaultKernel {
1580 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1581 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1582 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1583 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time switch for the hand-vectorized default kernel of the scaled
// expression: value is non-zero only if T1, T2 and T3 are all vectorizable,
// share one element type, the scalar type T4 is that same type, and
// IntrinsicTrait reports both addition and multiplication for it.
1592 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1593 struct UseVectorizedDefaultKernel {
1594 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1595 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1596 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1597 IsSame<typename T1::ElementType,T4>::value &&
1598 IntrinsicTrait<typename T1::ElementType>::addition &&
1599 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type definitions of the scaled expression:
//  - This:          the type of this expression instance.
//  - ResultType:    MultTrait of the product's result type RES and the scalar type ST.
//  - ElementType:   element type of ResultType.
//  - IntrinsicType: intrinsic type associated with ElementType via IntrinsicTrait.
//  - LeftOperand:   the (const) wrapped vector/matrix multiplication expression.
//  - LT:            'const VRT' if VT is a computation, otherwise VCT.
//  - RT:            'const MRT' if the evaluate flag is set, otherwise MCT.
1605 typedef DVecScalarMultExpr<VMM,ST,true>
This;
1606 typedef typename MultTrait<RES,ST>::Type
ResultType;
1608 typedef typename ResultType::ElementType
ElementType;
1609 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1614 typedef const TDVecTDMatMultExpr<VT,MT>
LeftOperand;
1620 typedef typename SelectType< IsComputation<VT>::value,
const VRT,
VCT >::Type
LT;
1623 typedef typename SelectType< evaluate, const MRT, MCT >::Type
RT;
// Compilation flag: the scaled expression reports itself as not vectorizable.
1628 enum { vectorizable = 0 };
// Constructor taking the vector/matrix multiplication expression and the
// scalar factor. The initializer list and body are not visible in this
// excerpt; presumably they are stored in vector_ and scalar_ — confirm.
1637 explicit inline DVecScalarMultExpr(
const VMM& vector, ST scalar )
// Element access (function header not visible here; presumably operator[]):
// the element of the wrapped product at 'index', multiplied by the scalar.
1651 return vector_[index] * scalar_;
// Returns the size of the scaled expression, which is the size reported by
// the wrapped vector/matrix product.
1660 inline size_t size()
const {
1661 return vector_.size();
// Reports whether the expression can alias with the given address; the query
// is delegated unchanged to the wrapped vector/matrix product.
1691 template<
typename T >
1692 inline bool canAlias(
const T* alias )
const {
1693 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the given address; the query
// is delegated unchanged to the wrapped vector/matrix product.
1703 template<
typename T >
1704 inline bool isAliased(
const T* alias )
const {
1705 return vector_.isAliased( alias );
// Fragment of the assignment of the scaled product to a dense vector
// (most of the body is not visible in this excerpt).
1727 template<
typename VT1
1729 friend inline void assign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the two operands of the wrapped vector/matrix product.
1735 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
1736 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Special cases: a matrix without rows, then a matrix without columns
// (the statements inside these branches are not shown).
1738 if( right.rows() == 0UL ) {
1742 else if( right.columns() == 0UL ) {
// Kernel selection; only the first term of the condition is visible. One of
// the two calls below is taken, both receiving the scalar factor.
1754 if( ( IsComputation<MT>::value && !evaluate ) ||
1756 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, x, A, rhs.scalar_ );
1758 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Non-vectorized default kernel for y = x * A * scalar: selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold, and implemented
// by handing the scaled product expression to the target's own assign().
1776 template<
typename VT1
1780 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1781 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1783 y.assign( x * A * scalar );
// Vectorized default kernel for the assignment y = x * A * scalar, selected
// when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds.
// Columns of A are processed in blocks of 8, 4, 3 and 2, followed by a
// single-column pass. For each block the products are accumulated in
// IntrinsicType values over the rows (step IT::size); each reduced sum() is
// multiplied by the scalar and stored in the matching element of y.
// NOTE(review): several original lines are missing from this excerpt (e.g. the
// declarations of j and x1 and some accumulator declarations); x1 is
// presumably the intrinsic load of x at row i — confirm.
// NOTE(review): the accumulators are default-constructed and then accumulated
// into, which presumably relies on IntrinsicType zero-initializing — confirm.
1801 template<
typename VT1
1805 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1806 selectDefaultAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1808 typedef IntrinsicTrait<ElementType> IT;
1810 const size_t M( A.rows() );
1811 const size_t N( A.columns() );
// Blocks of eight columns.
1815 for( ; (j+8UL) <= N; j+=8UL ) {
1816 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1817 for(
size_t i=0UL; i<M; i+=IT::size ) {
1819 xmm1 = xmm1 + x1 * A.get(i,j );
1820 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1821 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
1822 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
1823 xmm5 = xmm5 + x1 * A.get(i,j+4UL);
1824 xmm6 = xmm6 + x1 * A.get(i,j+5UL);
1825 xmm7 = xmm7 + x1 * A.get(i,j+6UL);
1826 xmm8 = xmm8 + x1 * A.get(i,j+7UL);
1828 y[j ] =
sum( xmm1 ) * scalar;
1829 y[j+1UL] =
sum( xmm2 ) * scalar;
1830 y[j+2UL] =
sum( xmm3 ) * scalar;
1831 y[j+3UL] =
sum( xmm4 ) * scalar;
1832 y[j+4UL] =
sum( xmm5 ) * scalar;
1833 y[j+5UL] =
sum( xmm6 ) * scalar;
1834 y[j+6UL] =
sum( xmm7 ) * scalar;
1835 y[j+7UL] =
sum( xmm8 ) * scalar;
// Blocks of four columns.
1837 for( ; (j+4UL) <= N; j+=4UL ) {
1839 for(
size_t i=0UL; i<M; i+=IT::size ) {
1841 xmm1 = xmm1 + x1 * A.get(i,j );
1842 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1843 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
1844 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
1846 y[j ] =
sum( xmm1 ) * scalar;
1847 y[j+1UL] =
sum( xmm2 ) * scalar;
1848 y[j+2UL] =
sum( xmm3 ) * scalar;
1849 y[j+3UL] =
sum( xmm4 ) * scalar;
// Blocks of three columns.
1851 for( ; (j+3UL) <= N; j+=3UL ) {
1853 for(
size_t i=0UL; i<M; i+=IT::size ) {
1855 xmm1 = xmm1 + x1 * A.get(i,j );
1856 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1857 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
1859 y[j ] =
sum( xmm1 ) * scalar;
1860 y[j+1UL] =
sum( xmm2 ) * scalar;
1861 y[j+2UL] =
sum( xmm3 ) * scalar;
// Blocks of two columns.
1863 for( ; (j+2UL) <= N; j+=2UL ) {
1865 for(
size_t i=0UL; i<M; i+=IT::size ) {
1867 xmm1 = xmm1 + x1 * A.get(i,j );
1868 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
1870 y[j ] =
sum( xmm1 ) * scalar;
1871 y[j+1UL] =
sum( xmm2 ) * scalar;
// Single remaining column (the enclosing condition is not visible here).
1875 for(
size_t i=0UL; i<M; i+=IT::size ) {
1876 xmm1 = xmm1 + A.get(i,j) * x.get(i);
1878 y[j] =
sum( xmm1 ) * scalar;
// BLAS assignment kernel fallback of the scaled expression: enabled when
// UseDefaultKernel<VT1,VT2,MT1,ST2> holds, in which case the work (including
// the scalar) is forwarded to selectDefaultAssignKernel().
1896 template<
typename VT1
1900 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
1901 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1903 selectDefaultAssignKernel( y, x, A, scalar );
// Single precision BLAS kernel for the assignment y = x * A * scalar, enabled
// when UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_sgemv is invoked for a column-major matrix in
// transposed mode with the scalar as alpha and beta = 0, so the previous
// contents of y do not contribute.
1922 template<
typename VT1
1926 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
1927 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1929 using boost::numeric_cast;
1935 const int M ( numeric_cast<int>( A.rows() ) );
1936 const int N ( numeric_cast<int>( A.columns() ) );
1937 const int lda( numeric_cast<int>( A.spacing() ) );
1939 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
1940 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double precision BLAS kernel for the assignment y = x * A * scalar, enabled
// when UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_dgemv is invoked for a column-major matrix in
// transposed mode with the scalar as alpha and beta = 0, so the previous
// contents of y do not contribute.
1960 template<
typename VT1
1964 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
1965 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
1967 using boost::numeric_cast;
1973 const int M ( numeric_cast<int>( A.rows() ) );
1974 const int N ( numeric_cast<int>( A.columns() ) );
1975 const int lda( numeric_cast<int>( A.spacing() ) );
1977 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
1978 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single precision complex BLAS kernel for the assignment y = x * A * scalar,
// enabled when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. The scalar is converted to complex<float> for alpha,
// since cblas_cgemv takes its scaling factors by address; beta = (0,0), so the
// previous contents of y do not contribute.
1999 template<
typename VT1
2003 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2004 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2006 using boost::numeric_cast;
2015 const int M ( numeric_cast<int>( A.rows() ) );
2016 const int N ( numeric_cast<int>( A.columns() ) );
2017 const int lda( numeric_cast<int>( A.spacing() ) );
2018 const complex<float> alpha( scalar );
2019 const complex<float> beta ( 0.0F, 0.0F );
2021 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2022 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS kernel for the assignment y = x * A * scalar,
// enabled when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. The scalar is converted to complex<double> for alpha,
// since cblas_zgemv takes its scaling factors by address; beta = (0,0), so the
// previous contents of y do not contribute.
2043 template<
typename VT1
2047 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2048 selectBlasAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2050 using boost::numeric_cast;
2059 const int M ( numeric_cast<int>( A.rows() ) );
2060 const int N ( numeric_cast<int>( A.columns() ) );
2061 const int lda( numeric_cast<int>( A.spacing() ) );
2062 const complex<double> alpha( scalar );
2063 const complex<double> beta ( 0.0, 0.0 );
2065 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2066 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Assignment of the scaled product to a sparse vector.
// NOTE(review): only the signature is visible in this excerpt; the body is
// not shown.
2083 template<
typename VT1
2085 friend inline void assign( SparseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fragment of the addition assignment of the scaled product to a dense vector
// (most of the body is not visible in this excerpt).
2112 template<
typename VT1
2114 friend inline void addAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the two operands of the wrapped vector/matrix product.
2120 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2121 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Degenerate case: the matrix operand has no rows or no columns
// (the statement inside this branch is not shown).
2123 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel selection; only the first term of the condition is visible. One of
// the two calls below is taken, both receiving the scalar factor.
2135 if( ( IsComputation<MT>::value && !evaluate ) ||
2137 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
2139 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Non-vectorized default kernel for y += x * A * scalar: selected when
// UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold, and implemented
// by handing the scaled product expression to the target's own addAssign().
2157 template<
typename VT1
2161 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2162 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2164 y.addAssign( x * A * scalar );
// Vectorized default kernel for the addition assignment y += x * A * scalar,
// selected when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds.
// Columns of A are processed in blocks of 8, 4, 3 and 2, followed by a
// single-column pass. For each block the products are accumulated in
// IntrinsicType values over the rows (step IT::size); each reduced sum() is
// multiplied by the scalar and added to the matching element of y.
// NOTE(review): several original lines are missing from this excerpt (e.g. the
// declarations of j and x1 and some accumulator declarations); x1 is
// presumably the intrinsic load of x at row i — confirm.
// NOTE(review): the accumulators are default-constructed and then accumulated
// into, which presumably relies on IntrinsicType zero-initializing — confirm.
2182 template<
typename VT1
2186 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2187 selectDefaultAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2189 typedef IntrinsicTrait<ElementType> IT;
2191 const size_t M( A.rows() );
2192 const size_t N( A.columns() );
// Blocks of eight columns.
2196 for( ; (j+8UL) <= N; j+=8UL ) {
2197 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2198 for(
size_t i=0UL; i<M; i+=IT::size ) {
2200 xmm1 = xmm1 + x1 * A.get(i,j );
2201 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2202 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
2203 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
2204 xmm5 = xmm5 + x1 * A.get(i,j+4UL);
2205 xmm6 = xmm6 + x1 * A.get(i,j+5UL);
2206 xmm7 = xmm7 + x1 * A.get(i,j+6UL);
2207 xmm8 = xmm8 + x1 * A.get(i,j+7UL);
2209 y[j ] +=
sum( xmm1 ) * scalar;
2210 y[j+1UL] +=
sum( xmm2 ) * scalar;
2211 y[j+2UL] +=
sum( xmm3 ) * scalar;
2212 y[j+3UL] +=
sum( xmm4 ) * scalar;
2213 y[j+4UL] +=
sum( xmm5 ) * scalar;
2214 y[j+5UL] +=
sum( xmm6 ) * scalar;
2215 y[j+6UL] +=
sum( xmm7 ) * scalar;
2216 y[j+7UL] +=
sum( xmm8 ) * scalar;
// Blocks of four columns.
2218 for( ; (j+4UL) <= N; j+=4UL ) {
2220 for(
size_t i=0UL; i<M; i+=IT::size ) {
2222 xmm1 = xmm1 + x1 * A.get(i,j );
2223 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2224 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
2225 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
2227 y[j ] +=
sum( xmm1 ) * scalar;
2228 y[j+1UL] +=
sum( xmm2 ) * scalar;
2229 y[j+2UL] +=
sum( xmm3 ) * scalar;
2230 y[j+3UL] +=
sum( xmm4 ) * scalar;
// Blocks of three columns.
2232 for( ; (j+3UL) <= N; j+=3UL ) {
2234 for(
size_t i=0UL; i<M; i+=IT::size ) {
2236 xmm1 = xmm1 + x1 * A.get(i,j );
2237 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2238 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
2240 y[j ] +=
sum( xmm1 ) * scalar;
2241 y[j+1UL] +=
sum( xmm2 ) * scalar;
2242 y[j+2UL] +=
sum( xmm3 ) * scalar;
// Blocks of two columns.
2244 for( ; (j+2UL) <= N; j+=2UL ) {
2246 for(
size_t i=0UL; i<M; i+=IT::size ) {
2248 xmm1 = xmm1 + x1 * A.get(i,j );
2249 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2251 y[j ] +=
sum( xmm1 ) * scalar;
2252 y[j+1UL] +=
sum( xmm2 ) * scalar;
// Single remaining column (the enclosing condition is not visible here).
2256 for(
size_t i=0UL; i<M; i+=IT::size ) {
2257 xmm1 = xmm1 + A.get(i,j) * x.get(i);
2259 y[j] +=
sum( xmm1 ) * scalar;
// BLAS addition assignment kernel fallback of the scaled expression: enabled
// when UseDefaultKernel<VT1,VT2,MT1,ST2> holds, in which case the work
// (including the scalar) is forwarded to selectDefaultAddAssignKernel().
2278 template<
typename VT1
2282 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2283 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2285 selectDefaultAddAssignKernel( y, x, A, scalar );
// Single precision BLAS kernel for the addition assignment
// y += x * A * scalar, enabled when UseSinglePrecisionKernel<VT1,VT2,MT1,ST2>
// holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_sgemv is invoked for a column-major matrix in
// transposed mode with the scalar as alpha and beta = 1, so the scaled product
// is added to the existing contents of y.
2304 template<
typename VT1
2308 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2309 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2311 using boost::numeric_cast;
2317 const int M ( numeric_cast<int>( A.rows() ) );
2318 const int N ( numeric_cast<int>( A.columns() ) );
2319 const int lda( numeric_cast<int>( A.spacing() ) );
2321 cblas_sgemv( CblasColMajor, CblasTrans, M, N, scalar,
2322 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double precision BLAS kernel for the addition assignment
// y += x * A * scalar, enabled when UseDoublePrecisionKernel<VT1,VT2,MT1,ST2>
// holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. cblas_dgemv is invoked for a column-major matrix in
// transposed mode with the scalar as alpha and beta = 1, so the scaled product
// is added to the existing contents of y.
2342 template<
typename VT1
2346 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2347 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2349 using boost::numeric_cast;
2355 const int M ( numeric_cast<int>( A.rows() ) );
2356 const int N ( numeric_cast<int>( A.columns() ) );
2357 const int lda( numeric_cast<int>( A.spacing() ) );
2359 cblas_dgemv( CblasColMajor, CblasTrans, M, N, scalar,
2360 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single precision complex BLAS kernel for the addition assignment
// y += x * A * scalar, enabled when
// UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. The scalar is converted to complex<float> for alpha,
// since cblas_cgemv takes its scaling factors by address; beta = (1,0), so the
// scaled product is added to the existing contents of y.
2381 template<
typename VT1
2385 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2386 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2388 using boost::numeric_cast;
2397 const int M ( numeric_cast<int>( A.rows() ) );
2398 const int N ( numeric_cast<int>( A.columns() ) );
2399 const int lda( numeric_cast<int>( A.spacing() ) );
2400 const complex<float> alpha( scalar );
2401 const complex<float> beta ( 1.0F, 0.0F );
2403 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2404 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double precision complex BLAS kernel for the addition assignment
// y += x * A * scalar, enabled when
// UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Dimensions and A.spacing() (leading dimension) are converted to int via
// boost::numeric_cast. The scalar is converted to complex<double> for alpha,
// since cblas_zgemv takes its scaling factors by address; beta = (1,0), so the
// scaled product is added to the existing contents of y.
2425 template<
typename VT1
2429 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2430 selectBlasAddAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2432 using boost::numeric_cast;
2441 const int M ( numeric_cast<int>( A.rows() ) );
2442 const int N ( numeric_cast<int>( A.columns() ) );
2443 const int lda( numeric_cast<int>( A.spacing() ) );
2444 const complex<double> alpha( scalar );
2445 const complex<double> beta ( 1.0, 0.0 );
2447 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2448 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Fragment of the subtraction assignment of the scaled product to a dense
// vector (most of the body is not visible in this excerpt).
2469 template<
typename VT1
2471 friend inline void subAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the two operands of the wrapped vector/matrix product.
2477 typename VMM::LeftOperand left ( rhs.vector_.leftOperand() );
2478 typename VMM::RightOperand right( rhs.vector_.rightOperand() );
// Degenerate case: the matrix operand has no rows or no columns
// (the statement inside this branch is not shown).
2480 if( right.rows() == 0UL || right.columns() == 0UL ) {
// Kernel selection; only the first term of the condition is visible. One of
// the two calls below is taken, both receiving the scalar factor.
2492 if( ( IsComputation<MT>::value && !evaluate ) ||
2494 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
2496 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, x, A, rhs.scalar_ );
// Default (non-vectorized) subtraction assignment kernel.
// Active when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> does NOT hold (DisableIf).
2514 template<
typename VT1
2518 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2519 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
// Delegates to the target's own subAssign() with the rebuilt expression x * A * scalar.
2521 y.subAssign( x * A * scalar );
// Vectorized default subtraction assignment kernel.
// Active when UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> holds (EnableIf).
// For every column j of A an intrinsic accumulator gathers the products of the
// vector elements with that column; its horizontal sum, multiplied by scalar,
// is then subtracted from y[j]. Columns are handled in groups of 8, 4, 3, 2
// and finally one at a time.
2539 template<
typename VT1
2543 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2544 selectDefaultSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2546 typedef IntrinsicTrait<ElementType> IT;
2548 const size_t M( A.rows() );
2549 const size_t N( A.columns() );
// Eight columns per pass.
// NOTE(review): the column index j and the per-iteration value x1 are declared
// on lines not visible in this span; x1 presumably holds x.get(i) -- confirm.
2553 for( ; (j+8UL) <= N; j+=8UL ) {
2554 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
// Walk the rows in steps of IT::size (presumably the element count of one
// intrinsic vector -- confirm against IntrinsicTrait).
2555 for(
size_t i=0UL; i<M; i+=IT::size ) {
2557 xmm1 = xmm1 + x1 * A.get(i,j );
2558 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2559 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
2560 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
2561 xmm5 = xmm5 + x1 * A.get(i,j+4UL);
2562 xmm6 = xmm6 + x1 * A.get(i,j+5UL);
2563 xmm7 = xmm7 + x1 * A.get(i,j+6UL);
2564 xmm8 = xmm8 + x1 * A.get(i,j+7UL);
// Reduce each accumulator, scale it and subtract it from the matching target element.
2566 y[j ] -=
sum( xmm1 ) * scalar;
2567 y[j+1UL] -=
sum( xmm2 ) * scalar;
2568 y[j+2UL] -=
sum( xmm3 ) * scalar;
2569 y[j+3UL] -=
sum( xmm4 ) * scalar;
2570 y[j+4UL] -=
sum( xmm5 ) * scalar;
2571 y[j+5UL] -=
sum( xmm6 ) * scalar;
2572 y[j+6UL] -=
sum( xmm7 ) * scalar;
2573 y[j+7UL] -=
sum( xmm8 ) * scalar;
// Four columns per pass.
2575 for( ; (j+4UL) <= N; j+=4UL ) {
2577 for(
size_t i=0UL; i<M; i+=IT::size ) {
2579 xmm1 = xmm1 + x1 * A.get(i,j );
2580 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2581 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
2582 xmm4 = xmm4 + x1 * A.get(i,j+3UL);
2584 y[j ] -=
sum( xmm1 ) * scalar;
2585 y[j+1UL] -=
sum( xmm2 ) * scalar;
2586 y[j+2UL] -=
sum( xmm3 ) * scalar;
2587 y[j+3UL] -=
sum( xmm4 ) * scalar;
// Three columns per pass.
2589 for( ; (j+3UL) <= N; j+=3UL ) {
2591 for(
size_t i=0UL; i<M; i+=IT::size ) {
2593 xmm1 = xmm1 + x1 * A.get(i,j );
2594 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2595 xmm3 = xmm3 + x1 * A.get(i,j+2UL);
2597 y[j ] -=
sum( xmm1 ) * scalar;
2598 y[j+1UL] -=
sum( xmm2 ) * scalar;
2599 y[j+2UL] -=
sum( xmm3 ) * scalar;
// Two columns per pass.
2601 for( ; (j+2UL) <= N; j+=2UL ) {
2603 for(
size_t i=0UL; i<M; i+=IT::size ) {
2605 xmm1 = xmm1 + x1 * A.get(i,j );
2606 xmm2 = xmm2 + x1 * A.get(i,j+1UL);
2608 y[j ] -=
sum( xmm1 ) * scalar;
2609 y[j+1UL] -=
sum( xmm2 ) * scalar;
// Single column; the vector element is read directly through x.get(i) here.
2613 for(
size_t i=0UL; i<M; i+=IT::size ) {
2614 xmm1 = xmm1 + A.get(i,j) * x.get(i);
2616 y[j] -=
sum( xmm1 ) * scalar;
// Fallback overload of the BLAS subtraction assignment kernel.
// Enabled when UseDefaultKernel<VT1,VT2,MT1,ST2> holds; no BLAS routine is
// called, the arguments are passed on unchanged to the default kernel.
2636 template<
typename VT1
2640 static inline typename EnableIf< UseDefaultKernel<VT1,VT2,MT1,ST2> >::Type
2641 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2643 selectDefaultSubAssignKernel( y, x, A, scalar );
// BLAS-based subtraction assignment kernel for single precision operands.
// Enabled (via EnableIf) only when UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// Subtracts the scaled product from the target vector: y -= scalar * ( x * A ).
2662 template<
typename VT1
2666 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2667 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2669 using boost::numeric_cast;
// CBLAS expects int dimensions; the sizes are converted through numeric_cast.
2675 const int M ( numeric_cast<int>( A.rows() ) );
2676 const int N ( numeric_cast<int>( A.columns() ) );
2677 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed as an accumulation with a negated factor:
// y := (-scalar)*A^T*x + 1.0F*y (CblasTrans on column-major storage, unit strides).
2679 cblas_sgemv( CblasColMajor, CblasTrans, M, N, -scalar,
2680 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS-based subtraction assignment kernel for double precision operands.
// Enabled (via EnableIf) only when UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> holds.
// Subtracts the scaled product from the target vector: y -= scalar * ( x * A ).
2700 template<
typename VT1
2704 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,VT2,MT1,ST2> >::Type
2705 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2707 using boost::numeric_cast;
// CBLAS expects int dimensions; the sizes are converted through numeric_cast.
2713 const int M ( numeric_cast<int>( A.rows() ) );
2714 const int N ( numeric_cast<int>( A.columns() ) );
2715 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed as an accumulation with a negated factor:
// y := (-scalar)*A^T*x + 1.0*y (CblasTrans on column-major storage, unit strides).
2717 cblas_dgemv( CblasColMajor, CblasTrans, M, N, -scalar,
2718 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS-based subtraction assignment kernel for single precision complex operands.
// Enabled (via EnableIf) only when UseSinglePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Subtracts the scaled product from the target vector: y -= scalar * ( x * A ).
2740 template<
typename VT1
2744 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2745 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2747 using boost::numeric_cast;
// CBLAS expects int dimensions; the sizes are converted through numeric_cast.
2756 const int M ( numeric_cast<int>( A.rows() ) );
2757 const int N ( numeric_cast<int>( A.columns() ) );
2758 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha carries the negated scalar so that gemv's accumulation becomes a
// subtraction; beta = 1 + 0i keeps the current content of y. Both are passed by address.
2759 const complex<float> alpha( -scalar );
2760 const complex<float> beta ( 1.0F, 0.0F );
// y := alpha*A^T*x + beta*y (CblasTrans on column-major storage, unit strides).
2762 cblas_cgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2763 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS-based subtraction assignment kernel for double precision complex operands.
// Enabled (via EnableIf) only when UseDoublePrecisionComplexKernel<VT1,VT2,MT1> holds.
// Subtracts the scaled product from the target vector: y -= scalar * ( x * A ).
2785 template<
typename VT1
2789 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,VT2,MT1> >::Type
2790 selectBlasSubAssignKernel( VT1& y,
const VT2& x,
const MT1& A, ST2 scalar )
2792 using boost::numeric_cast;
// CBLAS expects int dimensions; the sizes are converted through numeric_cast.
2801 const int M ( numeric_cast<int>( A.rows() ) );
2802 const int N ( numeric_cast<int>( A.columns() ) );
2803 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha carries the negated scalar so that gemv's accumulation becomes a
// subtraction; beta = 1 + 0i keeps the current content of y. Both are passed by address.
2804 const complex<double> alpha( -scalar );
2805 const complex<double> beta ( 1.0, 0.0 );
// y := alpha*A^T*x + beta*y (CblasTrans on column-major storage, unit strides).
2807 cblas_zgemv( CblasColMajor, CblasTrans, M, N, &alpha,
2808 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Friend overload of multAssign() taking a dense vector target (lhs) and a
// scaled vector-matrix product expression (rhs).
// NOTE(review): only the signature is present at this point; the remaining
// template parameters and the body should be checked against the complete header.
2829 template<
typename VT1
2831 friend inline void multAssign( DenseVector<VT1,TF>& lhs,
const DVecScalarMultExpr& rhs )
2904 template<
typename T1
2906 inline const typename DisableIf< IsMatMatMultExpr<T2>, TDVecTDMatMultExpr<T1,T2> >::Type
2911 if( (~vec).
size() != (~mat).
rows() )
2912 throw std::invalid_argument(
"Vector and matrix sizes do not match" );