35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
44 #include <boost/cast.hpp>
// Expression class template TDMatDVecMultExpr. As declared below it derives
// publicly from DenseVector< TDMatDVecMultExpr<MT,VT>, false > and privately
// from MatVecMultExpr and Computation.
// NOTE(review): only the MT template parameter is visible in this excerpt; the
// base-class list also names VT, so a second parameter is presumably declared
// on a line not shown here -- confirm against the full header.
100 template<
typename MT
102 class TDMatDVecMultExpr :
public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
103 ,
private MatVecMultExpr
104 ,
private Computation
// Compile-time predicate for the single-precision BLAS kernel.
// 'value' is non-zero only when T1, T2 and T3 all report 'vectorizable' and the
// ElementType of each satisfies IsFloat. It is the EnableIf condition of the
// kernels in this class that call cblas_sgemv.
133 template<
typename T1,
typename T2,
typename T3 >
134 struct UseSinglePrecisionKernel {
135 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
136 IsFloat<typename T1::ElementType>::value &&
137 IsFloat<typename T2::ElementType>::value &&
138 IsFloat<typename T3::ElementType>::value };
// Compile-time predicate for the double-precision BLAS kernel.
// 'value' is non-zero only when T1, T2 and T3 all report 'vectorizable' and the
// ElementType of each satisfies IsDouble. It is the EnableIf condition of the
// kernels in this class that call cblas_dgemv.
149 template<
typename T1,
typename T2,
typename T3 >
150 struct UseDoublePrecisionKernel {
151 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
152 IsDouble<typename T1::ElementType>::value &&
153 IsDouble<typename T2::ElementType>::value &&
154 IsDouble<typename T3::ElementType>::value };
// Compile-time predicate for the single-precision complex BLAS kernel.
// 'value' is non-zero only when T1, T2 and T3 all report 'vectorizable' and the
// ElementType of each is exactly complex<float> (checked with IsSame). It is the
// EnableIf condition of the kernels in this class that call cblas_cgemv.
165 template<
typename T1,
typename T2,
typename T3 >
166 struct UseSinglePrecisionComplexKernel {
167 typedef complex<float> Type;
168 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
169 IsSame<typename T1::ElementType,Type>::value &&
170 IsSame<typename T2::ElementType,Type>::value &&
171 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the double-precision complex BLAS kernel.
// 'value' is non-zero only when T1, T2 and T3 all report 'vectorizable' and the
// ElementType of each is exactly complex<double> (checked with IsSame). It is the
// EnableIf condition of the kernels in this class that call cblas_zgemv.
182 template<
typename T1,
typename T2,
typename T3 >
183 struct UseDoublePrecisionComplexKernel {
184 typedef complex<double> Type;
185 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
186 IsSame<typename T1::ElementType,Type>::value &&
187 IsSame<typename T2::ElementType,Type>::value &&
188 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the non-BLAS fallback.
// 'value' is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none of
// the four BLAS predicates (single, double, complex<float>, complex<double>)
// holds for <T1,T2,T3>. It gates the selectBlas*Kernel overloads that simply
// forward to the corresponding selectDefault*Kernel.
198 template<
typename T1,
typename T2,
typename T3 >
199 struct UseDefaultKernel {
200 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
201 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
202 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
203 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate choosing between the intrinsic-based and the plain
// element-wise default kernels.
// 'value' is non-zero only when T1, T2 and T3 all report 'vectorizable', all
// three share the same ElementType, and IntrinsicTrait of that element type
// reports both 'addition' and 'multiplication'.
214 template<
typename T1,
typename T2,
typename T3 >
215 struct UseVectorizedDefaultKernel {
216 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
217 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
218 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
219 IntrinsicTrait<typename T1::ElementType>::addition &&
220 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Compile-time flags of the expression: both 'vectorizable' and 'smpAssignable'
// are fixed to 0 for this class.
250 enum { vectorizable = 0 };
253 enum { smpAssignable = 0 };
// NOTE(review): fragment of a member-function body whose signature lies outside
// this excerpt (presumably the element-access operator -- confirm).
// Visible logic: the work is done only when mat_ has at least one column; a loop
// advances j from 1 in steps of two while j < end_; a trailing branch covers the
// case end_ < mat_.columns(). 'end_' is not defined in the visible lines.
282 if(
mat_.columns() != 0UL ) {
284 for(
size_t j=1UL; j<
end_; j+=2UL ) {
287 if( end_ <
mat_.columns() ) {
// Alias query templated on the alias type T: reports true if either the matrix
// operand (mat_) or the vector operand (vec_) reports being aliased with 'alias'.
// NOTE(review): the function name line is not visible in this excerpt; by
// position this is presumably canAlias() -- confirm against the full header.
335 template<
typename T >
337 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// Alias query templated on the alias type T: reports true if either the matrix
// operand (mat_) or the vector operand (vec_) reports being aliased with 'alias'.
// NOTE(review): the function name line is not visible in this excerpt; by
// position this is presumably isAliased() -- confirm against the full header.
347 template<
typename T >
349 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
// NOTE(review): fragment of an assignment routine whose signature is not visible
// (presumably the dense-vector assign() -- confirm).
// Visible logic: the empty-rows and the empty-columns cases of rhs.mat_ are
// tested separately (their handling is on lines not shown); otherwise the work is
// dispatched either to selectDefaultAssignKernel or to selectBlasAssignKernel with
// (~lhs, A, x). The condition choosing between the two is not visible here.
373 template<
typename VT1 >
380 if( rhs.mat_.rows() == 0UL ) {
383 else if( rhs.mat_.columns() == 0UL ) {
398 TDMatDVecMultExpr::selectDefaultAssignKernel( ~lhs, A, x );
400 TDMatDVecMultExpr::selectBlasAssignKernel( ~lhs, A, x );
// Element-wise default kernel for y = A * x.
//   y : target vector, A : matrix operand, x : vector operand.
// NOTE(review): the return-type/SFINAE line of this overload is not visible in
// this excerpt.
419 template<
typename VT1
423 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
425 const size_t M( A.rows() );
426 const size_t N( A.columns() );
// iend is M with its lowest bit cleared, i.e. the largest even value <= M.
429 const size_t iend( M &
size_t(-2) );
// First column initialises every y[i]; this reads x[0] and column 0 of A, so it
// relies on A having at least one column.
431 for(
size_t i=0UL; i<M; ++i ) {
432 y[i] = x[0UL] * A(i,0UL);
// Remaining columns are accumulated column by column, two rows per iteration.
434 for(
size_t j=1UL; j<N; ++j ) {
435 for(
size_t i=0UL; i<iend; i+=2UL ) {
436 y[i ] += x[j] * A(i ,j);
437 y[i+1UL] += x[j] * A(i+1UL,j);
// Handles the single leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown -- confirm.
440 y[iend] += x[j] * A(iend,j);
// Intrinsic-based default kernel for y = A * x, enabled when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
// Rows are processed in blocks of 8, 4, 3 and 2 packed vectors of IT::size
// elements, followed by a final loop handling one packed vector at a time. For
// every block the partial sums are accumulated over all N columns and then
// written to y with y.store().
// NOTE(review): the declaration of 'i', the definition of 'x1' and several
// accumulator declarations/stores are on lines not shown in this excerpt; in the
// last loop the multiplier is set( x[j] ), so x1 is presumably the same -- confirm.
461 template<
typename VT1
464 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
465 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
467 typedef IntrinsicTrait<ElementType> IT;
469 const size_t M( A.rows() );
470 const size_t N( A.columns() );
// Blocks of eight packed vectors.
474 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
475 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
476 for(
size_t j=0UL; j<N; ++j ) {
478 xmm1 = xmm1 + A.load(i ,j) * x1;
479 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
480 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
481 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
482 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
483 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
484 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
485 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
488 y.store( i+IT::size , xmm2 );
489 y.store( i+IT::size*2UL, xmm3 );
490 y.store( i+IT::size*3UL, xmm4 );
491 y.store( i+IT::size*4UL, xmm5 );
492 y.store( i+IT::size*5UL, xmm6 );
493 y.store( i+IT::size*6UL, xmm7 );
494 y.store( i+IT::size*7UL, xmm8 );
// Blocks of four packed vectors.
496 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
498 for(
size_t j=0UL; j<N; ++j ) {
500 xmm1 = xmm1 + A.load(i ,j) * x1;
501 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
502 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
503 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
506 y.store( i+IT::size , xmm2 );
507 y.store( i+IT::size*2UL, xmm3 );
508 y.store( i+IT::size*3UL, xmm4 );
// Blocks of three packed vectors.
510 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
512 for(
size_t j=0UL; j<N; ++j ) {
514 xmm1 = xmm1 + A.load(i ,j) * x1;
515 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
516 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
519 y.store( i+IT::size , xmm2 );
520 y.store( i+IT::size*2UL, xmm3 );
// Blocks of two packed vectors.
522 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
524 for(
size_t j=0UL; j<N; ++j ) {
526 xmm1 = xmm1 + A.load(i ,j) * x1;
527 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
530 y.store( i+IT::size, xmm2 );
// Remaining rows, one packed vector at a time.
534 for(
size_t j=0UL; j<N; ++j ) {
535 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
// Fallback overload of the BLAS entry point: when UseDefaultKernel<VT1,MT1,VT2>
// holds, no BLAS routine is used and the call is forwarded unchanged to
// selectDefaultAssignKernel( y, A, x ).
557 template<
typename VT1
560 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
561 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
563 selectDefaultAssignKernel( y, A, x );
// Single-precision BLAS kernel for y = A * x, enabled when
// UseSinglePrecisionKernel<VT1,MT1,VT2> holds.
// Calls cblas_sgemv with column-major storage, no transposition, alpha = 1.0F
// and beta = 0.0F, so the previous content of y does not enter the result.
583 template<
typename VT1
586 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
587 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
589 using boost::numeric_cast;
// The size_t dimensions are converted to the int arguments expected by CBLAS via
// boost::numeric_cast; A.spacing() is passed as the leading dimension.
595 const int M ( numeric_cast<int>( A.rows() ) );
596 const int N ( numeric_cast<int>( A.columns() ) );
597 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are accessed with unit stride (increment 1).
599 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
600 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for y = A * x, enabled when
// UseDoublePrecisionKernel<VT1,MT1,VT2> holds.
// Calls cblas_dgemv with column-major storage, no transposition, alpha = 1.0
// and beta = 0.0, so the previous content of y does not enter the result.
621 template<
typename VT1
624 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
625 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
627 using boost::numeric_cast;
// The size_t dimensions are converted to the int arguments expected by CBLAS via
// boost::numeric_cast; A.spacing() is passed as the leading dimension.
633 const int M ( numeric_cast<int>( A.rows() ) );
634 const int N ( numeric_cast<int>( A.columns() ) );
635 const int lda( numeric_cast<int>( A.spacing() ) );
// x and y are accessed with unit stride (increment 1).
637 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
638 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y = A * x, enabled when
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Calls cblas_cgemv with column-major storage and no transposition.
659 template<
typename VT1
662 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
663 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
665 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
674 const int M ( numeric_cast<int>( A.rows() ) );
675 const int N ( numeric_cast<int>( A.columns() ) );
676 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are passed to CBLAS by address:
// alpha = (1,0) and beta = (0,0), i.e. y is overwritten.
677 const complex<float> alpha( 1.0F, 0.0F );
678 const complex<float> beta ( 0.0F, 0.0F );
680 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
681 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y = A * x, enabled when
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Calls cblas_zgemv with column-major storage and no transposition.
702 template<
typename VT1
705 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
706 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
708 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
717 const int M ( numeric_cast<int>( A.rows() ) );
718 const int N ( numeric_cast<int>( A.columns() ) );
719 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex scaling factors are passed to CBLAS by address:
// alpha = (1,0) and beta = (0,0), i.e. y is overwritten.
720 const complex<double> alpha( 1.0, 0.0 );
721 const complex<double> beta ( 0.0, 0.0 );
723 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
724 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
743 template<
typename VT1 >
// NOTE(review): fragment of an addition-assignment routine whose signature is
// not visible (presumably the dense-vector addAssign() -- confirm).
// Visible logic: a single test covers an rhs.mat_ with zero rows or zero columns
// (its handling is on a line not shown); otherwise a condition starting with
// "MT is a computation and evaluateMatrix is not set" selects between
// selectDefaultAddAssignKernel and selectBlasAddAssignKernel, both invoked with
// (~lhs, A, x). The remainder of that condition is not visible here.
773 template<
typename VT1 >
780 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
792 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
794 TDMatDVecMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x );
796 TDMatDVecMultExpr::selectBlasAddAssignKernel( ~lhs, A, x );
// Element-wise default kernel for y += A * x, used when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> does NOT hold (DisableIf).
//   y : target vector, A : matrix operand, x : vector operand.
815 template<
typename VT1
818 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
819 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
821 const size_t M( A.rows() );
822 const size_t N( A.columns() );
// iend is M with its lowest bit cleared, i.e. the largest even value <= M.
825 const size_t iend( M &
size_t(-2) );
// All columns are accumulated into y, two rows per inner iteration.
827 for(
size_t j=0UL; j<N; ++j ) {
828 for(
size_t i=0UL; i<iend; i+=2UL ) {
829 y[i ] += x[j] * A(i ,j);
830 y[i+1UL] += x[j] * A(i+1UL,j);
// Handles the single leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown -- confirm.
833 y[iend] += x[j] * A(iend,j);
// Intrinsic-based default kernel for y += A * x, enabled when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
// Rows are processed in blocks of 8, 4, 3 and 2 packed vectors of IT::size
// elements, followed by a final loop handling one packed vector at a time. For
// every block the products A.load(...) * x1 are added to the accumulators over
// all N columns and the accumulators are then written to y with y.store().
// NOTE(review): the declaration of 'i', the definition of 'x1' and the
// declaration/initialisation of the accumulators are on lines not shown here;
// for an add-assign they are presumably initialised from y -- confirm.
854 template<
typename VT1
857 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
858 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
860 typedef IntrinsicTrait<ElementType> IT;
862 const size_t M( A.rows() );
863 const size_t N( A.columns() );
// Blocks of eight packed vectors.
867 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
876 for(
size_t j=0UL; j<N; ++j ) {
878 xmm1 = xmm1 + A.load(i ,j) * x1;
879 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
880 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
881 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
882 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
883 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
884 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
885 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
888 y.store( i+IT::size , xmm2 );
889 y.store( i+IT::size*2UL, xmm3 );
890 y.store( i+IT::size*3UL, xmm4 );
891 y.store( i+IT::size*4UL, xmm5 );
892 y.store( i+IT::size*5UL, xmm6 );
893 y.store( i+IT::size*6UL, xmm7 );
894 y.store( i+IT::size*7UL, xmm8 );
// Blocks of four packed vectors.
896 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
901 for(
size_t j=0UL; j<N; ++j ) {
903 xmm1 = xmm1 + A.load(i ,j) * x1;
904 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
905 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
906 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
909 y.store( i+IT::size , xmm2 );
910 y.store( i+IT::size*2UL, xmm3 );
911 y.store( i+IT::size*3UL, xmm4 );
// Blocks of three packed vectors.
913 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
917 for(
size_t j=0UL; j<N; ++j ) {
919 xmm1 = xmm1 + A.load(i ,j) * x1;
920 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
921 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
924 y.store( i+IT::size , xmm2 );
925 y.store( i+IT::size*2UL, xmm3 );
// Blocks of two packed vectors.
927 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
930 for(
size_t j=0UL; j<N; ++j ) {
932 xmm1 = xmm1 + A.load(i ,j) * x1;
933 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
936 y.store( i+IT::size, xmm2 );
// Remaining rows, one packed vector at a time.
940 for(
size_t j=0UL; j<N; ++j ) {
941 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
// Fallback overload of the BLAS entry point for y += A * x: when
// UseDefaultKernel<VT1,MT1,VT2> holds, the call is forwarded unchanged to
// selectDefaultAddAssignKernel( y, A, x ).
963 template<
typename VT1
966 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
967 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
969 selectDefaultAddAssignKernel( y, A, x );
// Single-precision BLAS kernel for y += A * x, enabled when
// UseSinglePrecisionKernel<VT1,MT1,VT2> holds.
// Calls cblas_sgemv with column-major storage, no transposition, alpha = 1.0F
// and beta = 1.0F, so the product is added to the existing content of y.
989 template<
typename VT1
992 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
993 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
995 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1001 const int M ( numeric_cast<int>( A.rows() ) );
1002 const int N ( numeric_cast<int>( A.columns() ) );
1003 const int lda( numeric_cast<int>( A.spacing() ) );
1005 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
1006 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y += A * x, enabled when
// UseDoublePrecisionKernel<VT1,MT1,VT2> holds.
// Calls cblas_dgemv with column-major storage, no transposition, alpha = 1.0
// and beta = 1.0, so the product is added to the existing content of y.
1027 template<
typename VT1
1030 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1031 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1033 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1039 const int M ( numeric_cast<int>( A.rows() ) );
1040 const int N ( numeric_cast<int>( A.columns() ) );
1041 const int lda( numeric_cast<int>( A.spacing() ) );
1043 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
1044 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y += A * x, enabled when
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Calls cblas_cgemv with column-major storage and no transposition.
1065 template<
typename VT1
1068 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1069 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1071 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1080 const int M ( numeric_cast<int>( A.rows() ) );
1081 const int N ( numeric_cast<int>( A.columns() ) );
1082 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = (1,0) and beta = (1,0): the product is added to the existing y.
1083 const complex<float> alpha( 1.0F, 0.0F );
1084 const complex<float> beta ( 1.0F, 0.0F );
1086 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1087 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y += A * x, enabled when
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Calls cblas_zgemv with column-major storage and no transposition.
1108 template<
typename VT1
1111 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1112 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1114 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1123 const int M ( numeric_cast<int>( A.rows() ) );
1124 const int N ( numeric_cast<int>( A.columns() ) );
1125 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = (1,0) and beta = (1,0): the product is added to the existing y.
1126 const complex<double> alpha( 1.0, 0.0 );
1127 const complex<double> beta ( 1.0, 0.0 );
1129 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1130 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// NOTE(review): fragment of a subtraction-assignment routine whose signature is
// not visible (presumably the dense-vector subAssign() -- confirm).
// Visible logic: a single test covers an rhs.mat_ with zero rows or zero columns
// (its handling is on a line not shown); otherwise a condition starting with
// "MT is a computation and evaluateMatrix is not set" selects between
// selectDefaultSubAssignKernel and selectBlasSubAssignKernel, both invoked with
// (~lhs, A, x). The remainder of that condition is not visible here.
1153 template<
typename VT1 >
1160 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1172 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1174 TDMatDVecMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x );
1176 TDMatDVecMultExpr::selectBlasSubAssignKernel( ~lhs, A, x );
// Element-wise default kernel for y -= A * x, used when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> does NOT hold (DisableIf).
//   y : target vector, A : matrix operand, x : vector operand.
1195 template<
typename VT1
1198 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1199 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1201 const size_t M( A.rows() );
1202 const size_t N( A.columns() );
// iend is M with its lowest bit cleared, i.e. the largest even value <= M.
1205 const size_t iend( M &
size_t(-2) );
// All columns are subtracted from y, two rows per inner iteration.
1207 for(
size_t j=0UL; j<N; ++j ) {
1208 for(
size_t i=0UL; i<iend; i+=2UL ) {
1209 y[i ] -= x[j] * A(i ,j);
1210 y[i+1UL] -= x[j] * A(i+1UL,j);
// Handles the single leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown -- confirm.
1213 y[iend] -= x[j] * A(iend,j);
// Intrinsic-based default kernel for y -= A * x, enabled when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> holds.
// Rows are processed in blocks of 8, 4, 3 and 2 packed vectors of IT::size
// elements, followed by a final loop handling one packed vector at a time. For
// every block the products A.load(...) * x1 are subtracted from the accumulators
// over all N columns and the accumulators are then written to y with y.store().
// NOTE(review): the declaration of 'i', the definition of 'x1' and the
// declaration/initialisation of the accumulators are on lines not shown here;
// for a sub-assign they are presumably initialised from y -- confirm.
1234 template<
typename VT1
1237 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1238 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1240 typedef IntrinsicTrait<ElementType> IT;
1242 const size_t M( A.rows() );
1243 const size_t N( A.columns() );
// Blocks of eight packed vectors.
1247 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1256 for(
size_t j=0UL; j<N; ++j ) {
1258 xmm1 = xmm1 - A.load(i ,j) * x1;
1259 xmm2 = xmm2 - A.load(i+IT::size ,j) * x1;
1260 xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
1261 xmm4 = xmm4 - A.load(i+IT::size*3UL,j) * x1;
1262 xmm5 = xmm5 - A.load(i+IT::size*4UL,j) * x1;
1263 xmm6 = xmm6 - A.load(i+IT::size*5UL,j) * x1;
1264 xmm7 = xmm7 - A.load(i+IT::size*6UL,j) * x1;
1265 xmm8 = xmm8 - A.load(i+IT::size*7UL,j) * x1;
1267 y.store( i , xmm1 );
1268 y.store( i+IT::size , xmm2 );
1269 y.store( i+IT::size*2UL, xmm3 );
1270 y.store( i+IT::size*3UL, xmm4 );
1271 y.store( i+IT::size*4UL, xmm5 );
1272 y.store( i+IT::size*5UL, xmm6 );
1273 y.store( i+IT::size*6UL, xmm7 );
1274 y.store( i+IT::size*7UL, xmm8 );
// Blocks of four packed vectors.
1276 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1281 for(
size_t j=0UL; j<N; ++j ) {
1283 xmm1 = xmm1 - A.load(i ,j) * x1;
1284 xmm2 = xmm2 - A.load(i+IT::size ,j) * x1;
1285 xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
1286 xmm4 = xmm4 - A.load(i+IT::size*3UL,j) * x1;
1288 y.store( i , xmm1 );
1289 y.store( i+IT::size , xmm2 );
1290 y.store( i+IT::size*2UL, xmm3 );
1291 y.store( i+IT::size*3UL, xmm4 );
// Blocks of three packed vectors.
1293 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
1297 for(
size_t j=0UL; j<N; ++j ) {
1299 xmm1 = xmm1 - A.load(i ,j) * x1;
1300 xmm2 = xmm2 - A.load(i+IT::size ,j) * x1;
1301 xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
1303 y.store( i , xmm1 );
1304 y.store( i+IT::size , xmm2 );
1305 y.store( i+IT::size*2UL, xmm3 );
// Blocks of two packed vectors.
1307 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1310 for(
size_t j=0UL; j<N; ++j ) {
1312 xmm1 = xmm1 - A.load(i ,j) * x1;
1313 xmm2 = xmm2 - A.load(i+IT::size,j) * x1;
1315 y.store( i , xmm1 );
1316 y.store( i+IT::size, xmm2 );
// Remaining rows, one packed vector at a time.
1320 for(
size_t j=0UL; j<N; ++j ) {
1321 xmm1 = xmm1 - A.load(i,j) *
set( x[j] );
// Fallback overload of the BLAS entry point for y -= A * x: when
// UseDefaultKernel<VT1,MT1,VT2> holds, the call is forwarded unchanged to
// selectDefaultSubAssignKernel( y, A, x ).
1343 template<
typename VT1
1346 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1347 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1349 selectDefaultSubAssignKernel( y, A, x );
// Single-precision BLAS kernel for y -= A * x, enabled when
// UseSinglePrecisionKernel<VT1,MT1,VT2> holds.
// Calls cblas_sgemv with column-major storage, no transposition, alpha = -1.0F
// and beta = 1.0F, so the product is subtracted from the existing content of y.
1369 template<
typename VT1
1372 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1373 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1375 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1381 const int M ( numeric_cast<int>( A.rows() ) );
1382 const int N ( numeric_cast<int>( A.columns() ) );
1383 const int lda( numeric_cast<int>( A.spacing() ) );
1385 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -1.0F,
1386 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y -= A * x, enabled when
// UseDoublePrecisionKernel<VT1,MT1,VT2> holds.
// Calls cblas_dgemv with column-major storage, no transposition, alpha = -1.0
// and beta = 1.0, so the product is subtracted from the existing content of y.
1407 template<
typename VT1
1410 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1411 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1413 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1419 const int M ( numeric_cast<int>( A.rows() ) );
1420 const int N ( numeric_cast<int>( A.columns() ) );
1421 const int lda( numeric_cast<int>( A.spacing() ) );
1423 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -1.0,
1424 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y -= A * x, enabled when
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Calls cblas_cgemv with column-major storage and no transposition.
1445 template<
typename VT1
1448 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1449 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1451 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1460 const int M ( numeric_cast<int>( A.rows() ) );
1461 const int N ( numeric_cast<int>( A.columns() ) );
1462 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = (-1,0) and beta = (1,0): the product is subtracted from the existing y.
1463 const complex<float> alpha( -1.0F, 0.0F );
1464 const complex<float> beta ( 1.0F, 0.0F );
1466 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1467 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y -= A * x, enabled when
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Calls cblas_zgemv with column-major storage and no transposition.
1488 template<
typename VT1
1491 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1492 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1494 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
1503 const int M ( numeric_cast<int>( A.rows() ) );
1504 const int N ( numeric_cast<int>( A.columns() ) );
1505 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = (-1,0) and beta = (1,0): the product is subtracted from the existing y.
1506 const complex<double> alpha( -1.0, 0.0 );
1507 const complex<double> beta ( 1.0, 0.0 );
1509 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1510 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1533 template<
typename VT1 >
// Header of a class template for the scaled expression (matrix * vector) * scalar.
// As declared below it derives publicly from
// DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
// and privately from VecScalarMultExpr and Computation.
// NOTE(review): the line naming the class and the VT/ST template parameters are
// not visible in this excerpt; presumably this is a specialization of
// DVecScalarMultExpr for TDMatDVecMultExpr operands -- confirm.
1583 template<
typename MT
1587 :
public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
1588 ,
private VecScalarMultExpr
1589 ,
private Computation
// MVM names the wrapped matrix/vector multiplication expression type.
1593 typedef TDMatDVecMultExpr<MT,VT> MVM;
// evaluateMatrix is non-zero when MT is a computation, MT is not vectorizable,
// VET and MET are the same type and VET is BLAS compatible.
// NOTE(review): VET and MET are declared on lines not shown here.
1605 enum { evaluateMatrix = IsComputation<MT>::value && !MT::vectorizable &&
1606 IsSame<VET,MET>::value && IsBlasCompatible<VET>::value };
// evaluateVector is non-zero when VT is a computation.
1611 enum { evaluateVector = IsComputation<VT>::value };
// Compile-time predicate for the single-precision BLAS kernel of the scaled
// expression. 'value' is non-zero only when T1, T2 and T3 all report
// 'vectorizable', the ElementType of each satisfies IsFloat, and T4 is not a
// complex type. The kernels below instantiate T4 with the scalar type ST2.
1619 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1620 struct UseSinglePrecisionKernel {
1621 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1622 IsFloat<typename T1::ElementType>::value &&
1623 IsFloat<typename T2::ElementType>::value &&
1624 IsFloat<typename T3::ElementType>::value &&
1625 !IsComplex<T4>::value };
// Compile-time predicate for the double-precision BLAS kernel of the scaled
// expression. 'value' is non-zero only when T1, T2 and T3 all report
// 'vectorizable', the ElementType of each satisfies IsDouble, and T4 is not a
// complex type. The kernels below instantiate T4 with the scalar type ST2.
1634 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1635 struct UseDoublePrecisionKernel {
1636 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1637 IsDouble<typename T1::ElementType>::value &&
1638 IsDouble<typename T2::ElementType>::value &&
1639 IsDouble<typename T3::ElementType>::value &&
1640 !IsComplex<T4>::value };
// Compile-time predicate for the single-precision complex BLAS kernel of the
// scaled expression. 'value' is non-zero only when T1, T2 and T3 all report
// 'vectorizable' and the ElementType of each is exactly complex<float>.
// Unlike the real-valued predicates it takes no scalar type parameter.
1649 template<
typename T1,
typename T2,
typename T3 >
1650 struct UseSinglePrecisionComplexKernel {
1651 typedef complex<float> Type;
1652 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1653 IsSame<typename T1::ElementType,Type>::value &&
1654 IsSame<typename T2::ElementType,Type>::value &&
1655 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the double-precision complex BLAS kernel of the
// scaled expression. 'value' is non-zero only when T1, T2 and T3 all report
// 'vectorizable' and the ElementType of each is exactly complex<double>.
// Unlike the real-valued predicates it takes no scalar type parameter.
1664 template<
typename T1,
typename T2,
typename T3 >
1665 struct UseDoublePrecisionComplexKernel {
1666 typedef complex<double> Type;
1667 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1668 IsSame<typename T1::ElementType,Type>::value &&
1669 IsSame<typename T2::ElementType,Type>::value &&
1670 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate for the non-BLAS fallback of the scaled expression.
// 'value' is non-zero when BLAZE_BLAS_MODE evaluates to false, or when none of
// the four BLAS predicates holds. The real-valued predicates are queried with
// <T1,T2,T3,T4>, the complex ones with <T1,T2,T3>.
1678 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1679 struct UseDefaultKernel {
1680 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1681 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1682 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1683 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate choosing between the intrinsic-based and the plain
// default kernels of the scaled expression.
// 'value' is non-zero only when T1, T2 and T3 all report 'vectorizable', all
// three share the same ElementType, T4 is that same type as well, and
// IntrinsicTrait of that element type reports 'addition' and 'multiplication'.
1692 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1693 struct UseVectorizedDefaultKernel {
1694 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1695 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1696 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1697 IsSame<typename T1::ElementType,T4>::value &&
1698 IntrinsicTrait<typename T1::ElementType>::addition &&
1699 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Member type aliases of the scaled expression.
// NOTE(review): RES, ElementType, MRT, MCT, VRT and VCT are declared on lines not
// shown in this excerpt.
// This: the type of this expression (MVM scaled by ST).
1705 typedef DVecScalarMultExpr<MVM,ST,false>
This;
// ResultType: MultTrait of RES and the scalar type ST.
1706 typedef typename MultTrait<RES,ST>::Type
ResultType;
// IntrinsicType: the packed type IntrinsicTrait associates with ElementType.
1709 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// LeftOperand: the wrapped matrix/vector multiplication expression (const).
1714 typedef const TDMatDVecMultExpr<MT,VT>
LeftOperand;
// LT: 'const MRT' when evaluateMatrix is set, otherwise MCT.
1720 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
LT;
// RT: 'const VRT' when evaluateVector is set, otherwise VCT.
1723 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
RT;
// Compile-time flags of the scaled expression: both 'vectorizable' and
// 'smpAssignable' are fixed to 0.
1728 enum { vectorizable = 0 };
1731 enum { smpAssignable = 0 };
// Explicit constructor taking the wrapped multiplication expression 'vector'
// (by const reference) and the scaling factor 'scalar' (by value).
// NOTE(review): the initializer list and body are not visible in this excerpt.
1740 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
// Returns the element of the wrapped expression at 'index' multiplied by the
// stored scalar.
// NOTE(review): the enclosing signature is not visible here (presumably the
// subscript operator -- confirm).
1754 return vector_[index] * scalar_;
// Returns the size reported by the wrapped multiplication expression vector_.
1763 inline size_t size()
const {
1764 return vector_.size();
// Alias query: forwards to vector_.canAlias( alias ) and returns its result.
//   alias : address to be checked, of arbitrary pointee type T.
1794 template<
typename T >
1795 inline bool canAlias(
const T* alias )
const {
1796 return vector_.canAlias( alias );
// Alias query: forwards to vector_.isAliased( alias ) and returns its result.
//   alias : address to be checked, of arbitrary pointee type T.
1806 template<
typename T >
1807 inline bool isAliased(
const T* alias )
const {
1808 return vector_.isAliased( alias );
// Assignment of the scaled expression to a dense vector: lhs = (A * x) * scalar.
//   lhs : target dense vector, rhs : the scaled multiplication expression.
// NOTE(review): several lines of the body are not visible in this excerpt,
// including the handling of the empty cases, the definitions of A and x, and the
// second half of the kernel-selection condition.
1830 template<
typename VT1 >
1831 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the matrix and vector operands of the wrapped multiplication.
1837 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
1838 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
// The empty-rows and empty-columns cases are tested separately.
1840 if( left.rows() == 0UL ) {
1843 else if( left.columns() == 0UL ) {
// Dispatch to the default or the BLAS kernel, passing the scalar along.
1856 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1858 DVecScalarMultExpr::selectDefaultAssignKernel( ~lhs, A, x, rhs.scalar_ );
1860 DVecScalarMultExpr::selectBlasAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Element-wise default kernel for y = (A * x) * scalar, used when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> does NOT hold (DisableIf).
//   y : target vector, A : matrix operand, x : vector operand, scalar : factor.
1878 template<
typename VT1
1882 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1883 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1885 const size_t M( A.rows() );
1886 const size_t N( A.columns() );
// iend is M with its lowest bit cleared, i.e. the largest even value <= M.
1889 const size_t iend( M &
size_t(-2) );
// First column initialises every y[i]; this reads x[0] and column 0 of A, so it
// relies on A having at least one column.
1891 for(
size_t i=0UL; i<M; ++i ) {
1892 y[i] = x[0UL] * A(i,0UL);
// Remaining columns are accumulated column by column, two rows per iteration.
1894 for(
size_t j=1UL; j<N; ++j ) {
1895 for(
size_t i=0UL; i<iend; i+=2UL ) {
1896 y[i ] += x[j] * A(i ,j);
1897 y[i+1UL] += x[j] * A(i+1UL,j);
// Handles the single leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown -- confirm.
1900 y[iend] += x[j] * A(iend,j);
// Final pass over all rows.
// NOTE(review): the loop body is not visible; presumably it applies 'scalar' to
// each y[i] -- confirm.
1903 for(
size_t i=0UL; i<M; ++i ) {
// Intrinsic-based default kernel for y = (A * x) * scalar, enabled when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds.
// Rows are processed in blocks of 8, 4, 3 and 2 packed vectors of IT::size
// elements, followed by a final loop handling one packed vector at a time. For
// every block the products A.load(...) * x1 are accumulated over all N columns;
// each accumulator is multiplied by 'factor' when it is stored into y.
// NOTE(review): the declaration of 'i' and the definitions of 'x1' and 'factor'
// are on lines not shown here; 'factor' is presumably the packed form of
// 'scalar' -- confirm.
1923 template<
typename VT1
1927 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
1928 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
1930 typedef IntrinsicTrait<ElementType> IT;
1932 const size_t M( A.rows() );
1933 const size_t N( A.columns() );
// Blocks of eight packed vectors.
1939 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1940 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1941 for(
size_t j=0UL; j<N; ++j ) {
1943 xmm1 = xmm1 + A.load(i ,j) * x1;
1944 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
1945 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
1946 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
1947 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
1948 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
1949 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
1950 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
1952 y.store( i , xmm1*factor );
1953 y.store( i+IT::size , xmm2*factor );
1954 y.store( i+IT::size*2UL, xmm3*factor );
1955 y.store( i+IT::size*3UL, xmm4*factor );
1956 y.store( i+IT::size*4UL, xmm5*factor );
1957 y.store( i+IT::size*5UL, xmm6*factor );
1958 y.store( i+IT::size*6UL, xmm7*factor );
1959 y.store( i+IT::size*7UL, xmm8*factor );
// Blocks of four packed vectors.
1961 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1963 for(
size_t j=0UL; j<N; ++j ) {
1965 xmm1 = xmm1 + A.load(i ,j) * x1;
1966 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
1967 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
1968 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
1970 y.store( i , xmm1*factor );
1971 y.store( i+IT::size , xmm2*factor );
1972 y.store( i+IT::size*2UL, xmm3*factor );
1973 y.store( i+IT::size*3UL, xmm4*factor );
// Blocks of three packed vectors.
1975 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
1977 for(
size_t j=0UL; j<N; ++j ) {
1979 xmm1 = xmm1 + A.load(i ,j) * x1;
1980 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
1981 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
1983 y.store( i , xmm1*factor );
1984 y.store( i+IT::size , xmm2*factor );
1985 y.store( i+IT::size*2UL, xmm3*factor );
// Blocks of two packed vectors.
1987 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1989 for(
size_t j=0UL; j<N; ++j ) {
1991 xmm1 = xmm1 + A.load(i ,j) * x1;
1992 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
1994 y.store( i , xmm1*factor );
1995 y.store( i+IT::size, xmm2*factor );
// Remaining rows, one packed vector at a time.
1999 for(
size_t j=0UL; j<N; ++j ) {
2001 xmm1 = xmm1 + A.load(i,j) * x1;
2003 y.store( i, xmm1*factor );
// Fallback overload of the BLAS entry point for y = (A * x) * scalar: when
// UseDefaultKernel<VT1,MT1,VT2,ST2> holds, the call is forwarded unchanged to
// selectDefaultAssignKernel( y, A, x, scalar ).
2022 template<
typename VT1
2026 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2027 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2029 selectDefaultAssignKernel( y, A, x, scalar );
// Single-precision BLAS kernel for y = (A * x) * scalar, enabled when
// UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> holds.
// Calls cblas_sgemv with column-major storage, no transposition, 'scalar' as
// alpha and beta = 0.0F, so the scaling is folded into the BLAS call and the
// previous content of y does not enter the result.
2048 template<
typename VT1
2052 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2053 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2055 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
2061 const int M ( numeric_cast<int>( A.rows() ) );
2062 const int N ( numeric_cast<int>( A.columns() ) );
2063 const int lda( numeric_cast<int>( A.spacing() ) );
2065 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2066 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for y = (A * x) * scalar, enabled when
// UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> holds.
// Calls cblas_dgemv with column-major storage, no transposition, 'scalar' as
// alpha and beta = 0.0, so the scaling is folded into the BLAS call and the
// previous content of y does not enter the result.
2086 template<
typename VT1
2090 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2091 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2093 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
2099 const int M ( numeric_cast<int>( A.rows() ) );
2100 const int N ( numeric_cast<int>( A.columns() ) );
2101 const int lda( numeric_cast<int>( A.spacing() ) );
2103 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2104 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Single-precision complex BLAS kernel for y = (A * x) * scalar, enabled when
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar type does not
// take part in the condition).
// Calls cblas_cgemv with column-major storage and no transposition.
2124 template<
typename VT1
2128 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2129 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2131 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
2140 const int M ( numeric_cast<int>( A.rows() ) );
2141 const int N ( numeric_cast<int>( A.columns() ) );
2142 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha is constructed from 'scalar'; beta = (0,0), i.e. y is overwritten.
2143 const complex<float> alpha( scalar );
2144 const complex<float> beta ( 0.0F, 0.0F );
2146 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2147 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS kernel for y = (A * x) * scalar, enabled when
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar type does not
// take part in the condition).
// Calls cblas_zgemv with column-major storage and no transposition.
2167 template<
typename VT1
2171 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2172 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2174 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
2183 const int M ( numeric_cast<int>( A.rows() ) );
2184 const int N ( numeric_cast<int>( A.columns() ) );
2185 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha is constructed from 'scalar'; beta = (0,0), i.e. y is overwritten.
2186 const complex<double> alpha( scalar );
2187 const complex<double> beta ( 0.0, 0.0 );
2189 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2190 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Overload of assign() for a sparse target vector: lhs is a
// SparseVector<VT1,false>, rhs the scaled multiplication expression.
// NOTE(review): the body of this overload is not visible in this excerpt.
2207 template<
typename VT1 >
2208 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Addition assignment of the scaled expression to a dense vector:
// lhs += (A * x) * scalar.
//   lhs : target dense vector, rhs : the scaled multiplication expression.
// NOTE(review): several lines of the body are not visible in this excerpt,
// including the handling of the empty case, the definitions of A and x, and the
// second half of the kernel-selection condition.
2235 template<
typename VT1 >
2236 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the matrix and vector operands of the wrapped multiplication.
2242 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2243 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
// A matrix with zero rows or zero columns is handled as one special case.
2245 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Dispatch to the default or the BLAS kernel, passing the scalar along.
2257 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2259 DVecScalarMultExpr::selectDefaultAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
2261 DVecScalarMultExpr::selectBlasAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Default kernel for y += (A * x) * scalar, used when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> does NOT hold (DisableIf).
// It builds the expression A * x * scalar and hands it to y.addAssign() instead
// of looping over the elements itself.
2279 template<
typename VT1
2283 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2284 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2286 y.addAssign( A * x * scalar );
// Intrinsic-based default kernel for y += (A * x) * scalar, enabled when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds.
// Rows are processed in blocks of 8, 4, 3 and 2 packed vectors of IT::size
// elements, followed by a final loop handling one packed vector at a time. For
// every block the products A.load(...) * x1 are accumulated over all N columns;
// the result written back is y.load(...) + accumulator * factor.
// NOTE(review): the declaration of 'i' and the definitions of 'x1' and 'factor'
// are on lines not shown here; 'factor' is presumably the packed form of
// 'scalar' -- confirm.
2304 template<
typename VT1
2308 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2309 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2311 typedef IntrinsicTrait<ElementType> IT;
2313 const size_t M( A.rows() );
2314 const size_t N( A.columns() );
// Blocks of eight packed vectors.
2320 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2321 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2322 for(
size_t j=0UL; j<N; ++j ) {
2324 xmm1 = xmm1 + A.load(i ,j) * x1;
2325 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2326 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2327 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2328 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
2329 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
2330 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
2331 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
2333 y.store( i , y.load(i ) + xmm1*factor );
2334 y.store( i+IT::size , y.load(i+IT::size ) + xmm2*factor );
2335 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
2336 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) + xmm4*factor );
2337 y.store( i+IT::size*4UL, y.load(i+IT::size*4UL) + xmm5*factor );
2338 y.store( i+IT::size*5UL, y.load(i+IT::size*5UL) + xmm6*factor );
2339 y.store( i+IT::size*6UL, y.load(i+IT::size*6UL) + xmm7*factor );
2340 y.store( i+IT::size*7UL, y.load(i+IT::size*7UL) + xmm8*factor );
// Blocks of four packed vectors.
2342 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2344 for(
size_t j=0UL; j<N; ++j ) {
2346 xmm1 = xmm1 + A.load(i ,j) * x1;
2347 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2348 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2349 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2351 y.store( i , y.load(i ) + xmm1*factor );
2352 y.store( i+IT::size , y.load(i+IT::size ) + xmm2*factor );
2353 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
2354 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) + xmm4*factor );
// Blocks of three packed vectors.
2356 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
2358 for(
size_t j=0UL; j<N; ++j ) {
2360 xmm1 = xmm1 + A.load(i ,j) * x1;
2361 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2362 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2364 y.store( i , y.load(i ) + xmm1*factor );
2365 y.store( i+IT::size , y.load(i+IT::size ) + xmm2*factor );
2366 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
// Blocks of two packed vectors.
2368 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2370 for(
size_t j=0UL; j<N; ++j ) {
2372 xmm1 = xmm1 + A.load(i ,j) * x1;
2373 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
2375 y.store( i , y.load(i ) + xmm1*factor );
2376 y.store( i+IT::size, y.load(i+IT::size) + xmm2*factor );
// Remaining rows, one packed vector at a time.
2380 for(
size_t j=0UL; j<N; ++j ) {
2381 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
2383 y.store( i, y.load(i) + xmm1*factor );
// Fallback overload of the BLAS entry point for y += (A * x) * scalar: when
// UseDefaultKernel<VT1,MT1,VT2,ST2> holds, the call is forwarded unchanged to
// selectDefaultAddAssignKernel( y, A, x, scalar ).
2402 template<
typename VT1
2406 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2407 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2409 selectDefaultAddAssignKernel( y, A, x, scalar );
// Single-precision BLAS kernel for y += (A * x) * scalar, enabled when
// UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> holds.
// Calls cblas_sgemv with column-major storage, no transposition, 'scalar' as
// alpha and beta = 1.0F, so the scaled product is added to the existing y.
2428 template<
typename VT1
2432 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2433 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2435 using boost::numeric_cast;
// The size_t dimensions are converted to int via boost::numeric_cast;
// A.spacing() is passed as the leading dimension.
2441 const int M ( numeric_cast<int>( A.rows() ) );
2442 const int N ( numeric_cast<int>( A.columns() ) );
2443 const int lda( numeric_cast<int>( A.spacing() ) );
2445 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2446 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS overload of the addition assignment kernel, enabled through EnableIf when
// UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// Performs y += scalar * A * x with a single call to cblas_dgemv.
2466 template<
typename VT1
2470 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2471 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2473 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2479 const int M ( numeric_cast<int>( A.rows() ) );
2480 const int N ( numeric_cast<int>( A.columns() ) );
2481 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = scalar and beta = 1.0, so the product is accumulated onto the current y.
// A is passed as column-major and not transposed; x and y use unit strides.
2483 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2484 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS overload of the addition assignment kernel, enabled through EnableIf when
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2> is true.
// Performs y += scalar * A * x with a single call to cblas_cgemv.
2504 template<
typename VT1
2508 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2509 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2511 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2520 const int M ( numeric_cast<int>( A.rows() ) );
2521 const int N ( numeric_cast<int>( A.columns() ) );
2522 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex routine takes alpha and beta by address, hence the named locals.
// beta = (1,0) keeps the current content of y in the result.
2523 const complex<float> alpha( scalar );
2524 const complex<float> beta ( 1.0F, 0.0F );
2526 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2527 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS overload of the addition assignment kernel, enabled through EnableIf when
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2> is true.
// Performs y += scalar * A * x with a single call to cblas_zgemv.
2547 template<
typename VT1
2551 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2552 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2554 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2563 const int M ( numeric_cast<int>( A.rows() ) );
2564 const int N ( numeric_cast<int>( A.columns() ) );
2565 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex routine takes alpha and beta by address, hence the named locals.
// beta = (1,0) keeps the current content of y in the result.
2566 const complex<double> alpha( scalar );
2567 const complex<double> beta ( 1.0, 0.0 );
2569 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2570 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled matrix/vector product (rhs) to the dense
// vector lhs. The kernels called below receive rhs.scalar_ as scaling factor.
2591 template<
typename VT1 >
2592 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Fetch the matrix and vector operands of the multiplication wrapped in rhs.vector_.
2598 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2599 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
// Special case for a matrix without rows or columns.
// NOTE(review): the body of this branch is not part of this listing -- presumably an early return.
2601 if( left.rows() == 0UL || left.columns() == 0UL ) {
// Choose between the default and the BLAS kernel.
// NOTE(review): the second half of this condition and the definitions of A and x
// are not part of this listing -- confirm against the full header.
2613 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2615 DVecScalarMultExpr::selectDefaultSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
2617 DVecScalarMultExpr::selectBlasSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Default subtraction assignment kernel, selected through DisableIf when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is false.
// Delegates the whole operation to y.subAssign() with the expression A * x * scalar.
2635 template<
typename VT1
2639 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2640 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2642 y.subAssign( A * x * scalar );
// Vectorized default subtraction assignment kernel, selected through EnableIf when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true.
//
// The rows of A are processed in blocks of 8, 4, 3, 2 and finally 1 intrinsic
// packets (IT::size rows each -- presumably the number of elements per packet).
// For each block the products A.load(row,j) * x1 are summed over all columns j,
// then the sums are multiplied by factor and subtracted from the matching part of y.
//
// NOTE(review): the declarations of i, factor and x1, the accumulator declarations
// of the smaller blocks, the header of the last row loop and all closing braces are
// not part of this listing. The last loop uses set( x[j] ) where the others use x1,
// which suggests x1 is x[j] broadcast into a packet; factor is presumably the
// broadcast scalar -- confirm against the full header.
2660 template<
typename VT1
2664 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2665 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2667 typedef IntrinsicTrait<ElementType> IT;
2669 const size_t M( A.rows() );
2670 const size_t N( A.columns() );
// Blocks of eight packets: runs while rows i .. i+IT::size*7 are still below M.
2676 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2677 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2678 for(
size_t j=0UL; j<N; ++j ) {
2680 xmm1 = xmm1 + A.load(i ,j) * x1;
2681 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2682 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2683 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2684 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
2685 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
2686 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
2687 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
// Subtract the scaled sums from the eight packets of y.
2689 y.store( i , y.load(i ) - xmm1*factor );
2690 y.store( i+IT::size , y.load(i+IT::size ) - xmm2*factor );
2691 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
2692 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) - xmm4*factor );
2693 y.store( i+IT::size*4UL, y.load(i+IT::size*4UL) - xmm5*factor );
2694 y.store( i+IT::size*5UL, y.load(i+IT::size*5UL) - xmm6*factor );
2695 y.store( i+IT::size*6UL, y.load(i+IT::size*6UL) - xmm7*factor );
2696 y.store( i+IT::size*7UL, y.load(i+IT::size*7UL) - xmm8*factor );
// Blocks of four packets for the rows left over by the loop above.
2698 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2700 for(
size_t j=0UL; j<N; ++j ) {
2702 xmm1 = xmm1 + A.load(i ,j) * x1;
2703 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2704 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2705 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2707 y.store( i , y.load(i ) - xmm1*factor );
2708 y.store( i+IT::size , y.load(i+IT::size ) - xmm2*factor );
2709 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
2710 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) - xmm4*factor );
// Blocks of three packets.
2712 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
2714 for(
size_t j=0UL; j<N; ++j ) {
2716 xmm1 = xmm1 + A.load(i ,j) * x1;
2717 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2718 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2720 y.store( i , y.load(i ) - xmm1*factor );
2721 y.store( i+IT::size , y.load(i+IT::size ) - xmm2*factor );
2722 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
// Blocks of two packets.
2724 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2726 for(
size_t j=0UL; j<N; ++j ) {
2728 xmm1 = xmm1 + A.load(i ,j) * x1;
2729 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
2731 y.store( i , y.load(i ) - xmm1*factor );
2732 y.store( i+IT::size, y.load(i+IT::size) - xmm2*factor );
// Single packet; here the element x[j] is broadcast inline via set().
2736 for(
size_t j=0UL; j<N; ++j ) {
2737 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
2739 y.store( i, y.load(i) - xmm1*factor );
// Fallback overload of selectBlasSubAssignKernel, enabled through EnableIf when
// UseDefaultKernel<VT1,MT1,VT2,ST2> is true. It makes no BLAS call and simply
// forwards y, A, x and scalar to selectDefaultSubAssignKernel.
// NOTE(review): the remaining template parameters (MT1, VT2, ST2) and the braces
// are not part of this listing -- confirm against the full header.
2758 template<
typename VT1
2762 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2763 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2765 selectDefaultSubAssignKernel( y, A, x, scalar );
// BLAS overload of the subtraction assignment kernel, enabled through EnableIf when
// UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// Performs y -= scalar * A * x with a single call to cblas_sgemv.
2784 template<
typename VT1
2788 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2789 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2791 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2797 const int M ( numeric_cast<int>( A.rows() ) );
2798 const int N ( numeric_cast<int>( A.columns() ) );
2799 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv only adds, so the subtraction is expressed as alpha = -scalar with beta = 1.0F.
2801 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
2802 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS overload of the subtraction assignment kernel, enabled through EnableIf when
// UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// Performs y -= scalar * A * x with a single call to cblas_dgemv.
2822 template<
typename VT1
2826 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2827 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2829 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2835 const int M ( numeric_cast<int>( A.rows() ) );
2836 const int N ( numeric_cast<int>( A.columns() ) );
2837 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv only adds, so the subtraction is expressed as alpha = -scalar with beta = 1.0.
2839 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
2840 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS overload of the subtraction assignment kernel, enabled through EnableIf when
// UseSinglePrecisionComplexKernel<VT1,MT1,VT2> is true.
// Performs y -= scalar * A * x with a single call to cblas_cgemv.
2860 template<
typename VT1
2864 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2865 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2867 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2876 const int M ( numeric_cast<int>( A.rows() ) );
2877 const int N ( numeric_cast<int>( A.columns() ) );
2878 const int lda( numeric_cast<int>( A.spacing() ) );
// The negated scalar as alpha turns the accumulating gemv into a subtraction;
// beta = (1,0) keeps the current content of y. Both are passed by address.
2879 const complex<float> alpha( -scalar );
2880 const complex<float> beta ( 1.0F, 0.0F );
2882 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2883 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS overload of the subtraction assignment kernel, enabled through EnableIf when
// UseDoublePrecisionComplexKernel<VT1,MT1,VT2> is true.
// Performs y -= scalar * A * x with a single call to cblas_zgemv.
2903 template<
typename VT1
2907 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2908 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2910 using boost::numeric_cast;
// CBLAS expects int dimensions; numeric_cast range-checks each conversion.
2919 const int M ( numeric_cast<int>( A.rows() ) );
2920 const int N ( numeric_cast<int>( A.columns() ) );
2921 const int lda( numeric_cast<int>( A.spacing() ) );
// The negated scalar as alpha turns the accumulating gemv into a subtraction;
// beta = (1,0) keeps the current content of y. Both are passed by address.
2922 const complex<double> alpha( -scalar );
2923 const complex<double> beta ( 1.0, 0.0 );
2925 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2926 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled matrix/vector product (rhs) to the dense
// vector lhs.
// NOTE(review): only the signature is part of this listing; the function body
// has to be checked in the full header.
2947 template<
typename VT1 >
2948 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Function template returning a TDMatDVecMultExpr<T1,T2>. DisableIf removes it
// from overload resolution when IsMatMatMultExpr<T1> is true.
// NOTE(review): the function name, its parameters and the size check guarding the
// exception are not part of this listing -- presumably the matrix/vector operator*.
3021 template<
typename T1
3023 inline const typename DisableIf< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >::Type
// Reports incompatible operand sizes to the caller.
3029 throw std::invalid_argument(
"Matrix and vector sizes do not match" );
// Trait (partial) specialization for template parameters MT and VT.
// Type is the result of MultExprTrait applied to the SubmatrixExprTrait type of
// const MT and to VT, i.e. the expression type of (submatrix of MT) * VT.
// NOTE(review): the struct header is not part of this listing -- presumably
// SubvectorExprTrait< TDMatDVecMultExpr<MT,VT> >; confirm against the full header.
3046 template<
typename MT,
typename VT >
3051 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT>::Type, VT >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4512
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B \cdot C$).
Definition: DMatDMatMultExpr.h:3703
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:242
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:109
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:236
TDMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:262
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: TransposeFlag.h:159
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:196
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2375
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:248
Header file for the DenseVector base class.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:229
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:355
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Expression object for transpose dense matrix-dense vector multiplications. The TDMatDVecMultExpr class...
Definition: Forward.h:122
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:250
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:239
Header file for the multiplication trait.
Header file for the IsDouble type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:356
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the IsMatMatMultExpr type trait class.
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:108
Header file for the IsBlasCompatible type trait.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:314
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
const size_t end_
End of the unrolled calculation loop.
Definition: TDMatDVecMultExpr.h:357
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:232
Constraints on the storage order of matrix types.
Constraint on the data type.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:233
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2373
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:269
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:245
Header file for the EnableIf class template.
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:304
Header file for the IsNumeric type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:348
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:110
Header file for the SubmatrixExprTrait class template.
System settings for the BLAS mode.
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:228
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDVecMultExpr.h:231
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:648
Header file for run time assertion macros.
const size_t TDMATDVECMULT_THRESHOLD
Column-major dense matrix/dense vector multiplication threshold. This setting specifies the threshold ...
Definition: Thresholds.h:68
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:336
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:239
Substitution Failure Is Not An Error (SFINAE) class. The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:111
Header file for all intrinsic functionality.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:230
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:277
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
TDMatDVecMultExpr< MT, VT > This
Type of this TDMatDVecMultExpr instance.
Definition: TDMatDVecMultExpr.h:227
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:247
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2370
size_t columns(const Matrix< MT, SO > &m)
Returns the current number of columns of the matrix.
Definition: Matrix.h:154
Header file for basic type definitions.
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:113
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Header file for the MatVecMultExpr base class.
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:324
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:112
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.