35 #ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
44 #include <boost/cast.hpp>
102 template<
typename MT
104 class TDMatDVecMultExpr :
public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
105 ,
private MatVecMultExpr
106 ,
private Computation
// Compile-time switch used by the SMP assignment overloads of this expression.
// `value` is the logical OR of the enclosing class's evaluateMatrix and
// evaluateVector flags; the template parameter T1 does not take part in the
// visible condition.
135 template<
typename T1 >
136 struct UseSMPAssign {
137 enum { value = ( evaluateMatrix || evaluateVector ) };
148 template<
typename T1,
typename T2,
typename T3 >
149 struct UseSinglePrecisionKernel {
150 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseDoublePrecisionKernel {
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseSinglePrecisionComplexKernel {
182 typedef complex<float> Type;
183 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseDoublePrecisionComplexKernel {
199 typedef complex<double> Type;
200 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Fallback selector for the kernel overloads.
// `value` is true when BLAZE_BLAS_MODE is switched off, or when none of the four
// BLAS predicates (single, double, complex<float>, complex<double>) holds for
// the type triple <T1,T2,T3>.
213 template<
typename T1,
typename T2,
typename T3 >
214 struct UseDefaultKernel {
215 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
216 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
217 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
218 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
229 template<
typename T1,
typename T2,
typename T3 >
230 struct UseVectorizedDefaultKernel {
231 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
265 enum { vectorizable = MT::vectorizable && VT::vectorizable &&
271 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
272 !evaluateVector && VT::smpAssignable };
301 if(
mat_.columns() != 0UL ) {
303 for(
size_t j=1UL; j<
end_; j+=2UL ) {
306 if( end_ <
mat_.columns() ) {
354 template<
typename T >
356 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
366 template<
typename T >
368 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
378 return mat_.isAligned() &&
vec_.isAligned();
415 template<
typename VT1 >
422 if( rhs.mat_.rows() == 0UL ) {
425 else if( rhs.mat_.columns() == 0UL ) {
438 TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
// Kernel dispatcher for the plain assignment y = A * x.
// Forwards (y, A, x) either to selectDefaultAssignKernel or to
// selectBlasAssignKernel.
// NOTE(review): the condition that chooses between the two calls is on lines
// that are not part of this excerpt.
454 template<
typename VT1
457 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
461 TDMatDVecMultExpr::selectDefaultAssignKernel( y, A, x );
463 TDMatDVecMultExpr::selectBlasAssignKernel( y, A, x );
// Non-vectorized default kernel for y = A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> is false).
//   y  target vector, indexed with operator[]
//   A  matrix operand, indexed with A(i,j)
//   x  vector operand, indexed with operator[]
482 template<
typename VT1
485 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
486 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
488 const size_t M( A.rows() );
489 const size_t N( A.columns() );
// iend clears the lowest bit of M, i.e. M rounded down to an even row count.
492 const size_t iend( M &
size_t(-2) );
// Column 0 initialises every element of y (plain assignment, no accumulation).
494 for(
size_t i=0UL; i<M; ++i ) {
495 y[i] = x[0UL] * A(i,0UL);
// Remaining columns are accumulated column by column, two rows per iteration.
497 for(
size_t j=1UL; j<N; ++j ) {
498 for(
size_t i=0UL; i<iend; i+=2UL ) {
499 y[i ] += x[j] * A(i ,j);
500 y[i+1UL] += x[j] * A(i+1UL,j);
// Update of the row at index iend, i.e. the leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown here.
503 y[iend] += x[j] * A(iend,j);
// Vectorized default kernel for y = A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> is true).
// Rows are processed in packets of IT::size elements; the row loop is unrolled
// in four stages of 8, 4, 3 and 2 packets, followed by a single-packet stage.
// NOTE(review): the declaration of the row index `i`, the definition of `x1`
// and the store of xmm1 are on lines that are not part of this excerpt.
524 template<
typename VT1
527 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
528 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
530 typedef IntrinsicTrait<ElementType> IT;
532 const size_t M( A.rows() );
533 const size_t N( A.columns() );
// Stage 1: eight packets per pass, while the start of the eighth packet is below M.
537 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
538 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
539 for(
size_t j=0UL; j<N; ++j ) {
541 xmm1 = xmm1 + A.load(i ,j) * x1;
542 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
543 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
544 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
545 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
546 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
547 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
548 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
// The accumulated packets overwrite the corresponding part of y.
551 y.store( i+IT::size , xmm2 );
552 y.store( i+IT::size*2UL, xmm3 );
553 y.store( i+IT::size*3UL, xmm4 );
554 y.store( i+IT::size*4UL, xmm5 );
555 y.store( i+IT::size*5UL, xmm6 );
556 y.store( i+IT::size*6UL, xmm7 );
557 y.store( i+IT::size*7UL, xmm8 );
// Stage 2: four packets per pass.
559 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
561 for(
size_t j=0UL; j<N; ++j ) {
563 xmm1 = xmm1 + A.load(i ,j) * x1;
564 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
565 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
566 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
569 y.store( i+IT::size , xmm2 );
570 y.store( i+IT::size*2UL, xmm3 );
571 y.store( i+IT::size*3UL, xmm4 );
// Stage 3: three packets per pass.
573 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
575 for(
size_t j=0UL; j<N; ++j ) {
577 xmm1 = xmm1 + A.load(i ,j) * x1;
578 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
579 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
582 y.store( i+IT::size , xmm2 );
583 y.store( i+IT::size*2UL, xmm3 );
// Stage 4: two packets per pass.
585 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
587 for(
size_t j=0UL; j<N; ++j ) {
589 xmm1 = xmm1 + A.load(i ,j) * x1;
590 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
593 y.store( i+IT::size, xmm2 );
// Final stage: a single packet, with x[j] broadcast through set().
// NOTE(review): the enclosing condition and the store are not shown here.
597 for(
size_t j=0UL; j<N; ++j ) {
598 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
// BLAS-dispatch overload taken when UseDefaultKernel<VT1,MT1,VT2> holds:
// no BLAS routine is used, the call is forwarded unchanged to
// selectDefaultAssignKernel.
620 template<
typename VT1
623 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
624 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
626 selectDefaultAssignKernel( y, A, x );
// Single-precision BLAS kernel for y = A * x (enabled by UseSinglePrecisionKernel).
// Row count, column count and A.spacing() (passed as lda) are converted to int
// with boost::numeric_cast. cblas_sgemv is then called column-major and
// untransposed with alpha = 1.0F, beta = 0.0F and unit strides for x and y,
// so per the gemv contract y is overwritten with the product.
646 template<
typename VT1
649 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
650 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
652 using boost::numeric_cast;
658 const int M ( numeric_cast<int>( A.rows() ) );
659 const int N ( numeric_cast<int>( A.columns() ) );
660 const int lda( numeric_cast<int>( A.spacing() ) );
662 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
663 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for y = A * x (enabled by UseDoublePrecisionKernel).
// Same call shape as the single-precision overload, but through cblas_dgemv
// with alpha = 1.0 and beta = 0.0; dimensions and lda are converted to int with
// boost::numeric_cast.
684 template<
typename VT1
687 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
688 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
690 using boost::numeric_cast;
696 const int M ( numeric_cast<int>( A.rows() ) );
697 const int N ( numeric_cast<int>( A.columns() ) );
698 const int lda( numeric_cast<int>( A.spacing() ) );
700 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
701 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float> BLAS kernel for y = A * x (enabled by
// UseSinglePrecisionComplexKernel).
// cblas_cgemv takes its scaling factors by address, so alpha = (1,0) and
// beta = (0,0) are materialised as local constants first.
722 template<
typename VT1
725 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
726 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
728 using boost::numeric_cast;
737 const int M ( numeric_cast<int>( A.rows() ) );
738 const int N ( numeric_cast<int>( A.columns() ) );
739 const int lda( numeric_cast<int>( A.spacing() ) );
740 const complex<float> alpha( 1.0F, 0.0F );
741 const complex<float> beta ( 0.0F, 0.0F );
743 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
744 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y = A * x (enabled by
// UseDoublePrecisionComplexKernel).
// Calls cblas_zgemv column-major and untransposed; alpha = (1,0) and
// beta = (0,0) are passed by address.
765 template<
typename VT1
768 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
769 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
771 using boost::numeric_cast;
780 const int M ( numeric_cast<int>( A.rows() ) );
781 const int N ( numeric_cast<int>( A.columns() ) );
782 const int lda( numeric_cast<int>( A.spacing() ) );
783 const complex<double> alpha( 1.0, 0.0 );
784 const complex<double> beta ( 0.0, 0.0 );
786 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
787 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
806 template<
typename VT1 >
836 template<
typename VT1 >
843 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
855 TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
// Kernel dispatcher for the addition assignment y += A * x.
// The visible part of the condition picks the default kernel when MT is a
// computation that is not evaluated up front
// (IsComputation<MT>::value && !evaluateMatrix); the other call goes to the
// BLAS dispatch.
// NOTE(review): the remainder of the condition and the branch keywords are on
// lines that are not part of this excerpt.
871 template<
typename VT1
874 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
876 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
878 TDMatDVecMultExpr::selectDefaultAddAssignKernel( y, A, x );
880 TDMatDVecMultExpr::selectBlasAddAssignKernel( y, A, x );
// Non-vectorized default kernel for y += A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> is false).
// Unlike the plain-assignment kernel, the column loop starts at j = 0 because
// y already holds the values to accumulate onto.
899 template<
typename VT1
902 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
903 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
905 const size_t M( A.rows() );
906 const size_t N( A.columns() );
// iend clears the lowest bit of M, i.e. M rounded down to an even row count.
909 const size_t iend( M &
size_t(-2) );
// Column-by-column accumulation, two rows per inner iteration.
911 for(
size_t j=0UL; j<N; ++j ) {
912 for(
size_t i=0UL; i<iend; i+=2UL ) {
913 y[i ] += x[j] * A(i ,j);
914 y[i+1UL] += x[j] * A(i+1UL,j);
// Update of the row at index iend, i.e. the leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown here.
917 y[iend] += x[j] * A(iend,j);
// Vectorized default kernel for y += A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> is true).
// Rows are processed in packets of IT::size elements, unrolled in stages of
// 8, 4, 3 and 2 packets, followed by a single-packet stage.
// NOTE(review): the declaration of `i`, the definition of `x1` and the
// initialisation of the xmm accumulators (presumably loaded from y, since the
// results are stored back without a further addition) are on lines that are
// not part of this excerpt.
938 template<
typename VT1
941 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
942 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
944 typedef IntrinsicTrait<ElementType> IT;
946 const size_t M( A.rows() );
947 const size_t N( A.columns() );
// Stage 1: eight packets per pass.
951 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
960 for(
size_t j=0UL; j<N; ++j ) {
962 xmm1 = xmm1 + A.load(i ,j) * x1;
963 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
964 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
965 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
966 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
967 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
968 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
969 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
972 y.store( i+IT::size , xmm2 );
973 y.store( i+IT::size*2UL, xmm3 );
974 y.store( i+IT::size*3UL, xmm4 );
975 y.store( i+IT::size*4UL, xmm5 );
976 y.store( i+IT::size*5UL, xmm6 );
977 y.store( i+IT::size*6UL, xmm7 );
978 y.store( i+IT::size*7UL, xmm8 );
// Stage 2: four packets per pass.
980 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
985 for(
size_t j=0UL; j<N; ++j ) {
987 xmm1 = xmm1 + A.load(i ,j) * x1;
988 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
989 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
990 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
993 y.store( i+IT::size , xmm2 );
994 y.store( i+IT::size*2UL, xmm3 );
995 y.store( i+IT::size*3UL, xmm4 );
// Stage 3: three packets per pass.
997 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
1001 for(
size_t j=0UL; j<N; ++j ) {
1003 xmm1 = xmm1 + A.load(i ,j) * x1;
1004 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
1005 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
1007 y.store( i , xmm1 );
1008 y.store( i+IT::size , xmm2 );
1009 y.store( i+IT::size*2UL, xmm3 );
// Stage 4: two packets per pass.
1011 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1014 for(
size_t j=0UL; j<N; ++j ) {
1016 xmm1 = xmm1 + A.load(i ,j) * x1;
1017 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
1019 y.store( i , xmm1 );
1020 y.store( i+IT::size, xmm2 );
// Final stage: a single packet, with x[j] broadcast through set().
1024 for(
size_t j=0UL; j<N; ++j ) {
1025 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
// BLAS-dispatch overload taken when UseDefaultKernel<VT1,MT1,VT2> holds:
// forwards unchanged to selectDefaultAddAssignKernel.
1047 template<
typename VT1
1050 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1051 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1053 selectDefaultAddAssignKernel( y, A, x );
// Single-precision BLAS kernel for y += A * x.
// cblas_sgemv is called with alpha = 1.0F and beta = 1.0F, so per the gemv
// contract the product is added to the existing content of y.
1073 template<
typename VT1
1076 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1077 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1079 using boost::numeric_cast;
1085 const int M ( numeric_cast<int>( A.rows() ) );
1086 const int N ( numeric_cast<int>( A.columns() ) );
1087 const int lda( numeric_cast<int>( A.spacing() ) );
1089 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
1090 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y += A * x.
// cblas_dgemv is called with alpha = 1.0 and beta = 1.0 (accumulate onto y).
1111 template<
typename VT1
1114 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1115 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1117 using boost::numeric_cast;
1123 const int M ( numeric_cast<int>( A.rows() ) );
1124 const int N ( numeric_cast<int>( A.columns() ) );
1125 const int lda( numeric_cast<int>( A.spacing() ) );
1127 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
1128 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float> BLAS kernel for y += A * x.
// cblas_cgemv receives alpha = (1,0) and beta = (1,0) by address, which
// accumulates the product onto y.
1149 template<
typename VT1
1152 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1153 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1155 using boost::numeric_cast;
1164 const int M ( numeric_cast<int>( A.rows() ) );
1165 const int N ( numeric_cast<int>( A.columns() ) );
1166 const int lda( numeric_cast<int>( A.spacing() ) );
1167 const complex<float> alpha( 1.0F, 0.0F );
1168 const complex<float> beta ( 1.0F, 0.0F );
1170 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1171 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y += A * x.
// cblas_zgemv receives alpha = (1,0) and beta = (1,0) by address, which
// accumulates the product onto y.
1192 template<
typename VT1
1195 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1196 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1198 using boost::numeric_cast;
1207 const int M ( numeric_cast<int>( A.rows() ) );
1208 const int N ( numeric_cast<int>( A.columns() ) );
1209 const int lda( numeric_cast<int>( A.spacing() ) );
1210 const complex<double> alpha( 1.0, 0.0 );
1211 const complex<double> beta ( 1.0, 0.0 );
1213 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1214 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1237 template<
typename VT1 >
1244 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1256 TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
// Kernel dispatcher for the subtraction assignment y -= A * x.
// The visible part of the condition picks the default kernel when MT is a
// computation that is not evaluated up front; the other call goes to the BLAS
// dispatch.
// NOTE(review): the remainder of the condition and the branch keywords are on
// lines that are not part of this excerpt.
1272 template<
typename VT1
1275 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1277 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1279 TDMatDVecMultExpr::selectDefaultSubAssignKernel( y, A, x );
1281 TDMatDVecMultExpr::selectBlasSubAssignKernel( y, A, x );
// Non-vectorized default kernel for y -= A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> is false).
// Mirrors the addition-assignment kernel with `-=` in place of `+=`.
1300 template<
typename VT1
1303 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1304 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1306 const size_t M( A.rows() );
1307 const size_t N( A.columns() );
// iend clears the lowest bit of M, i.e. M rounded down to an even row count.
1310 const size_t iend( M &
size_t(-2) );
// Column-by-column update, two rows per inner iteration.
1312 for(
size_t j=0UL; j<N; ++j ) {
1313 for(
size_t i=0UL; i<iend; i+=2UL ) {
1314 y[i ] -= x[j] * A(i ,j);
1315 y[i+1UL] -= x[j] * A(i+1UL,j);
// Update of the row at index iend, i.e. the leftover row when M is odd.
// NOTE(review): the guard for this statement is on a line not shown here.
1318 y[iend] -= x[j] * A(iend,j);
// Vectorized default kernel for y -= A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2> is true).
// Same staged unrolling (8, 4, 3, 2 packets, then one) as the addition
// kernel, with the products subtracted from the accumulators.
// NOTE(review): the declaration of `i`, the definition of `x1` and the
// initialisation of the xmm accumulators (presumably loaded from y) are on
// lines that are not part of this excerpt.
1339 template<
typename VT1
1342 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1343 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1345 typedef IntrinsicTrait<ElementType> IT;
1347 const size_t M( A.rows() );
1348 const size_t N( A.columns() );
// Stage 1: eight packets per pass.
1352 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
1361 for(
size_t j=0UL; j<N; ++j ) {
1363 xmm1 = xmm1 - A.load(i ,j) * x1;
1364 xmm2 = xmm2 - A.load(i+IT::size ,j) * x1;
1365 xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
1366 xmm4 = xmm4 - A.load(i+IT::size*3UL,j) * x1;
1367 xmm5 = xmm5 - A.load(i+IT::size*4UL,j) * x1;
1368 xmm6 = xmm6 - A.load(i+IT::size*5UL,j) * x1;
1369 xmm7 = xmm7 - A.load(i+IT::size*6UL,j) * x1;
1370 xmm8 = xmm8 - A.load(i+IT::size*7UL,j) * x1;
1372 y.store( i , xmm1 );
1373 y.store( i+IT::size , xmm2 );
1374 y.store( i+IT::size*2UL, xmm3 );
1375 y.store( i+IT::size*3UL, xmm4 );
1376 y.store( i+IT::size*4UL, xmm5 );
1377 y.store( i+IT::size*5UL, xmm6 );
1378 y.store( i+IT::size*6UL, xmm7 );
1379 y.store( i+IT::size*7UL, xmm8 );
// Stage 2: four packets per pass.
1381 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
1386 for(
size_t j=0UL; j<N; ++j ) {
1388 xmm1 = xmm1 - A.load(i ,j) * x1;
1389 xmm2 = xmm2 - A.load(i+IT::size ,j) * x1;
1390 xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
1391 xmm4 = xmm4 - A.load(i+IT::size*3UL,j) * x1;
1393 y.store( i , xmm1 );
1394 y.store( i+IT::size , xmm2 );
1395 y.store( i+IT::size*2UL, xmm3 );
1396 y.store( i+IT::size*3UL, xmm4 );
// Stage 3: three packets per pass.
1398 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
1402 for(
size_t j=0UL; j<N; ++j ) {
1404 xmm1 = xmm1 - A.load(i ,j) * x1;
1405 xmm2 = xmm2 - A.load(i+IT::size ,j) * x1;
1406 xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
1408 y.store( i , xmm1 );
1409 y.store( i+IT::size , xmm2 );
1410 y.store( i+IT::size*2UL, xmm3 );
// Stage 4: two packets per pass.
1412 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
1415 for(
size_t j=0UL; j<N; ++j ) {
1417 xmm1 = xmm1 - A.load(i ,j) * x1;
1418 xmm2 = xmm2 - A.load(i+IT::size,j) * x1;
1420 y.store( i , xmm1 );
1421 y.store( i+IT::size, xmm2 );
// Final stage: a single packet, with x[j] broadcast through set().
1425 for(
size_t j=0UL; j<N; ++j ) {
1426 xmm1 = xmm1 - A.load(i,j) *
set( x[j] );
// BLAS-dispatch overload taken when UseDefaultKernel<VT1,MT1,VT2> holds:
// forwards unchanged to selectDefaultSubAssignKernel.
1448 template<
typename VT1
1451 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1452 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1454 selectDefaultSubAssignKernel( y, A, x );
// Single-precision BLAS kernel for y -= A * x.
// cblas_sgemv is called with alpha = -1.0F and beta = 1.0F, so per the gemv
// contract the product is subtracted from the existing content of y.
1474 template<
typename VT1
1477 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1478 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1480 using boost::numeric_cast;
1486 const int M ( numeric_cast<int>( A.rows() ) );
1487 const int N ( numeric_cast<int>( A.columns() ) );
1488 const int lda( numeric_cast<int>( A.spacing() ) );
1490 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -1.0F,
1491 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS kernel for y -= A * x.
// cblas_dgemv is called with alpha = -1.0 and beta = 1.0.
1512 template<
typename VT1
1515 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1516 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1518 using boost::numeric_cast;
1524 const int M ( numeric_cast<int>( A.rows() ) );
1525 const int N ( numeric_cast<int>( A.columns() ) );
1526 const int lda( numeric_cast<int>( A.spacing() ) );
1528 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -1.0,
1529 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// complex<float> BLAS kernel for y -= A * x.
// cblas_cgemv receives alpha = (-1,0) and beta = (1,0) by address.
1550 template<
typename VT1
1553 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1554 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1556 using boost::numeric_cast;
1565 const int M ( numeric_cast<int>( A.rows() ) );
1566 const int N ( numeric_cast<int>( A.columns() ) );
1567 const int lda( numeric_cast<int>( A.spacing() ) );
1568 const complex<float> alpha( -1.0F, 0.0F );
1569 const complex<float> beta ( 1.0F, 0.0F );
1571 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1572 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y -= A * x.
// cblas_zgemv receives alpha = (-1,0) and beta = (1,0) by address.
1593 template<
typename VT1
1596 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1597 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1599 using boost::numeric_cast;
1608 const int M ( numeric_cast<int>( A.rows() ) );
1609 const int N ( numeric_cast<int>( A.columns() ) );
1610 const int lda( numeric_cast<int>( A.spacing() ) );
1611 const complex<double> alpha( -1.0, 0.0 );
1612 const complex<double> beta ( 1.0, 0.0 );
1614 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
1615 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1638 template<
typename VT1 >
1674 template<
typename VT1 >
1675 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1682 if( rhs.mat_.rows() == 0UL ) {
1685 else if( rhs.mat_.columns() == 0UL ) {
1718 template<
typename VT1 >
1719 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1751 template<
typename VT1 >
1752 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1759 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1795 template<
typename VT1 >
1796 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1803 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1839 template<
typename VT1 >
1840 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1890 template<
typename MT
1894 :
public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
1895 ,
private VecScalarMultExpr
1896 ,
private Computation
1900 typedef TDMatDVecMultExpr<MT,VT> MVM;
// evaluateMatrix: set when the matrix operand type MT requires evaluation, or
// when MT is a computation whose element type MET equals the vector element
// type VET and is BLAS compatible.
1912 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
1913 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// evaluateVector: set when the vector operand type VT is a computation or
// requires evaluation.
1918 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Compile-time switch used by the SMP assignment overloads of the scaled
// expression. `value` is the logical OR of evaluateMatrix and evaluateVector;
// the template parameter T1 does not take part in the condition.
1926 template<
typename T1 >
1927 struct UseSMPAssign {
1928 enum { value = ( evaluateMatrix || evaluateVector ) };
// Predicate for the single-precision BLAS kernel of the scaled product.
// True when T1, T2 and T3 are all vectorizable, all three element types
// satisfy IsFloat, and the scalar type T4 is not complex.
1937 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1938 struct UseSinglePrecisionKernel {
1939 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1940 IsFloat<typename T1::ElementType>::value &&
1941 IsFloat<typename T2::ElementType>::value &&
1942 IsFloat<typename T3::ElementType>::value &&
1943 !IsComplex<T4>::value };
// Predicate for the double-precision BLAS kernel of the scaled product.
// True when T1, T2 and T3 are all vectorizable, all three element types
// satisfy IsDouble, and the scalar type T4 is not complex.
1952 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1953 struct UseDoublePrecisionKernel {
1954 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1955 IsDouble<typename T1::ElementType>::value &&
1956 IsDouble<typename T2::ElementType>::value &&
1957 IsDouble<typename T3::ElementType>::value &&
1958 !IsComplex<T4>::value };
// Predicate for the complex<float> BLAS kernel of the scaled product.
// True when T1, T2 and T3 are all vectorizable and each element type is
// exactly complex<float>. The scalar type is not part of this predicate.
1967 template<
typename T1,
typename T2,
typename T3 >
1968 struct UseSinglePrecisionComplexKernel {
1969 typedef complex<float> Type;
1970 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1971 IsSame<typename T1::ElementType,Type>::value &&
1972 IsSame<typename T2::ElementType,Type>::value &&
1973 IsSame<typename T3::ElementType,Type>::value };
// Predicate for the complex<double> BLAS kernel of the scaled product.
// True when T1, T2 and T3 are all vectorizable and each element type is
// exactly complex<double>. The scalar type is not part of this predicate.
1982 template<
typename T1,
typename T2,
typename T3 >
1983 struct UseDoublePrecisionComplexKernel {
1984 typedef complex<double> Type;
1985 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1986 IsSame<typename T1::ElementType,Type>::value &&
1987 IsSame<typename T2::ElementType,Type>::value &&
1988 IsSame<typename T3::ElementType,Type>::value };
// Fallback selector for the scaled product.
// `value` is true when BLAZE_BLAS_MODE is switched off, or when none of the
// four BLAS predicates applies. The real-valued predicates also see the scalar
// type T4; the complex ones only see <T1,T2,T3>.
1996 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1997 struct UseDefaultKernel {
1998 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1999 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
2000 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
2001 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Predicate for the vectorized default kernel of the scaled product.
// True when T1, T2 and T3 are all vectorizable, T1's element type is identical
// to the element types of T2 and T3 and to the scalar type T4, and
// IntrinsicTrait reports both addition and multiplication for that element type.
2010 template<
typename T1,
typename T2,
typename T3,
typename T4 >
2011 struct UseVectorizedDefaultKernel {
2012 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
2013 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
2014 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
2015 IsSame<typename T1::ElementType,T4>::value &&
2016 IntrinsicTrait<typename T1::ElementType>::addition &&
2017 IntrinsicTrait<typename T1::ElementType>::multiplication };
2023 typedef DVecScalarMultExpr<MVM,ST,false>
This;
2024 typedef typename MultTrait<RES,ST>::Type
ResultType;
2027 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
2032 typedef const TDMatDVecMultExpr<MT,VT>
LeftOperand;
2038 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
LT;
2041 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
RT;
// vectorizable: both operands are vectorizable, the matrix element type MET
// equals both the vector element type VET and the scalar type ST, and
// IntrinsicTrait<MET> reports addition and multiplication.
2046 enum { vectorizable = MT::vectorizable && VT::vectorizable &&
2047 IsSame<MET,VET>::value &&
2048 IsSame<MET,ST>::value &&
2049 IntrinsicTrait<MET>::addition &&
2050 IntrinsicTrait<MET>::multiplication };
// smpAssignable: neither operand needs prior evaluation and both operand types
// are themselves SMP-assignable.
2053 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
2054 !evaluateVector && VT::smpAssignable };
2063 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
2077 return vector_[index] * scalar_;
// Returns the size reported by the wrapped product expression vector_.
2086 inline size_t size()
const {
2087 return vector_.size();
// Alias query: delegates to vector_.canAlias() with the given address.
//   alias  address to check against the wrapped expression
2117 template<
typename T >
2118 inline bool canAlias(
const T* alias )
const {
2119 return vector_.canAlias( alias );
// Alias query: delegates to vector_.isAliased() with the given address.
//   alias  address to check against the wrapped expression
2129 template<
typename T >
2130 inline bool isAliased(
const T* alias )
const {
2131 return vector_.isAliased( alias );
2141 return vector_.isAligned();
2151 typename MVM::LeftOperand A( vector_.leftOperand() );
2153 ( IsComputation<MT>::value && !evaluateMatrix ) ||
// Assignment of the scaled product to a dense vector: lhs = scalar * (A * x).
// Fetches the matrix and vector operands of the wrapped product, special-cases
// a matrix with zero rows or zero columns, and otherwise dispatches to
// selectAssignKernel together with the scalar.
// NOTE(review): the bodies of the two empty-operand branches and the
// definitions of `A` and `x` are on lines that are not part of this excerpt.
2177 template<
typename VT1 >
2178 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2184 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2185 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2187 if( left.rows() == 0UL ) {
2190 else if( left.columns() == 0UL ) {
2203 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for y = scalar * A * x.
// The visible part of the condition picks the default kernel when MT is a
// computation that is not evaluated up front; the other call goes to the BLAS
// dispatch.
// NOTE(review): the remainder of the condition and the branch keywords are on
// lines that are not part of this excerpt.
2218 template<
typename VT1
2222 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2224 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2226 DVecScalarMultExpr::selectDefaultAssignKernel( y, A, x, scalar );
2228 DVecScalarMultExpr::selectBlasAssignKernel( y, A, x, scalar );
// Non-vectorized default kernel for y = scalar * A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is false).
// The unscaled product is built first, exactly as in the unscaled kernel; a
// final pass over all M elements follows.
2246 template<
typename VT1
2250 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2251 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2253 const size_t M( A.rows() );
2254 const size_t N( A.columns() );
// iend clears the lowest bit of M, i.e. M rounded down to an even row count.
2257 const size_t iend( M &
size_t(-2) );
// Column 0 initialises every element of y.
2259 for(
size_t i=0UL; i<M; ++i ) {
2260 y[i] = x[0UL] * A(i,0UL);
// Remaining columns are accumulated, two rows per inner iteration.
2262 for(
size_t j=1UL; j<N; ++j ) {
2263 for(
size_t i=0UL; i<iend; i+=2UL ) {
2264 y[i ] += x[j] * A(i ,j);
2265 y[i+1UL] += x[j] * A(i+1UL,j);
// Update of the row at index iend, i.e. the leftover row when M is odd.
2268 y[iend] += x[j] * A(iend,j);
// Final pass over all elements of y.
// NOTE(review): the loop body is not shown; presumably it applies `scalar`.
2271 for(
size_t i=0UL; i<M; ++i ) {
// Vectorized default kernel for y = scalar * A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true).
// Rows are processed in packets of IT::size elements, unrolled in stages of
// 8, 4, 3 and 2 packets plus a single-packet stage. Each accumulated packet is
// multiplied by `factor` when it is stored.
// NOTE(review): the declaration of `i` and the definitions of `x1` and
// `factor` (presumably the broadcast of `scalar`) are on lines that are not
// part of this excerpt.
2291 template<
typename VT1
2295 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2296 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2298 typedef IntrinsicTrait<ElementType> IT;
2300 const size_t M( A.rows() );
2301 const size_t N( A.columns() );
// Stage 1: eight packets per pass.
2307 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2308 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2309 for(
size_t j=0UL; j<N; ++j ) {
2311 xmm1 = xmm1 + A.load(i ,j) * x1;
2312 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2313 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2314 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2315 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
2316 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
2317 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
2318 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
// Scaling is applied once per packet at store time, not inside the column loop.
2320 y.store( i , xmm1*factor );
2321 y.store( i+IT::size , xmm2*factor );
2322 y.store( i+IT::size*2UL, xmm3*factor );
2323 y.store( i+IT::size*3UL, xmm4*factor );
2324 y.store( i+IT::size*4UL, xmm5*factor );
2325 y.store( i+IT::size*5UL, xmm6*factor );
2326 y.store( i+IT::size*6UL, xmm7*factor );
2327 y.store( i+IT::size*7UL, xmm8*factor );
// Stage 2: four packets per pass.
2329 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2331 for(
size_t j=0UL; j<N; ++j ) {
2333 xmm1 = xmm1 + A.load(i ,j) * x1;
2334 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2335 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2336 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2338 y.store( i , xmm1*factor );
2339 y.store( i+IT::size , xmm2*factor );
2340 y.store( i+IT::size*2UL, xmm3*factor );
2341 y.store( i+IT::size*3UL, xmm4*factor );
// Stage 3: three packets per pass.
2343 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
2345 for(
size_t j=0UL; j<N; ++j ) {
2347 xmm1 = xmm1 + A.load(i ,j) * x1;
2348 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2349 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2351 y.store( i , xmm1*factor );
2352 y.store( i+IT::size , xmm2*factor );
2353 y.store( i+IT::size*2UL, xmm3*factor );
// Stage 4: two packets per pass.
2355 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2357 for(
size_t j=0UL; j<N; ++j ) {
2359 xmm1 = xmm1 + A.load(i ,j) * x1;
2360 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
2362 y.store( i , xmm1*factor );
2363 y.store( i+IT::size, xmm2*factor );
// Final stage: a single packet.
2367 for(
size_t j=0UL; j<N; ++j ) {
2369 xmm1 = xmm1 + A.load(i,j) * x1;
2371 y.store( i, xmm1*factor );
// BLAS-dispatch overload taken when UseDefaultKernel<VT1,MT1,VT2,ST2> holds:
// forwards unchanged, including the scalar, to selectDefaultAssignKernel.
2390 template<
typename VT1
2394 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2395 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2397 selectDefaultAssignKernel( y, A, x, scalar );
// Single-precision BLAS kernel for y = scalar * A * x.
// The scalar is handed to cblas_sgemv as alpha; beta = 0.0F, so per the gemv
// contract y is overwritten. Dimensions and lda (A.spacing()) are converted to
// int with boost::numeric_cast.
2416 template<
typename VT1
2420 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2421 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2423 using boost::numeric_cast;
2429 const int M ( numeric_cast<int>( A.rows() ) );
2430 const int N ( numeric_cast<int>( A.columns() ) );
2431 const int lda( numeric_cast<int>( A.spacing() ) );
2433 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2434 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Double-precision BLAS kernel for y = scalar * A * x.
// The scalar is handed to cblas_dgemv as alpha; beta = 0.0.
2454 template<
typename VT1
2458 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2459 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2461 using boost::numeric_cast;
2467 const int M ( numeric_cast<int>( A.rows() ) );
2468 const int N ( numeric_cast<int>( A.columns() ) );
2469 const int lda( numeric_cast<int>( A.spacing() ) );
2471 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2472 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// complex<float> BLAS kernel for y = scalar * A * x.
// The scalar is converted to a complex<float> alpha; alpha and beta = (0,0)
// are passed to cblas_cgemv by address.
2492 template<
typename VT1
2496 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2497 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2499 using boost::numeric_cast;
2508 const int M ( numeric_cast<int>( A.rows() ) );
2509 const int N ( numeric_cast<int>( A.columns() ) );
2510 const int lda( numeric_cast<int>( A.spacing() ) );
2511 const complex<float> alpha( scalar );
2512 const complex<float> beta ( 0.0F, 0.0F );
2514 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2515 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// complex<double> BLAS kernel for y = scalar * A * x.
// The scalar is converted to a complex<double> alpha; alpha and beta = (0,0)
// are passed to cblas_zgemv by address.
2535 template<
typename VT1
2539 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2540 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2542 using boost::numeric_cast;
2551 const int M ( numeric_cast<int>( A.rows() ) );
2552 const int N ( numeric_cast<int>( A.columns() ) );
2553 const int lda( numeric_cast<int>( A.spacing() ) );
2554 const complex<double> alpha( scalar );
2555 const complex<double> beta ( 0.0, 0.0 );
2557 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2558 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2575 template<
typename VT1 >
2576 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Addition assignment of the scaled product to a dense vector:
// lhs += scalar * (A * x).
// Fetches the operands of the wrapped product, special-cases a matrix with no
// rows or no columns, and otherwise dispatches to selectAddAssignKernel.
// NOTE(review): the body of the empty-operand branch and the definitions of
// `A` and `x` are on lines that are not part of this excerpt.
2603 template<
typename VT1 >
2604 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2610 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2611 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2613 if( left.rows() == 0UL || left.columns() == 0UL ) {
2625 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel dispatcher for y += scalar * A * x.
// The visible part of the condition picks the default kernel when MT is a
// computation that is not evaluated up front; the other call goes to the BLAS
// dispatch.
// NOTE(review): the remainder of the condition and the branch keywords are on
// lines that are not part of this excerpt.
2640 template<
typename VT1
2644 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2646 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2648 DVecScalarMultExpr::selectDefaultAddAssignKernel( y, A, x, scalar );
2650 DVecScalarMultExpr::selectBlasAddAssignKernel( y, A, x, scalar );
// Non-vectorized default kernel for y += scalar * A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is false).
// No hand-written loop here: the expression A * x * scalar is rebuilt and
// passed to y.addAssign().
2668 template<
typename VT1
2672 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2673 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2675 y.addAssign( A * x * scalar );
// Vectorized default kernel for y += scalar * A * x (picked when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true).
// Rows are processed in packets of IT::size elements, unrolled in stages of
// 8, 4, 3 and 2 packets plus a single-packet stage. The unscaled product is
// accumulated in the xmm registers; at store time each packet is scaled by
// `factor` and added to the packet currently held in y.
// NOTE(review): the declaration of `i` and the definitions of `x1` and
// `factor` (presumably the broadcast of `scalar`) are on lines that are not
// part of this excerpt.
2693 template<
typename VT1
2697 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2698 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2700 typedef IntrinsicTrait<ElementType> IT;
2702 const size_t M( A.rows() );
2703 const size_t N( A.columns() );
// Stage 1: eight packets per pass.
2709 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
2710 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2711 for(
size_t j=0UL; j<N; ++j ) {
2713 xmm1 = xmm1 + A.load(i ,j) * x1;
2714 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2715 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2716 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2717 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
2718 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
2719 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
2720 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
// Read-modify-write: y packet + scaled product packet.
2722 y.store( i , y.load(i ) + xmm1*factor );
2723 y.store( i+IT::size , y.load(i+IT::size ) + xmm2*factor );
2724 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
2725 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) + xmm4*factor );
2726 y.store( i+IT::size*4UL, y.load(i+IT::size*4UL) + xmm5*factor );
2727 y.store( i+IT::size*5UL, y.load(i+IT::size*5UL) + xmm6*factor );
2728 y.store( i+IT::size*6UL, y.load(i+IT::size*6UL) + xmm7*factor );
2729 y.store( i+IT::size*7UL, y.load(i+IT::size*7UL) + xmm8*factor );
// Stage 2: four packets per pass.
2731 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
2733 for(
size_t j=0UL; j<N; ++j ) {
2735 xmm1 = xmm1 + A.load(i ,j) * x1;
2736 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2737 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2738 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
2740 y.store( i , y.load(i ) + xmm1*factor );
2741 y.store( i+IT::size , y.load(i+IT::size ) + xmm2*factor );
2742 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
2743 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) + xmm4*factor );
// Stage 3: three packets per pass.
2745 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
2747 for(
size_t j=0UL; j<N; ++j ) {
2749 xmm1 = xmm1 + A.load(i ,j) * x1;
2750 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
2751 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
2753 y.store( i , y.load(i ) + xmm1*factor );
2754 y.store( i+IT::size , y.load(i+IT::size ) + xmm2*factor );
2755 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
// Stage 4: two packets per pass.
2757 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
2759 for(
size_t j=0UL; j<N; ++j ) {
2761 xmm1 = xmm1 + A.load(i ,j) * x1;
2762 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
2764 y.store( i , y.load(i ) + xmm1*factor );
2765 y.store( i+IT::size, y.load(i+IT::size) + xmm2*factor );
// Final stage: a single packet, with x[j] broadcast through set().
2769 for(
size_t j=0UL; j<N; ++j ) {
2770 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
2772 y.store( i, y.load(i) + xmm1*factor );
// BLAS-dispatch overload taken when UseDefaultKernel<VT1,MT1,VT2,ST2> holds:
// forwards unchanged, including the scalar, to selectDefaultAddAssignKernel.
2791 template<
typename VT1
2795 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2796 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2798 selectDefaultAddAssignKernel( y, A, x, scalar );
// BLAS-path addition assignment kernel, single precision overload.
// EnableIf selects this overload when UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// The work is delegated to cblas_sgemv with alpha = scalar and beta = 1.0F, which
// accumulates the scaled product into y (y = scalar*A*x + y).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
2817 template<
typename VT1
2821 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2822 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2824 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
2830 const int M ( numeric_cast<int>( A.rows() ) );
2831 const int N ( numeric_cast<int>( A.columns() ) );
2832 const int lda( numeric_cast<int>( A.spacing() ) );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
2834 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2835 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS-path addition assignment kernel, double precision overload.
// EnableIf selects this overload when UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// The work is delegated to cblas_dgemv with alpha = scalar and beta = 1.0, which
// accumulates the scaled product into y (y = scalar*A*x + y).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
2855 template<
typename VT1
2859 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2860 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2862 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
2868 const int M ( numeric_cast<int>( A.rows() ) );
2869 const int N ( numeric_cast<int>( A.columns() ) );
2870 const int lda( numeric_cast<int>( A.spacing() ) );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
2872 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
2873 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS-path addition assignment kernel, single precision complex overload.
// EnableIf selects this overload when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> is true.
// The work is delegated to cblas_cgemv, which accumulates the scaled product into y
// (y = alpha*A*x + beta*y with alpha = scalar and beta = 1).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
2893 template<
typename VT1
2897 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2898 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2900 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
2909 const int M ( numeric_cast<int>( A.rows() ) );
2910 const int N ( numeric_cast<int>( A.columns() ) );
2911 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, so both are materialized
// as complex<float> objects first.
2912 const complex<float> alpha( scalar );
2913 const complex<float> beta ( 1.0F, 0.0F );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
2915 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2916 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS-path addition assignment kernel, double precision complex overload.
// EnableIf selects this overload when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> is true.
// The work is delegated to cblas_zgemv, which accumulates the scaled product into y
// (y = alpha*A*x + beta*y with alpha = scalar and beta = 1).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
2936 template<
typename VT1
2940 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2941 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2943 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
2952 const int M ( numeric_cast<int>( A.rows() ) );
2953 const int N ( numeric_cast<int>( A.columns() ) );
2954 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, so both are materialized
// as complex<double> objects first.
2955 const complex<double> alpha( scalar );
2956 const complex<double> beta ( 1.0, 0.0 );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
2958 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
2959 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled matrix/vector product (rhs) to the dense vector lhs.
// The visible statements fetch the left and right operands of the wrapped product
// rhs.vector_, test whether the left operand has zero rows or zero columns, and finally
// dispatch to selectSubAssignKernel() with the scale factor rhs.scalar_.
// NOTE(review): the body of the emptiness check and the definitions of A and x are on
// lines that this listing omits; presumably A and x are derived from left and right.
2980 template<
typename VT1 >
2981 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2987 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2988 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2990 if( left.rows() == 0UL || left.columns() == 0UL ) {
3002 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel selection for the subtraction assignment y -= A * x * scalar.
// Depending on a condition, the call is routed either to selectDefaultSubAssignKernel()
// or to selectBlasSubAssignKernel(); all four arguments are forwarded unchanged.
// NOTE(review): only the first operand of the condition is visible here
// ( IsComputation<MT>::value && !evaluateMatrix ); the remainder of the condition and the
// else keyword are on lines that this listing omits.
3017 template<
typename VT1
3021 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3023 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
3025 DVecScalarMultExpr::selectDefaultSubAssignKernel( y, A, x, scalar );
3027 DVecScalarMultExpr::selectBlasSubAssignKernel( y, A, x, scalar );
// Default subtraction assignment kernel, non-vectorized overload.
// DisableIf removes this overload when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is
// true, so it is only available when the vectorized default kernel is not.
// The scaled product is expressed as A * x * scalar and handed to y.subAssign().
3045 template<
typename VT1
3049 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3050 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3052 y.subAssign( A * x * scalar );
// Default subtraction assignment kernel, vectorized overload.
// EnableIf selects this overload when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> is true.
//
// The rows of A (index i, M in total) are processed in blocks of 8, 4, 3, 2 and finally
// 1 intrinsic vectors of IT::size elements each. For every block the inner loop runs over
// all N columns j and accumulates A.load(...,j) * x1 into the xmm accumulators; afterwards
// each accumulator is multiplied by factor and subtracted from the matching part of y via
// y.load()/y.store().
//
// NOTE(review): this listing omits several original lines: the declarations of i and
// factor, the per-iteration definition of x1, the accumulator declarations of the smaller
// blocks, the guard around the final single-vector part and all closing braces. x1 is
// presumably set( x[j] ) as in the last loop, and factor presumably set( scalar ) — confirm
// against the full source.
// NOTE(review): the loop conditions test only the start offset of the last vector in a
// block against M, so the final load/store may reach past row M-1; this presumably relies
// on padded storage — confirm.
3070 template<
typename VT1
3074 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3075 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3077 typedef IntrinsicTrait<ElementType> IT;
3079 const size_t M( A.rows() );
3080 const size_t N( A.columns() );
// Blocks of eight intrinsic vectors (8*IT::size rows per iteration).
3086 for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
3087 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
3088 for(
size_t j=0UL; j<N; ++j ) {
3090 xmm1 = xmm1 + A.load(i ,j) * x1;
3091 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
3092 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
3093 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
3094 xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
3095 xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
3096 xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
3097 xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
3099 y.store( i , y.load(i ) - xmm1*factor );
3100 y.store( i+IT::size , y.load(i+IT::size ) - xmm2*factor );
3101 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
3102 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) - xmm4*factor );
3103 y.store( i+IT::size*4UL, y.load(i+IT::size*4UL) - xmm5*factor );
3104 y.store( i+IT::size*5UL, y.load(i+IT::size*5UL) - xmm6*factor );
3105 y.store( i+IT::size*6UL, y.load(i+IT::size*6UL) - xmm7*factor );
3106 y.store( i+IT::size*7UL, y.load(i+IT::size*7UL) - xmm8*factor );
// Blocks of four intrinsic vectors for the rows left over by the previous loop.
3108 for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
3110 for(
size_t j=0UL; j<N; ++j ) {
3112 xmm1 = xmm1 + A.load(i ,j) * x1;
3113 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
3114 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
3115 xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
3117 y.store( i , y.load(i ) - xmm1*factor );
3118 y.store( i+IT::size , y.load(i+IT::size ) - xmm2*factor );
3119 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
3120 y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) - xmm4*factor );
// Blocks of three intrinsic vectors.
3122 for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
3124 for(
size_t j=0UL; j<N; ++j ) {
3126 xmm1 = xmm1 + A.load(i ,j) * x1;
3127 xmm2 = xmm2 + A.load(i+IT::size ,j) * x1;
3128 xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
3130 y.store( i , y.load(i ) - xmm1*factor );
3131 y.store( i+IT::size , y.load(i+IT::size ) - xmm2*factor );
3132 y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
// Blocks of two intrinsic vectors.
3134 for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
3136 for(
size_t j=0UL; j<N; ++j ) {
3138 xmm1 = xmm1 + A.load(i ,j) * x1;
3139 xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
3141 y.store( i , y.load(i ) - xmm1*factor );
3142 y.store( i+IT::size, y.load(i+IT::size) - xmm2*factor );
// Final single intrinsic vector; here the broadcast of x[j] is written inline.
3146 for(
size_t j=0UL; j<N; ++j ) {
3147 xmm1 = xmm1 + A.load(i,j) *
set( x[j] );
3149 y.store( i, y.load(i) - xmm1*factor );
// BLAS-path subtraction assignment kernel, fallback overload.
// EnableIf selects this overload when UseDefaultKernel<VT1,MT1,VT2,ST2> is true. No BLAS
// routine is called here; the arguments y, A, x and scalar are forwarded unchanged to
// selectDefaultSubAssignKernel().
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces), so only the visible statements are described.
3168 template<
typename VT1
3172 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3173 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3175 selectDefaultSubAssignKernel( y, A, x, scalar );
// BLAS-path subtraction assignment kernel, single precision overload.
// EnableIf selects this overload when UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// The subtraction is expressed as a gemv accumulation with a negated scale factor:
// cblas_sgemv is called with alpha = -scalar and beta = 1.0F (y = y - scalar*A*x).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
3194 template<
typename VT1
3198 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3199 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3201 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
3207 const int M ( numeric_cast<int>( A.rows() ) );
3208 const int N ( numeric_cast<int>( A.columns() ) );
3209 const int lda( numeric_cast<int>( A.spacing() ) );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
3211 cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
3212 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS-path subtraction assignment kernel, double precision overload.
// EnableIf selects this overload when UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> is true.
// The subtraction is expressed as a gemv accumulation with a negated scale factor:
// cblas_dgemv is called with alpha = -scalar and beta = 1.0 (y = y - scalar*A*x).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
3232 template<
typename VT1
3236 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3237 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3239 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
3245 const int M ( numeric_cast<int>( A.rows() ) );
3246 const int N ( numeric_cast<int>( A.columns() ) );
3247 const int lda( numeric_cast<int>( A.spacing() ) );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
3249 cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
3250 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS-path subtraction assignment kernel, single precision complex overload.
// EnableIf selects this overload when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> is true.
// The subtraction is expressed as a gemv accumulation with a negated scale factor:
// cblas_cgemv is called with alpha = -scalar and beta = 1 (y = y - scalar*A*x).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
3270 template<
typename VT1
3274 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3275 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3277 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
3286 const int M ( numeric_cast<int>( A.rows() ) );
3287 const int N ( numeric_cast<int>( A.columns() ) );
3288 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, so both are materialized
// as complex<float> objects first; the negation of scalar turns the update into a subtraction.
3289 const complex<float> alpha( -scalar );
3290 const complex<float> beta ( 1.0F, 0.0F );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
3292 cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
3293 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS-path subtraction assignment kernel, double precision complex overload.
// EnableIf selects this overload when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> is true.
// The subtraction is expressed as a gemv accumulation with a negated scale factor:
// cblas_zgemv is called with alpha = -scalar and beta = 1 (y = y - scalar*A*x).
// NOTE(review): this listing omits some original lines (remaining template parameters,
// braces, possibly constraint checks); only the visible statements are described.
3313 template<
typename VT1
3317 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3318 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3320 using boost::numeric_cast;
// The CBLAS interface takes int dimensions; numeric_cast performs the conversion from
// the values returned by rows(), columns() and spacing().
3329 const int M ( numeric_cast<int>( A.rows() ) );
3330 const int N ( numeric_cast<int>( A.columns() ) );
3331 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine receives alpha and beta by address, so both are materialized
// as complex<double> objects first; the negation of scalar turns the update into a subtraction.
3332 const complex<double> alpha( -scalar );
3333 const complex<double> beta ( 1.0, 0.0 );
// A is passed as a column-major, non-transposed M x N matrix with leading dimension lda;
// x and y are passed with an increment of 1.
3335 cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
3336 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled matrix/vector product (rhs) to the dense vector lhs.
// NOTE(review): only the signature is part of this listing; the function body is on
// lines that were omitted, so its behavior cannot be described from here.
3357 template<
typename VT1 >
3358 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// SMP assignment of a scaled matrix/vector product (rhs) to the dense vector lhs.
// EnableIf makes this overload available only when UseSMPAssign<VT1> is true.
// The visible statements fetch the left and right operands of the wrapped product
// rhs.vector_ and then treat two special cases separately: a left operand with zero rows,
// and otherwise a left operand with zero columns.
// NOTE(review): the bodies of both branches and the remainder of the function are on
// lines that this listing omits.
3391 template<
typename VT1 >
3392 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3393 smpAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3399 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3400 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3402 if( left.rows() == 0UL ) {
3405 else if( left.columns() == 0UL ) {
// SMP assignment of a scaled matrix/vector product (rhs) to the sparse vector lhs.
// EnableIf makes this overload available only when UseSMPAssign<VT1> is true.
// NOTE(review): only the signature is part of this listing; the function body is on
// lines that were omitted, so its behavior cannot be described from here.
3436 template<
typename VT1 >
3437 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3438 smpAssign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// SMP addition assignment of a scaled matrix/vector product (rhs) to the dense vector lhs.
// EnableIf makes this overload available only when UseSMPAssign<VT1> is true.
// The visible statements fetch the left and right operands of the wrapped product
// rhs.vector_ and test whether the left operand has zero rows or zero columns.
// NOTE(review): the body of that check and the remainder of the function are on lines
// that this listing omits.
3467 template<
typename VT1 >
3468 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3469 smpAddAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3475 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3476 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3478 if( left.rows() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled matrix/vector product (rhs) to the dense vector lhs.
// EnableIf makes this overload available only when UseSMPAssign<VT1> is true.
// The visible statements fetch the left and right operands of the wrapped product
// rhs.vector_ and test whether the left operand has zero rows or zero columns.
// NOTE(review): the body of that check and the remainder of the function are on lines
// that this listing omits.
3512 template<
typename VT1 >
3513 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3514 smpSubAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3520 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3521 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3523 if( left.rows() == 0UL || left.columns() == 0UL ) {
// SMP multiplication assignment of a scaled matrix/vector product (rhs) to the dense vector lhs.
// EnableIf makes this overload available only when UseSMPAssign<VT1> is true.
// NOTE(review): only the signature is part of this listing; the function body is on
// lines that were omitted, so its behavior cannot be described from here.
3558 template<
typename VT1 >
3559 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3560 smpMultAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Free function returning a TDMatDVecMultExpr<T1,T2>; DisableIf removes it from overload
// resolution when T1 is a matrix/matrix multiplication expression (IsMatMatMultExpr<T1>).
// It throws std::invalid_argument with the message below.
// NOTE(review): the function name, its parameters and the condition guarding the throw
// are on lines that this listing omits; presumably this is the matrix * vector
// multiplication operator and the throw fires on a size mismatch — confirm against the
// full source.
3633 template<
typename T1
3635 inline const typename DisableIf< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >::Type
3641 throw std::invalid_argument(
"Matrix and vector sizes do not match" );
// Trait specialization fragment parameterized on MT, VT and the flag AF.
// The nested Type is the result of MultExprTrait applied to the submatrix expression type
// of const MT (SubmatrixExprTrait<const MT,AF>::Type) and the unchanged vector type VT.
// NOTE(review): the struct header is on lines that this listing omits; presumably this
// is the SubvectorExprTrait specialization for TDMatDVecMultExpr<MT,VT> — confirm.
3658 template<
typename MT,
typename VT,
bool AF >
3663 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type, VT >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4599
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ( $A=B*C$ ).
Definition: DMatDMatMultExpr.h:4329
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: TDMatDVecMultExpr.h:257
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:111
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:152
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:251
TDMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the TDMatDVecMultExpr class.
Definition: TDMatDVecMultExpr.h:281
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: TransposeFlag.h:159
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:199
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
void smpMultAssign(DenseVector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:179
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2408
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:251
Header file for the DenseVector base class.
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:244
Compile time check for double precision floating point types. This type trait tests whether or not the...
Definition: IsDouble.h:75
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:690
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the RequiresEvaluation type trait.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: TDMatDVecMultExpr.h:397
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Expression object for transpose dense matrix-dense vector multiplications. The TDMatDVecMultExpr class...
Definition: Forward.h:126
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
Constraint on the data type.
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:122
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:253
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:254
Header file for the multiplication trait.
Header file for the IsDouble type trait.
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: TDMatDVecMultExpr.h:398
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a column-major dense or sparse matri...
Definition: StorageOrder.h:161
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the IsMatMatMultExpr type trait class.
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:110
Header file for the IsBlasCompatible type trait.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:271
LeftOperand leftOperand() const
Returns the left-hand side transpose dense matrix operand.
Definition: TDMatDVecMultExpr.h:333
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
const size_t end_
End of the unrolled calculation loop.
Definition: TDMatDVecMultExpr.h:399
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
const ElementType ReturnType
Return type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:247
Constraints on the storage order of matrix types.
Constraint on the data type.
const ResultType CompositeType
Data type for composite expression templates.
Definition: TDMatDVecMultExpr.h:248
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2406
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:361
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:260
Header file for the EnableIf class template.
Header file for the serial shim.
size_t size() const
Returns the current size/dimension of the vector.
Definition: TDMatDVecMultExpr.h:323
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:92
Header file for the IsNumeric type trait.
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: TDMatDVecMultExpr.h:367
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:112
Header file for the SubmatrixExprTrait class template.
System settings for the BLAS mode.
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: TDMatDVecMultExpr.h:243
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: TDMatDVecMultExpr.h:246
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
const size_t TDMATDVECMULT_THRESHOLD
Column-major dense matrix/dense vector multiplication threshold. This setting specifies the threshold ...
Definition: Thresholds.h:74
Base template for the MultTrait class.
Definition: MultTrait.h:141
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:301
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: TDMatDVecMultExpr.h:355
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to a matrix.
Definition: Matrix.h:331
const size_t SMP_TDMATDVECMULT_THRESHOLD
SMP column-major dense matrix/dense vector multiplication threshold. This threshold specifies when a c...
Definition: Thresholds.h:345
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:113
Header file for all intrinsic functionality.
ResultType::ElementType ElementType
Resulting element type.
Definition: TDMatDVecMultExpr.h:245
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: TDMatDVecMultExpr.h:296
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
TDMatDVecMultExpr< MT, VT > This
Type of this TDMatDVecMultExpr instance.
Definition: TDMatDVecMultExpr.h:242
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:250
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2403
size_t columns(const Matrix< MT, SO > &m)
Returns the current number of columns of the matrix.
Definition: Matrix.h:170
Header file for basic type definitions.
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: TDMatDVecMultExpr.h:115
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
Header file for the MatVecMultExpr base class.
Compile time check for single precision floating point types. This type trait tests whether or not the...
Definition: IsFloat.h:75
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: TDMatDVecMultExpr.h:387
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: TDMatDVecMultExpr.h:377
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: TDMatDVecMultExpr.h:343
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: TDMatDVecMultExpr.h:114
EnableIf< IsIntegral< T >, Set< T, sizeof(T)> >::Type::Type set(T value)
Sets all values in the vector to the given integral value.
Definition: Set.h:209
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.