35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
44 #include <boost/cast.hpp>
103 template<
typename MT
// Compile-time predicate used to enable/disable the SMP variants of the select*AssignKernel
// functions. Its value is true when either operand has to be evaluated (evaluateMatrix or
// evaluateVector); the template arguments T1, T2 and T3 do not influence the result.
135 template<
typename T1,
typename T2,
typename T3 >
136 struct UseSMPAssignKernel {
137 enum { value = evaluateMatrix || evaluateVector };
148 template<
typename T1,
typename T2,
typename T3 >
149 struct UseSinglePrecisionKernel {
150 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseDoublePrecisionKernel {
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseSinglePrecisionComplexKernel {
182 typedef complex<float> Type;
183 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseDoublePrecisionComplexKernel {
199 typedef complex<double> Type;
200 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
// Compile-time predicate selecting the default (non-BLAS) kernel. The value is true when
// BLAZE_BLAS_MODE is disabled, or when none of the four BLAS kernel predicates (single,
// double, single complex, double complex precision) holds for the given three types.
213 template<
typename T1,
typename T2,
typename T3 >
214 struct UseDefaultKernel {
215 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
216 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
217 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
218 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
229 template<
typename T1,
typename T2,
typename T3 >
230 struct UseVectorizedDefaultKernel {
231 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
265 enum { vectorizable = MT::vectorizable && VT::vectorizable &&
// Compilation switch for SMP assignment: set only when neither the matrix nor the vector
// operand has to be evaluated (both evaluateMatrix and evaluateVector are false).
271 enum { smpAssignable = !evaluateMatrix && !evaluateVector };
300 if(
mat_.columns() != 0UL ) {
302 for(
size_t j=1UL; j<
end_; j+=2UL ) {
305 if( end_ <
mat_.columns() ) {
353 template<
typename T >
355 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
365 template<
typename T >
367 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
377 return mat_.isAligned() &&
vec_.isAligned();
414 template<
typename VT1 >
421 if( rhs.
mat_.rows() == 0UL ) {
424 else if( rhs.
mat_.columns() == 0UL ) {
437 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
453 template<
typename VT1
457 selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
461 DMatDVecMultExpr::selectDefaultAssignKernel( y, A, x );
463 DMatDVecMultExpr::selectBlasAssignKernel( y, A, x );
479 template<
typename VT1
482 static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
483 selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
504 template<
typename VT1
507 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
508 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
529 template<
typename VT1
532 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
533 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
535 typedef IntrinsicTrait<ElementType> IT;
537 const size_t M( A.rows() );
538 const size_t N( A.columns() );
542 for( ; (i+8UL) <= M; i+=8UL ) {
543 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
544 for(
size_t j=0UL; j<N; j+=IT::size ) {
546 xmm1 = xmm1 + A.load(i ,j) * x1;
547 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
548 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
549 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
550 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
551 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
552 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
553 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
556 y[i+1UL] =
sum( xmm2 );
557 y[i+2UL] =
sum( xmm3 );
558 y[i+3UL] =
sum( xmm4 );
559 y[i+4UL] =
sum( xmm5 );
560 y[i+5UL] =
sum( xmm6 );
561 y[i+6UL] =
sum( xmm7 );
562 y[i+7UL] =
sum( xmm8 );
564 for( ; (i+4UL) <= M; i+=4UL ) {
566 for(
size_t j=0UL; j<N; j+=IT::size ) {
568 xmm1 = xmm1 + A.load(i ,j) * x1;
569 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
570 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
571 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
574 y[i+1UL] =
sum( xmm2 );
575 y[i+2UL] =
sum( xmm3 );
576 y[i+3UL] =
sum( xmm4 );
578 for( ; (i+3UL) <= M; i+=3UL ) {
580 for(
size_t j=0UL; j<N; j+=IT::size ) {
582 xmm1 = xmm1 + A.load(i ,j) * x1;
583 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
584 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
587 y[i+1UL] =
sum( xmm2 );
588 y[i+2UL] =
sum( xmm3 );
590 for( ; (i+2UL) <= M; i+=2UL ) {
592 for(
size_t j=0UL; j<N; j+=IT::size ) {
594 xmm1 = xmm1 + A.load(i ,j) * x1;
595 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
598 y[i+1UL] =
sum( xmm2 );
602 for(
size_t j=0UL; j<N; j+=IT::size ) {
603 xmm1 = xmm1 + A.load(i,j) * x.load(j);
// Fallback overload of the BLAS assignment kernel, enabled when UseDefaultKernel holds for
// the operand types. No BLAS routine is called; the work is forwarded unchanged to
// selectDefaultAssignKernel.
625 template<
typename VT1
628 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
629 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
631 selectDefaultAssignKernel( y, A, x );
// BLAS assignment kernel for single precision operands (enabled via UseSinglePrecisionKernel).
// Computes y = A * x by calling cblas_sgemv on the raw data of A, x and y.
651 template<
typename VT1
654 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
655 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
657 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
663 const int M ( numeric_cast<int>( A.rows() ) );
664 const int N ( numeric_cast<int>( A.columns() ) );
665 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; alpha = 1 and beta = 0 overwrite y with A*x.
// A is passed as row-major and not transposed, x and y use unit stride.
667 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0F,
668 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// BLAS assignment kernel for double precision operands (enabled via UseDoublePrecisionKernel).
// Computes y = A * x by calling cblas_dgemv on the raw data of A, x and y.
689 template<
typename VT1
692 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
693 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
695 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
701 const int M ( numeric_cast<int>( A.rows() ) );
702 const int N ( numeric_cast<int>( A.columns() ) );
703 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; alpha = 1 and beta = 0 overwrite y with A*x.
// A is passed as row-major and not transposed, x and y use unit stride.
705 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0,
706 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// BLAS assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y = A * x by calling cblas_cgemv.
727 template<
typename VT1
730 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
731 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
733 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
742 const int M ( numeric_cast<int>( A.rows() ) );
743 const int N ( numeric_cast<int>( A.columns() ) );
744 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routines take alpha and beta by address: alpha = 1+0i, beta = 0+0i.
745 const complex<float> alpha( 1.0F, 0.0F );
746 const complex<float> beta ( 0.0F, 0.0F );
// gemv computes y := alpha*A*x + beta*y, so y is overwritten with A*x (row-major A,
// no transpose, unit stride for x and y).
748 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
749 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y = A * x by calling cblas_zgemv.
770 template<
typename VT1
773 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
774 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
776 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
785 const int M ( numeric_cast<int>( A.rows() ) );
786 const int N ( numeric_cast<int>( A.columns() ) );
787 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routines take alpha and beta by address: alpha = 1+0i, beta = 0+0i.
788 const complex<double> alpha( 1.0, 0.0 );
789 const complex<double> beta ( 0.0, 0.0 );
// gemv computes y := alpha*A*x + beta*y, so y is overwritten with A*x (row-major A,
// no transpose, unit stride for x and y).
791 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
792 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
811 template<
typename VT1 >
841 template<
typename VT1 >
848 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
860 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
876 template<
typename VT1
879 static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
880 selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
882 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
884 DMatDVecMultExpr::selectDefaultAddAssignKernel( y, A, x );
886 DMatDVecMultExpr::selectBlasAddAssignKernel( y, A, x );
902 template<
typename VT1
905 static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
906 selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Non-vectorized default addition assignment kernel, selected when
// UseVectorizedDefaultKernel does not hold for the operand types. It hands the product
// expression A * x to the addAssign member of the target vector y.
927 template<
typename VT1
930 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
931 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
933 y.addAssign( A * x );
952 template<
typename VT1
955 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
956 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
958 typedef IntrinsicTrait<ElementType> IT;
960 const size_t M( A.rows() );
961 const size_t N( A.columns() );
965 for( ; (i+8UL) <= M; i+=8UL ) {
966 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
967 for(
size_t j=0UL; j<N; j+=IT::size ) {
969 xmm1 = xmm1 + A.load(i ,j) * x1;
970 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
971 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
972 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
973 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
974 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
975 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
976 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
978 y[i ] +=
sum( xmm1 );
979 y[i+1UL] +=
sum( xmm2 );
980 y[i+2UL] +=
sum( xmm3 );
981 y[i+3UL] +=
sum( xmm4 );
982 y[i+4UL] +=
sum( xmm5 );
983 y[i+5UL] +=
sum( xmm6 );
984 y[i+6UL] +=
sum( xmm7 );
985 y[i+7UL] +=
sum( xmm8 );
987 for( ; (i+4UL) <= M; i+=4UL ) {
989 for(
size_t j=0UL; j<N; j+=IT::size ) {
991 xmm1 = xmm1 + A.load(i ,j) * x1;
992 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
993 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
994 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
996 y[i ] +=
sum( xmm1 );
997 y[i+1UL] +=
sum( xmm2 );
998 y[i+2UL] +=
sum( xmm3 );
999 y[i+3UL] +=
sum( xmm4 );
1001 for( ; (i+3UL) <= M; i+=3UL ) {
1003 for(
size_t j=0UL; j<N; j+=IT::size ) {
1005 xmm1 = xmm1 + A.load(i ,j) * x1;
1006 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1007 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1009 y[i ] +=
sum( xmm1 );
1010 y[i+1UL] +=
sum( xmm2 );
1011 y[i+2UL] +=
sum( xmm3 );
1013 for( ; (i+2UL) <= M; i+=2UL ) {
1015 for(
size_t j=0UL; j<N; j+=IT::size ) {
1017 xmm1 = xmm1 + A.load(i ,j) * x1;
1018 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1020 y[i ] +=
sum( xmm1 );
1021 y[i+1UL] +=
sum( xmm2 );
1025 for(
size_t j=0UL; j<N; j+=IT::size ) {
1026 xmm1 = xmm1 + A.load(i,j) * x.load(j);
1028 y[i] +=
sum( xmm1 );
// Fallback overload of the BLAS addition assignment kernel, enabled when UseDefaultKernel
// holds for the operand types. No BLAS routine is called; the work is forwarded unchanged
// to selectDefaultAddAssignKernel.
1048 template<
typename VT1
1051 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1052 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1054 selectDefaultAddAssignKernel( y, A, x );
// BLAS addition assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y += A * x by calling cblas_sgemv.
1074 template<
typename VT1
1077 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1078 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1080 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1086 const int M ( numeric_cast<int>( A.rows() ) );
1087 const int N ( numeric_cast<int>( A.columns() ) );
1088 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; alpha = 1 and beta = 1 keep the old y and add A*x.
1090 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0F,
1091 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS addition assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y += A * x by calling cblas_dgemv.
1112 template<
typename VT1
1115 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1116 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1118 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1124 const int M ( numeric_cast<int>( A.rows() ) );
1125 const int N ( numeric_cast<int>( A.columns() ) );
1126 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; alpha = 1 and beta = 1 keep the old y and add A*x.
1128 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0,
1129 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS addition assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y += A * x by calling cblas_cgemv.
1150 template<
typename VT1
1153 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1154 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1156 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1165 const int M ( numeric_cast<int>( A.rows() ) );
1166 const int N ( numeric_cast<int>( A.columns() ) );
1167 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routines take alpha and beta by address: alpha = 1+0i, beta = 1+0i.
1168 const complex<float> alpha( 1.0F, 0.0F );
1169 const complex<float> beta ( 1.0F, 0.0F );
// gemv computes y := alpha*A*x + beta*y, i.e. the old y is kept and A*x is added.
1171 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1172 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS addition assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y += A * x by calling cblas_zgemv.
1193 template<
typename VT1
1196 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1197 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1199 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1208 const int M ( numeric_cast<int>( A.rows() ) );
1209 const int N ( numeric_cast<int>( A.columns() ) );
1210 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routines take alpha and beta by address: alpha = 1+0i, beta = 1+0i.
1211 const complex<double> alpha( 1.0, 0.0 );
1212 const complex<double> beta ( 1.0, 0.0 );
// gemv computes y := alpha*A*x + beta*y, i.e. the old y is kept and A*x is added.
1214 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1215 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1238 template<
typename VT1 >
1245 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1257 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1273 template<
typename VT1
1276 static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
1277 selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1279 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1281 DMatDVecMultExpr::selectDefaultSubAssignKernel( y, A, x );
1283 DMatDVecMultExpr::selectBlasSubAssignKernel( y, A, x );
1299 template<
typename VT1
1302 static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
1303 selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
// Non-vectorized default subtraction assignment kernel, selected when
// UseVectorizedDefaultKernel does not hold for the operand types. It hands the product
// expression A * x to the subAssign member of the target vector y.
1324 template<
typename VT1
1327 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1328 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1330 y.subAssign( A * x );
1349 template<
typename VT1
1352 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1353 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1355 typedef IntrinsicTrait<ElementType> IT;
1357 const size_t M( A.rows() );
1358 const size_t N( A.columns() );
1362 for( ; (i+8UL) <= M; i+=8UL ) {
1363 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1364 for(
size_t j=0UL; j<N; j+=IT::size ) {
1366 xmm1 = xmm1 + A.load(i ,j) * x1;
1367 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1368 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1369 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1370 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1371 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1372 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1373 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1375 y[i ] -=
sum( xmm1 );
1376 y[i+1UL] -=
sum( xmm2 );
1377 y[i+2UL] -=
sum( xmm3 );
1378 y[i+3UL] -=
sum( xmm4 );
1379 y[i+4UL] -=
sum( xmm5 );
1380 y[i+5UL] -=
sum( xmm6 );
1381 y[i+6UL] -=
sum( xmm7 );
1382 y[i+7UL] -=
sum( xmm8 );
1384 for( ; (i+4UL) <= M; i+=4UL ) {
1386 for(
size_t j=0UL; j<N; j+=IT::size ) {
1388 xmm1 = xmm1 + A.load(i ,j) * x1;
1389 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1390 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1391 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1393 y[i ] -=
sum( xmm1 );
1394 y[i+1UL] -=
sum( xmm2 );
1395 y[i+2UL] -=
sum( xmm3 );
1396 y[i+3UL] -=
sum( xmm4 );
1398 for( ; (i+3UL) <= M; i+=3UL ) {
1400 for(
size_t j=0UL; j<N; j+=IT::size ) {
1402 xmm1 = xmm1 + A.load(i ,j) * x1;
1403 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1404 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1406 y[i ] -=
sum( xmm1 );
1407 y[i+1UL] -=
sum( xmm2 );
1408 y[i+2UL] -=
sum( xmm3 );
1410 for( ; (i+2UL) <= M; i+=2UL ) {
1412 for(
size_t j=0UL; j<N; j+=IT::size ) {
1414 xmm1 = xmm1 + A.load(i ,j) * x1;
1415 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1417 y[i ] -=
sum( xmm1 );
1418 y[i+1UL] -=
sum( xmm2 );
1422 for(
size_t j=0UL; j<N; j+=IT::size ) {
1423 xmm1 = xmm1 + A.load(i,j) * x.load(j);
1425 y[i] -=
sum( xmm1 );
// Fallback overload of the BLAS subtraction assignment kernel, enabled when UseDefaultKernel
// holds for the operand types. No BLAS routine is called; the work is forwarded unchanged
// to selectDefaultSubAssignKernel.
1445 template<
typename VT1
1448 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1449 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1451 selectDefaultSubAssignKernel( y, A, x );
// BLAS subtraction assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y -= A * x by calling cblas_sgemv.
1471 template<
typename VT1
1474 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1475 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1477 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1483 const int M ( numeric_cast<int>( A.rows() ) );
1484 const int N ( numeric_cast<int>( A.columns() ) );
1485 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; alpha = -1 and beta = 1 subtract A*x from the old y.
1487 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, -1.0F,
1488 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS subtraction assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y -= A * x by calling cblas_dgemv.
1509 template<
typename VT1
1512 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1513 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1515 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1521 const int M ( numeric_cast<int>( A.rows() ) );
1522 const int N ( numeric_cast<int>( A.columns() ) );
1523 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; alpha = -1 and beta = 1 subtract A*x from the old y.
1525 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, -1.0,
1526 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS subtraction assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y -= A * x by calling cblas_cgemv.
1547 template<
typename VT1
1550 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1551 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1553 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1562 const int M ( numeric_cast<int>( A.rows() ) );
1563 const int N ( numeric_cast<int>( A.columns() ) );
1564 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routines take alpha and beta by address: alpha = -1+0i, beta = 1+0i.
1565 const complex<float> alpha( -1.0F, 0.0F );
1566 const complex<float> beta ( 1.0F, 0.0F );
// gemv computes y := alpha*A*x + beta*y, i.e. A*x is subtracted from the old y.
1568 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1569 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS subtraction assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y -= A * x by calling cblas_zgemv.
1590 template<
typename VT1
1593 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1594 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1596 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
1605 const int M ( numeric_cast<int>( A.rows() ) );
1606 const int N ( numeric_cast<int>( A.columns() ) );
1607 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routines take alpha and beta by address: alpha = -1+0i, beta = 1+0i.
1608 const complex<double> alpha( -1.0, 0.0 );
1609 const complex<double> beta ( 1.0, 0.0 );
// gemv computes y := alpha*A*x + beta*y, i.e. A*x is subtracted from the old y.
1611 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1612 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1635 template<
typename VT1 >
1684 template<
typename MT
1688 :
public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
1689 ,
private VecScalarMultExpr
1690 ,
private Computation
1694 typedef DMatDVecMultExpr<MT,VT> MVM;
// Compilation switch for the evaluation of the matrix operand. It is set when MT is a
// computation whose element type MET is identical to VET and BLAS compatible, or when
// RequiresEvaluation<MT> holds. (MET and VET are presumably the element types of the
// matrix and vector operands; their definitions are not visible in this excerpt.)
1706 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
1707 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compilation switch for the evaluation of the vector operand. It is set when
// IsComputation<VT>::value or RequiresEvaluation<VT>::value is true. Both conditions must
// inspect the vector type VT: testing RequiresEvaluation on the matrix type MT here would
// leave a vector operand that requires evaluation undetected unless the matrix operand
// happened to require evaluation as well.
1712 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
// Compile-time predicate used to enable/disable the SMP variants of the select*AssignKernel
// functions of the scaled expression. Its value is true when either operand has to be
// evaluated (evaluateMatrix or evaluateVector); T1 to T4 do not influence the result.
1719 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1720 struct UseSMPAssignKernel {
1721 enum { value = evaluateMatrix || evaluateVector };
// Compile-time predicate selecting the single precision BLAS kernel. The value is true when
// T1, T2 and T3 are all vectorizable, each of their element types satisfies IsFloat, and
// T4 is not a complex type. The kernels in this class pass the scalar's type as T4.
1730 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1731 struct UseSinglePrecisionKernel {
1732 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1733 IsFloat<typename T1::ElementType>::value &&
1734 IsFloat<typename T2::ElementType>::value &&
1735 IsFloat<typename T3::ElementType>::value &&
1736 !IsComplex<T4>::value };
// Compile-time predicate selecting the double precision BLAS kernel. The value is true when
// T1, T2 and T3 are all vectorizable, each of their element types satisfies IsDouble, and
// T4 is not a complex type. The kernels in this class pass the scalar's type as T4.
1745 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1746 struct UseDoublePrecisionKernel {
1747 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1748 IsDouble<typename T1::ElementType>::value &&
1749 IsDouble<typename T2::ElementType>::value &&
1750 IsDouble<typename T3::ElementType>::value &&
1751 !IsComplex<T4>::value };
// Compile-time predicate selecting the single precision complex BLAS kernel. The value is
// true when T1, T2 and T3 are all vectorizable and each has exactly complex<float> as its
// element type. Unlike the real-valued predicates it takes no scalar type parameter.
1760 template<
typename T1,
typename T2,
typename T3 >
1761 struct UseSinglePrecisionComplexKernel {
1762 typedef complex<float> Type;
1763 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1764 IsSame<typename T1::ElementType,Type>::value &&
1765 IsSame<typename T2::ElementType,Type>::value &&
1766 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the double precision complex BLAS kernel. The value is
// true when T1, T2 and T3 are all vectorizable and each has exactly complex<double> as its
// element type. Unlike the real-valued predicates it takes no scalar type parameter.
1775 template<
typename T1,
typename T2,
typename T3 >
1776 struct UseDoublePrecisionComplexKernel {
1777 typedef complex<double> Type;
1778 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1779 IsSame<typename T1::ElementType,Type>::value &&
1780 IsSame<typename T2::ElementType,Type>::value &&
1781 IsSame<typename T3::ElementType,Type>::value };
// Compile-time predicate selecting the default (non-BLAS) kernel of the scaled expression.
// The value is true when BLAZE_BLAS_MODE is disabled, or when none of the four BLAS kernel
// predicates holds. Only the two real-valued predicates receive the fourth type T4.
1789 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1790 struct UseDefaultKernel {
1791 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1792 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1793 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1794 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
// Compile-time predicate selecting the vectorized default kernel. The value is true when
// T1, T2 and T3 are all vectorizable, share one element type that is also identical to T4,
// and IntrinsicTrait reports both addition and multiplication for that element type.
1803 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1804 struct UseVectorizedDefaultKernel {
1805 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1806 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1807 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1808 IsSame<typename T1::ElementType,T4>::value &&
1809 IntrinsicTrait<typename T1::ElementType>::addition &&
1810 IntrinsicTrait<typename T1::ElementType>::multiplication };
1816 typedef DVecScalarMultExpr<MVM,ST,false>
This;
1817 typedef typename MultTrait<RES,ST>::Type
ResultType;
1820 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
1825 typedef const DMatDVecMultExpr<MT,VT>
LeftOperand;
// Type used for the matrix operand inside the kernels: chosen by SelectType from const MRT
// and MCT depending on evaluateMatrix (presumably const MRT when the switch is set and MCT
// otherwise; SelectType, MRT and MCT are defined outside this excerpt).
1831 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
LT;
// Type used for the vector operand inside the kernels: chosen in the same way from
// const VRT and VCT depending on evaluateVector.
1834 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
RT;
// Compilation switch for vectorized evaluation of the scaled expression. It is set when
// both MT and VT are vectorizable, MET is identical to both VET and the scalar type ST,
// and IntrinsicTrait<MET> reports both addition and multiplication.
1839 enum { vectorizable = MT::vectorizable && VT::vectorizable &&
1840 IsSame<MET,VET>::value &&
1841 IsSame<MET,ST>::value &&
1842 IntrinsicTrait<MET>::addition &&
1843 IntrinsicTrait<MET>::multiplication };
// Compilation switch for SMP assignment: set only when neither the matrix nor the vector
// operand has to be evaluated (both evaluateMatrix and evaluateVector are false).
1846 enum { smpAssignable = !evaluateMatrix && !evaluateVector };
1855 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
1869 return vector_[index] * scalar_;
// Returns the size of the scaled expression, which is the size reported by the wrapped
// matrix/vector product vector_.
1878 inline size_t size()
const {
1879 return vector_.size();
// Reports whether the expression can alias with the object at the given address. The
// query is delegated unchanged to the wrapped matrix/vector product vector_.
1909 template<
typename T >
1910 inline bool canAlias(
const T* alias )
const {
1911 return vector_.canAlias( alias );
// Reports whether the expression is aliased with the object at the given address. The
// query is delegated unchanged to the wrapped matrix/vector product vector_.
1921 template<
typename T >
1922 inline bool isAliased(
const T* alias )
const {
1923 return vector_.isAliased( alias );
1933 return vector_.isAligned();
1943 typename MVM::LeftOperand A( vector_.leftOperand() );
1945 ( IsComputation<MT>::value && !evaluateMatrix ) ||
1969 template<
typename VT1 >
1970 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
1976 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
1977 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
1979 if( left.rows() == 0UL ) {
1982 else if( left.columns() == 0UL ) {
1995 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
2010 template<
typename VT1
2014 static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
2015 selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2017 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2019 DVecScalarMultExpr::selectDefaultAssignKernel( y, A, x, scalar );
2021 DVecScalarMultExpr::selectBlasAssignKernel( y, A, x, scalar );
2036 template<
typename VT1
2040 static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
2041 selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Non-vectorized default assignment kernel of the scaled expression, selected when
// UseVectorizedDefaultKernel does not hold for the operand and scalar types. It hands the
// expression A * x * scalar to the assign member of the target vector y.
2061 template<
typename VT1
2065 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2066 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2068 y.assign( A * x * scalar );
2086 template<
typename VT1
2090 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2091 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2093 typedef IntrinsicTrait<ElementType> IT;
2095 const size_t M( A.rows() );
2096 const size_t N( A.columns() );
2100 for( ; (i+8UL) <= M; i+=8UL ) {
2101 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2102 for(
size_t j=0UL; j<N; j+=IT::size ) {
2104 xmm1 = xmm1 + A.load(i ,j) * x1;
2105 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2106 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2107 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2108 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
2109 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
2110 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
2111 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
2113 y[i ] =
sum( xmm1 ) * scalar;
2114 y[i+1UL] =
sum( xmm2 ) * scalar;
2115 y[i+2UL] =
sum( xmm3 ) * scalar;
2116 y[i+3UL] =
sum( xmm4 ) * scalar;
2117 y[i+4UL] =
sum( xmm5 ) * scalar;
2118 y[i+5UL] =
sum( xmm6 ) * scalar;
2119 y[i+6UL] =
sum( xmm7 ) * scalar;
2120 y[i+7UL] =
sum( xmm8 ) * scalar;
2122 for( ; (i+4UL) <= M; i+=4UL ) {
2124 for(
size_t j=0UL; j<N; j+=IT::size ) {
2126 xmm1 = xmm1 + A.load(i ,j) * x1;
2127 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2128 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2129 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2131 y[i ] =
sum( xmm1 ) * scalar;
2132 y[i+1UL] =
sum( xmm2 ) * scalar;
2133 y[i+2UL] =
sum( xmm3 ) * scalar;
2134 y[i+3UL] =
sum( xmm4 ) * scalar;
2136 for( ; (i+3UL) <= M; i+=3UL ) {
2138 for(
size_t j=0UL; j<N; j+=IT::size ) {
2140 xmm1 = xmm1 + A.load(i ,j) * x1;
2141 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2142 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2144 y[i ] =
sum( xmm1 ) * scalar;
2145 y[i+1UL] =
sum( xmm2 ) * scalar;
2146 y[i+2UL] =
sum( xmm3 ) * scalar;
2148 for( ; (i+2UL) <= M; i+=2UL ) {
2150 for(
size_t j=0UL; j<N; j+=IT::size ) {
2152 xmm1 = xmm1 + A.load(i ,j) * x1;
2153 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2155 y[i ] =
sum( xmm1 ) * scalar;
2156 y[i+1UL] =
sum( xmm2 ) * scalar;
2160 for(
size_t j=0UL; j<N; j+=IT::size ) {
2161 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2163 y[i] =
sum( xmm1 ) * scalar;
// Fallback overload of the scaled BLAS assignment kernel, enabled when UseDefaultKernel
// holds for the operand and scalar types. No BLAS routine is called; the work is forwarded
// unchanged to selectDefaultAssignKernel.
2182 template<
typename VT1
2186 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2187 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2189 selectDefaultAssignKernel( y, A, x, scalar );
// Scaled BLAS assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y = scalar * A * x by calling cblas_sgemv.
2208 template<
typename VT1
2212 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2213 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2215 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
2221 const int M ( numeric_cast<int>( A.rows() ) );
2222 const int N ( numeric_cast<int>( A.columns() ) );
2223 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; the scalar is passed as alpha and beta = 0, so y
// is overwritten with scalar*A*x.
2225 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2226 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
// Scaled BLAS assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y = scalar * A * x by calling cblas_dgemv.
2246 template<
typename VT1
2250 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2251 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2253 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
2259 const int M ( numeric_cast<int>( A.rows() ) );
2260 const int N ( numeric_cast<int>( A.columns() ) );
2261 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; the scalar is passed as alpha and beta = 0, so y
// is overwritten with scalar*A*x.
2263 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2264 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
// Scaled BLAS assignment kernel for complex<float> operands (enabled via
// UseSinglePrecisionComplexKernel). Computes y = scalar * A * x by calling cblas_cgemv.
2284 template<
typename VT1
2288 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2289 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2291 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
2300 const int M ( numeric_cast<int>( A.rows() ) );
2301 const int N ( numeric_cast<int>( A.columns() ) );
2302 const int lda( numeric_cast<int>( A.spacing() ) );
// The enabling predicate does not constrain ST2, so the scalar is converted to
// complex<float> here to serve as alpha; beta = 0+0i.
2303 const complex<float> alpha( scalar );
2304 const complex<float> beta ( 0.0F, 0.0F );
// gemv computes y := alpha*A*x + beta*y, so y is overwritten with scalar*A*x.
2306 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2307 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Scaled BLAS assignment kernel for complex<double> operands (enabled via
// UseDoublePrecisionComplexKernel). Computes y = scalar * A * x by calling cblas_zgemv.
2327 template<
typename VT1
2331 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2332 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2334 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
2343 const int M ( numeric_cast<int>( A.rows() ) );
2344 const int N ( numeric_cast<int>( A.columns() ) );
2345 const int lda( numeric_cast<int>( A.spacing() ) );
// The enabling predicate does not constrain ST2, so the scalar is converted to
// complex<double> here to serve as alpha; beta = 0+0i.
2346 const complex<double> alpha( scalar );
2347 const complex<double> beta ( 0.0, 0.0 );
// gemv computes y := alpha*A*x + beta*y, so y is overwritten with scalar*A*x.
2349 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2350 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2367 template<
typename VT1 >
2368 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2395 template<
typename VT1 >
2396 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2402 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2403 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2405 if( left.rows() == 0UL || left.columns() == 0UL ) {
2417 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
2432 template<
typename VT1
2436 static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
2437 selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2439 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2441 DVecScalarMultExpr::selectDefaultAddAssignKernel( y, A, x, scalar );
2443 DVecScalarMultExpr::selectBlasAddAssignKernel( y, A, x, scalar );
2458 template<
typename VT1
2462 static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
2463 selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Non-vectorized default addition assignment kernel of the scaled expression, selected when
// UseVectorizedDefaultKernel does not hold for the operand and scalar types. It hands the
// expression A * x * scalar to the addAssign member of the target vector y.
2483 template<
typename VT1
2487 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2488 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2490 y.addAssign( A * x * scalar );
2508 template<
typename VT1
2512 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2513 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2515 typedef IntrinsicTrait<ElementType> IT;
2517 const size_t M( A.rows() );
2518 const size_t N( A.columns() );
2522 for( ; (i+8UL) <= M; i+=8UL ) {
2523 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2524 for(
size_t j=0UL; j<N; j+=IT::size ) {
2526 xmm1 = xmm1 + A.load(i ,j) * x1;
2527 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2528 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2529 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2530 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
2531 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
2532 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
2533 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
2535 y[i ] +=
sum( xmm1 ) * scalar;
2536 y[i+1UL] +=
sum( xmm2 ) * scalar;
2537 y[i+2UL] +=
sum( xmm3 ) * scalar;
2538 y[i+3UL] +=
sum( xmm4 ) * scalar;
2539 y[i+4UL] +=
sum( xmm5 ) * scalar;
2540 y[i+5UL] +=
sum( xmm6 ) * scalar;
2541 y[i+6UL] +=
sum( xmm7 ) * scalar;
2542 y[i+7UL] +=
sum( xmm8 ) * scalar;
2544 for( ; (i+4UL) <= M; i+=4UL ) {
2546 for(
size_t j=0UL; j<N; j+=IT::size ) {
2548 xmm1 = xmm1 + A.load(i ,j) * x1;
2549 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2550 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2551 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2553 y[i ] +=
sum( xmm1 ) * scalar;
2554 y[i+1UL] +=
sum( xmm2 ) * scalar;
2555 y[i+2UL] +=
sum( xmm3 ) * scalar;
2556 y[i+3UL] +=
sum( xmm4 ) * scalar;
2558 for( ; (i+3UL) <= M; i+=3UL ) {
2560 for(
size_t j=0UL; j<N; j+=IT::size ) {
2562 xmm1 = xmm1 + A.load(i ,j) * x1;
2563 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2564 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2566 y[i ] +=
sum( xmm1 ) * scalar;
2567 y[i+1UL] +=
sum( xmm2 ) * scalar;
2568 y[i+2UL] +=
sum( xmm3 ) * scalar;
2570 for( ; (i+2UL) <= M; i+=2UL ) {
2572 for(
size_t j=0UL; j<N; j+=IT::size ) {
2574 xmm1 = xmm1 + A.load(i ,j) * x1;
2575 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2577 y[i ] +=
sum( xmm1 ) * scalar;
2578 y[i+1UL] +=
sum( xmm2 ) * scalar;
2582 for(
size_t j=0UL; j<N; j+=IT::size ) {
2583 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2585 y[i] +=
sum( xmm1 ) * scalar;
// Fallback overload of the scaled BLAS addition assignment kernel, enabled when
// UseDefaultKernel holds for the operand and scalar types. No BLAS routine is called; the
// work is forwarded unchanged to selectDefaultAddAssignKernel.
2604 template<
typename VT1
2608 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2609 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2611 selectDefaultAddAssignKernel( y, A, x, scalar );
// Scaled BLAS addition assignment kernel for single precision operands (enabled via
// UseSinglePrecisionKernel). Computes y += scalar * A * x by calling cblas_sgemv.
2630 template<
typename VT1
2634 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2635 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2637 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
2643 const int M ( numeric_cast<int>( A.rows() ) );
2644 const int N ( numeric_cast<int>( A.columns() ) );
2645 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; the scalar is passed as alpha and beta = 1, so
// scalar*A*x is added to the old y.
2647 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2648 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Scaled BLAS addition assignment kernel for double precision operands (enabled via
// UseDoublePrecisionKernel). Computes y += scalar * A * x by calling cblas_dgemv.
2668 template<
typename VT1
2672 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2673 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2675 using boost::numeric_cast;
// CBLAS takes int dimensions; boost::numeric_cast (range-checked per the Boost documentation)
// converts the row count, column count and row spacing of A.
2681 const int M ( numeric_cast<int>( A.rows() ) );
2682 const int N ( numeric_cast<int>( A.columns() ) );
2683 const int lda( numeric_cast<int>( A.spacing() ) );
// gemv computes y := alpha*A*x + beta*y; the scalar is passed as alpha and beta = 1, so
// scalar*A*x is added to the old y.
2685 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2686 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS add-assign dispatch, single precision complex overload: selected through
// EnableIf when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds (note that the
// scalar type is not part of this selector). Updates y in place as
// y = scalar * A * x + y by calling cblas_cgemv.
2706 template<
typename VT1
2710 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2711 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2713 using boost::numeric_cast;
// Dimensions and row spacing of A converted to int for the CBLAS interface.
2722 const int M ( numeric_cast<int>( A.rows() ) );
2723 const int N ( numeric_cast<int>( A.columns() ) );
2724 const int lda( numeric_cast<int>( A.spacing() ) );
// The complex gemv routine takes alpha and beta by address, so both are
// materialized as complex<float> objects first. beta = (1,0) keeps the old y.
2725 const complex<float> alpha( scalar );
2726 const complex<float> beta ( 1.0F, 0.0F );
2728 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2729 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS add-assign dispatch, double precision complex overload: selected through
// EnableIf when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds. Updates y in
// place as y = scalar * A * x + y by calling cblas_zgemv.
2749 template<
typename VT1
2753 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2754 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2756 using boost::numeric_cast;
// Dimensions and row spacing of A converted to int for the CBLAS interface.
2765 const int M ( numeric_cast<int>( A.rows() ) );
2766 const int N ( numeric_cast<int>( A.columns() ) );
2767 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha and beta are passed by address; beta = (1,0) accumulates onto y.
2768 const complex<double> alpha( scalar );
2769 const complex<double> beta ( 1.0, 0.0 );
2771 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2772 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled dense matrix/dense vector product to the
// dense column vector lhs. The matrix and vector operands are taken from the
// wrapped product expression rhs.vector_, and the work is handed to
// selectSubAssignKernel() together with the scaling factor rhs.scalar_.
2793 template<
typename VT1 >
2794 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2800 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2801 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
// Special case for a matrix operand without rows or without columns.
// NOTE(review): the statement guarded by this check is not visible here;
// presumably an early return, since nothing would be subtracted - confirm.
2803 if( left.rows() == 0UL || left.columns() == 0UL ) {
// NOTE(review): A and x are declared on lines not visible here; presumably
// they are the (possibly evaluated) forms of left and right - confirm.
2815 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Kernel selection for the subtraction assignment, overload that is active when
// UseSMPAssignKernel<VT1,MT1,VT2,ST2> does NOT hold (DisableIf). It chooses
// between the default kernel and the BLAS-dispatching kernel.
2830 template<
typename VT1
2834 static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
2835 selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// The condition starts with "matrix operand is a computation that is not
// evaluated up front".
// NOTE(review): the remaining terms of this condition are not visible here.
// Presumably the default kernel below is the taken branch and the BLAS kernel
// the else branch - confirm against the full source.
2837 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2839 DVecScalarMultExpr::selectDefaultSubAssignKernel( y, A, x, scalar );
2841 DVecScalarMultExpr::selectBlasSubAssignKernel( y, A, x, scalar );
// Kernel selection for the subtraction assignment, overload that is active when
// UseSMPAssignKernel<VT1,MT1,VT2,ST2> holds (EnableIf); it is the counterpart of
// the DisableIf overload with the same signature.
// NOTE(review): the body of this overload is not visible here - confirm what
// it delegates to before relying on it.
2856 template<
typename VT1
2860 static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
2861 selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Default subtraction assignment kernel, non-vectorized overload: active when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> does NOT hold (DisableIf).
2881 template<
typename VT1
2885 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2886 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
// Re-forms the scaled product as an expression and lets the target vector's
// own subAssign() member handle it.
2888 y.subAssign( A * x * scalar );
// Default subtraction assignment kernel, vectorized overload: active when
// UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds (EnableIf).
//
// Computes y[i] -= sum_j( A(i,j) * x[j] ) * scalar for every row i of A. Rows
// are processed in blocks of 8, then 4, 3 and 2, followed by a single-row loop.
// Within a block each row has its own intrinsic accumulator (xmm1..xmm8), and
// every loaded chunk of x is reused for all rows of the block.
//
// NOTE(review): the column loops advance by IT::size up to j<N and no scalar
// remainder loop is visible, so this presumably relies on padded storage of A
// and x - confirm.
// NOTE(review): the declarations of i, x1 and of the accumulators in the
// smaller blocks are on lines not visible here; x1 is presumably x.load(j), as
// used directly in the single-row loop - confirm.
2906 template<
typename VT1
2910 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2911 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2913 typedef IntrinsicTrait<ElementType> IT;
2915 const size_t M( A.rows() );
2916 const size_t N( A.columns() );
// Blocks of eight rows.
2920 for( ; (i+8UL) <= M; i+=8UL ) {
2921 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2922 for(
size_t j=0UL; j<N; j+=IT::size ) {
2924 xmm1 = xmm1 + A.load(i ,j) * x1;
2925 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2926 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2927 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2928 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
2929 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
2930 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
2931 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
// Reduce each accumulator to a single value, scale it and subtract it from y.
2933 y[i ] -=
sum( xmm1 ) * scalar;
2934 y[i+1UL] -=
sum( xmm2 ) * scalar;
2935 y[i+2UL] -=
sum( xmm3 ) * scalar;
2936 y[i+3UL] -=
sum( xmm4 ) * scalar;
2937 y[i+4UL] -=
sum( xmm5 ) * scalar;
2938 y[i+5UL] -=
sum( xmm6 ) * scalar;
2939 y[i+6UL] -=
sum( xmm7 ) * scalar;
2940 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Blocks of four rows.
2942 for( ; (i+4UL) <= M; i+=4UL ) {
2944 for(
size_t j=0UL; j<N; j+=IT::size ) {
2946 xmm1 = xmm1 + A.load(i ,j) * x1;
2947 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2948 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2949 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2951 y[i ] -=
sum( xmm1 ) * scalar;
2952 y[i+1UL] -=
sum( xmm2 ) * scalar;
2953 y[i+2UL] -=
sum( xmm3 ) * scalar;
2954 y[i+3UL] -=
sum( xmm4 ) * scalar;
// Blocks of three rows.
2956 for( ; (i+3UL) <= M; i+=3UL ) {
2958 for(
size_t j=0UL; j<N; j+=IT::size ) {
2960 xmm1 = xmm1 + A.load(i ,j) * x1;
2961 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2962 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2964 y[i ] -=
sum( xmm1 ) * scalar;
2965 y[i+1UL] -=
sum( xmm2 ) * scalar;
2966 y[i+2UL] -=
sum( xmm3 ) * scalar;
// Blocks of two rows.
2968 for( ; (i+2UL) <= M; i+=2UL ) {
2970 for(
size_t j=0UL; j<N; j+=IT::size ) {
2972 xmm1 = xmm1 + A.load(i ,j) * x1;
2973 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2975 y[i ] -=
sum( xmm1 ) * scalar;
2976 y[i+1UL] -=
sum( xmm2 ) * scalar;
// Single-row handling; here the chunk of x is loaded directly in the product.
2980 for(
size_t j=0UL; j<N; j+=IT::size ) {
2981 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2983 y[i] -=
sum( xmm1 ) * scalar;
// BLAS subtraction-assign dispatch, fallback overload: selected through EnableIf
// when UseDefaultKernel<VT1,MT1,VT2,ST2> holds. It issues no BLAS call and simply
// forwards all four arguments to selectDefaultSubAssignKernel().
3002 template<
typename VT1
3006 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3007 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3009 selectDefaultSubAssignKernel( y, A, x, scalar );
// BLAS subtraction-assign dispatch, single precision overload: selected through
// EnableIf when UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> holds. Updates y in
// place as y = y - scalar * A * x by calling cblas_sgemv.
3028 template<
typename VT1
3032 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3033 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3035 using boost::numeric_cast;
// Dimensions and row spacing of A converted to int for the CBLAS interface.
3041 const int M ( numeric_cast<int>( A.rows() ) );
3042 const int N ( numeric_cast<int>( A.columns() ) );
3043 const int lda( numeric_cast<int>( A.spacing() ) );
// The subtraction is expressed through the sign of alpha: alpha = -scalar and
// beta = 1.0F, so the negated scaled product is accumulated onto y.
3045 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, -scalar,
3046 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// BLAS subtraction-assign dispatch, double precision overload: selected through
// EnableIf when UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> holds. Updates y in
// place as y = y - scalar * A * x by calling cblas_dgemv.
3066 template<
typename VT1
3070 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3071 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3073 using boost::numeric_cast;
// Dimensions and row spacing of A converted to int for the CBLAS interface.
3079 const int M ( numeric_cast<int>( A.rows() ) );
3080 const int N ( numeric_cast<int>( A.columns() ) );
3081 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha = -scalar, beta = 1.0: subtracts the scaled product from the current y.
3083 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, -scalar,
3084 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// BLAS subtraction-assign dispatch, single precision complex overload: selected
// through EnableIf when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Updates y in place as y = y - scalar * A * x by calling cblas_cgemv.
3104 template<
typename VT1
3108 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3109 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3111 using boost::numeric_cast;
// Dimensions and row spacing of A converted to int for the CBLAS interface.
3120 const int M ( numeric_cast<int>( A.rows() ) );
3121 const int N ( numeric_cast<int>( A.columns() ) );
3122 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha carries the negated scalar and beta = (1,0) keeps the old y; both are
// complex<float> objects because the routine takes them by address.
3123 const complex<float> alpha( -scalar );
3124 const complex<float> beta ( 1.0F, 0.0F );
3126 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
3127 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// BLAS subtraction-assign dispatch, double precision complex overload: selected
// through EnableIf when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds.
// Updates y in place as y = y - scalar * A * x by calling cblas_zgemv.
3147 template<
typename VT1
3151 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3152 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3154 using boost::numeric_cast;
// Dimensions and row spacing of A converted to int for the CBLAS interface.
3163 const int M ( numeric_cast<int>( A.rows() ) );
3164 const int N ( numeric_cast<int>( A.columns() ) );
3165 const int lda( numeric_cast<int>( A.spacing() ) );
// alpha carries the negated scalar and beta = (1,0) keeps the old y; both are
// passed to the routine by address.
3166 const complex<double> alpha( -scalar );
3167 const complex<double> beta ( 1.0, 0.0 );
3169 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
3170 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled dense matrix/dense vector product (rhs)
// to the dense column vector lhs.
// NOTE(review): only the signature is visible here; the body is on lines not
// shown, so its evaluation strategy cannot be documented from this view.
3191 template<
typename VT1 >
3192 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Multiplication operator overload that is active when the left operand type T1
// is NOT a matrix/matrix multiplication expression (DisableIf on
// IsMatMatMultExpr<T1>). Its result type is the expression object
// DMatDVecMultExpr<T1,T2>.
// Throws std::invalid_argument with the message below.
// NOTE(review): the function name, its parameters and the condition guarding
// the throw are on lines not visible here; presumably a mismatch between the
// matrix column count and the vector size - confirm.
3264 template<
typename T1
3266 inline const typename DisableIf< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >::Type
3272 throw std::invalid_argument(
"Matrix and vector sizes do not match" );
// Multiplication operator overload that is active when the left operand type T1
// IS a matrix/matrix multiplication expression (EnableIf on
// IsMatMatMultExpr<T1>). The result type is taken from MultExprTrait<T1,T2>.
3300 template<
typename T1
3303 inline const typename EnableIf< IsMatMatMultExpr<T1>, MultExprTrait<T1,T2> >::Type::Type
// Restructures (A*B)*vec into A*(B*vec): the vector is first multiplied with
// the right-hand matrix operand, and the left-hand matrix operand is applied
// to that result.
3308 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
// Expression trait specialization parameterized on a matrix type MT, a vector
// type VT and the boolean flag AF.
// NOTE(review): the struct header is not visible here; presumably this is the
// subvector trait for the matrix/vector product expression - confirm.
3323 template<
typename MT,
typename VT,
bool AF >
// The resulting Type is the product of a submatrix of the (const) matrix
// operand, taken with the same flag AF, and the unchanged vector operand.
3328 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type, VT >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4579
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:254
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode.This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:342
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices ($A = B * C$).
Definition: DMatDMatMultExpr.h:4075
DMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:280
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:151
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a column dense or sparse vector type...
Definition: TransposeFlag.h:159
Efficient implementation of a compressed matrix.The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:197
Expression object for dense matrix-dense vector multiplications.The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:105
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:111
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:376
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:243
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2384
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:249
Header file for the DenseVector base class.
Compile time check for double precision floating point types.This type trait tests whether or not the...
Definition: IsDouble.h:75
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the RequiresEvaluation type trait.
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:242
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices.The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
size_t size() const
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:322
const size_t end_
End of the unrolled calculation loop.
Definition: DMatDVecMultExpr.h:398
Constraint on the data type.
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:116
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:121
Compile time check to query the requirement to evaluate an expression.Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:113
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:251
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:260
Compile time type selection.The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the dense vector SMP implementation.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:179
Base class for N-dimensional dense vectors.The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint.In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:115
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:332
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode.This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:386
Constraint on the data type.
Base class for all matrix/vector multiplication expression templates.The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:257
Constraints on the storage order of matrix types.
Constraint on the data type.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:245
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2382
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:269
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:396
Header file for the EnableIf class template.
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:91
Header file for the IsNumeric type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type.In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Intrinsic characteristics of data types.The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:397
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:354
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:209
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:112
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type.In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:248
Compile time check for data types.This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:239
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:251
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:366
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:244
Substitution Failure Is Not An Error (SFINAE) class.The DisableIf class template is an auxiliary tool...
Definition: DisableIf.h:184
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type.In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type.This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:248
Base class for all compute expression templates.The Computation class serves as a tag for all computa...
Definition: Computation.h:59
Header file for the sparse vector SMP implementation.
const size_t SMP_DMATDVECMULT_THRESHOLD
SMP row-major dense matrix/dense vector multiplication threshold.This threshold represents the system...
Definition: Thresholds.h:139
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDVecMultExpr.h:246
#define BLAZE_FUNCTION_TRACE
Function trace macro.This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2379
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:247
size_t columns(const Matrix< MT, SO > &m)
Returns the current number of columns of the matrix.
Definition: Matrix.h:154
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:295
Header file for the MatVecMultExpr base class.
Compile time check for single precision floating point types.This type trait tests whether or not the...
Definition: IsFloat.h:75
const size_t DMATDVECMULT_THRESHOLD
Row-major dense matrix/dense vector multiplication threshold.This setting specifies the threshold bet...
Definition: Thresholds.h:57
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:114
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.