35 #ifndef _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
36 #define _BLAZE_MATH_EXPRESSIONS_DMATDVECMULTEXPR_H_
44 #include <boost/cast.hpp>
102 template<
typename MT
135 template<
typename T1 >
136 struct UseSMPAssign {
137 enum { value = ( evaluateMatrix || evaluateVector ) };
148 template<
typename T1,
typename T2,
typename T3 >
149 struct UseSinglePrecisionKernel {
150 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
164 template<
typename T1,
typename T2,
typename T3 >
165 struct UseDoublePrecisionKernel {
166 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
180 template<
typename T1,
typename T2,
typename T3 >
181 struct UseSinglePrecisionComplexKernel {
182 typedef complex<float> Type;
183 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
197 template<
typename T1,
typename T2,
typename T3 >
198 struct UseDoublePrecisionComplexKernel {
199 typedef complex<double> Type;
200 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
213 template<
typename T1,
typename T2,
typename T3 >
214 struct UseDefaultKernel {
215 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
216 !UseDoublePrecisionKernel<T1,T2,T3>::value &&
217 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
218 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
229 template<
typename T1,
typename T2,
typename T3 >
230 struct UseVectorizedDefaultKernel {
231 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
265 enum { vectorizable = MT::vectorizable && VT::vectorizable &&
// Compile-time flag for SMP assignment of this expression.
// It is non-zero only when neither evaluateMatrix nor evaluateVector is set
// and both operand types (MT and VT) report smpAssignable themselves.
271 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
272 !evaluateVector && VT::smpAssignable };
301 if(
mat_.columns() != 0UL ) {
303 for(
size_t j=1UL; j<
end_; j+=2UL ) {
306 if( end_ <
mat_.columns() ) {
354 template<
typename T >
356 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
366 template<
typename T >
368 return (
mat_.isAliased( alias ) ||
vec_.isAliased( alias ) );
378 return mat_.isAligned() &&
vec_.isAligned();
415 template<
typename VT1 >
422 if( rhs.
mat_.rows() == 0UL ) {
425 else if( rhs.
mat_.columns() == 0UL ) {
438 DMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
454 template<
typename VT1
457 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
461 DMatDVecMultExpr::selectDefaultAssignKernel( y, A, x );
463 DMatDVecMultExpr::selectBlasAssignKernel( y, A, x );
482 template<
typename VT1
485 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
486 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
507 template<
typename VT1
510 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
511 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
513 typedef IntrinsicTrait<ElementType> IT;
515 const size_t M( A.rows() );
516 const size_t N( A.columns() );
520 for( ; (i+8UL) <= M; i+=8UL ) {
521 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
522 for(
size_t j=0UL; j<N; j+=IT::size ) {
524 xmm1 = xmm1 + A.load(i ,j) * x1;
525 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
526 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
527 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
528 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
529 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
530 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
531 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
534 y[i+1UL] =
sum( xmm2 );
535 y[i+2UL] =
sum( xmm3 );
536 y[i+3UL] =
sum( xmm4 );
537 y[i+4UL] =
sum( xmm5 );
538 y[i+5UL] =
sum( xmm6 );
539 y[i+6UL] =
sum( xmm7 );
540 y[i+7UL] =
sum( xmm8 );
542 for( ; (i+4UL) <= M; i+=4UL ) {
544 for(
size_t j=0UL; j<N; j+=IT::size ) {
546 xmm1 = xmm1 + A.load(i ,j) * x1;
547 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
548 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
549 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
552 y[i+1UL] =
sum( xmm2 );
553 y[i+2UL] =
sum( xmm3 );
554 y[i+3UL] =
sum( xmm4 );
556 for( ; (i+3UL) <= M; i+=3UL ) {
558 for(
size_t j=0UL; j<N; j+=IT::size ) {
560 xmm1 = xmm1 + A.load(i ,j) * x1;
561 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
562 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
565 y[i+1UL] =
sum( xmm2 );
566 y[i+2UL] =
sum( xmm3 );
568 for( ; (i+2UL) <= M; i+=2UL ) {
570 for(
size_t j=0UL; j<N; j+=IT::size ) {
572 xmm1 = xmm1 + A.load(i ,j) * x1;
573 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
576 y[i+1UL] =
sum( xmm2 );
580 for(
size_t j=0UL; j<N; j+=IT::size ) {
581 xmm1 = xmm1 + A.load(i,j) * x.load(j);
603 template<
typename VT1
606 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
607 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
609 selectDefaultAssignKernel( y, A, x );
629 template<
typename VT1
632 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
633 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
635 using boost::numeric_cast;
641 const int M ( numeric_cast<int>( A.rows() ) );
642 const int N ( numeric_cast<int>( A.columns() ) );
643 const int lda( numeric_cast<int>( A.spacing() ) );
645 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0F,
646 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
667 template<
typename VT1
670 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
671 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
673 using boost::numeric_cast;
679 const int M ( numeric_cast<int>( A.rows() ) );
680 const int N ( numeric_cast<int>( A.columns() ) );
681 const int lda( numeric_cast<int>( A.spacing() ) );
683 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0,
684 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
705 template<
typename VT1
708 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
709 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
711 using boost::numeric_cast;
720 const int M ( numeric_cast<int>( A.rows() ) );
721 const int N ( numeric_cast<int>( A.columns() ) );
722 const int lda( numeric_cast<int>( A.spacing() ) );
723 const complex<float> alpha( 1.0F, 0.0F );
724 const complex<float> beta ( 0.0F, 0.0F );
726 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
727 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
748 template<
typename VT1
751 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
752 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
754 using boost::numeric_cast;
763 const int M ( numeric_cast<int>( A.rows() ) );
764 const int N ( numeric_cast<int>( A.columns() ) );
765 const int lda( numeric_cast<int>( A.spacing() ) );
766 const complex<double> alpha( 1.0, 0.0 );
767 const complex<double> beta ( 0.0, 0.0 );
769 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
770 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
789 template<
typename VT1 >
819 template<
typename VT1 >
826 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
838 DMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
854 template<
typename VT1
857 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
859 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
861 DMatDVecMultExpr::selectDefaultAddAssignKernel( y, A, x );
863 DMatDVecMultExpr::selectBlasAddAssignKernel( y, A, x );
882 template<
typename VT1
885 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
886 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
888 y.addAssign( A * x );
907 template<
typename VT1
910 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
911 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
913 typedef IntrinsicTrait<ElementType> IT;
915 const size_t M( A.rows() );
916 const size_t N( A.columns() );
920 for( ; (i+8UL) <= M; i+=8UL ) {
921 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
922 for(
size_t j=0UL; j<N; j+=IT::size ) {
924 xmm1 = xmm1 + A.load(i ,j) * x1;
925 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
926 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
927 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
928 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
929 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
930 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
931 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
933 y[i ] +=
sum( xmm1 );
934 y[i+1UL] +=
sum( xmm2 );
935 y[i+2UL] +=
sum( xmm3 );
936 y[i+3UL] +=
sum( xmm4 );
937 y[i+4UL] +=
sum( xmm5 );
938 y[i+5UL] +=
sum( xmm6 );
939 y[i+6UL] +=
sum( xmm7 );
940 y[i+7UL] +=
sum( xmm8 );
942 for( ; (i+4UL) <= M; i+=4UL ) {
944 for(
size_t j=0UL; j<N; j+=IT::size ) {
946 xmm1 = xmm1 + A.load(i ,j) * x1;
947 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
948 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
949 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
951 y[i ] +=
sum( xmm1 );
952 y[i+1UL] +=
sum( xmm2 );
953 y[i+2UL] +=
sum( xmm3 );
954 y[i+3UL] +=
sum( xmm4 );
956 for( ; (i+3UL) <= M; i+=3UL ) {
958 for(
size_t j=0UL; j<N; j+=IT::size ) {
960 xmm1 = xmm1 + A.load(i ,j) * x1;
961 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
962 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
964 y[i ] +=
sum( xmm1 );
965 y[i+1UL] +=
sum( xmm2 );
966 y[i+2UL] +=
sum( xmm3 );
968 for( ; (i+2UL) <= M; i+=2UL ) {
970 for(
size_t j=0UL; j<N; j+=IT::size ) {
972 xmm1 = xmm1 + A.load(i ,j) * x1;
973 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
975 y[i ] +=
sum( xmm1 );
976 y[i+1UL] +=
sum( xmm2 );
980 for(
size_t j=0UL; j<N; j+=IT::size ) {
981 xmm1 = xmm1 + A.load(i,j) * x.load(j);
1003 template<
typename VT1
1006 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1007 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1009 selectDefaultAddAssignKernel( y, A, x );
1029 template<
typename VT1
1032 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1033 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1035 using boost::numeric_cast;
1041 const int M ( numeric_cast<int>( A.rows() ) );
1042 const int N ( numeric_cast<int>( A.columns() ) );
1043 const int lda( numeric_cast<int>( A.spacing() ) );
1045 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0F,
1046 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
1067 template<
typename VT1
1070 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1071 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1073 using boost::numeric_cast;
1079 const int M ( numeric_cast<int>( A.rows() ) );
1080 const int N ( numeric_cast<int>( A.columns() ) );
1081 const int lda( numeric_cast<int>( A.spacing() ) );
1083 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, 1.0,
1084 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
1105 template<
typename VT1
1108 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1109 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1111 using boost::numeric_cast;
1120 const int M ( numeric_cast<int>( A.rows() ) );
1121 const int N ( numeric_cast<int>( A.columns() ) );
1122 const int lda( numeric_cast<int>( A.spacing() ) );
1123 const complex<float> alpha( 1.0F, 0.0F );
1124 const complex<float> beta ( 1.0F, 0.0F );
1126 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1127 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1148 template<
typename VT1
1151 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1152 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1154 using boost::numeric_cast;
1163 const int M ( numeric_cast<int>( A.rows() ) );
1164 const int N ( numeric_cast<int>( A.columns() ) );
1165 const int lda( numeric_cast<int>( A.spacing() ) );
1166 const complex<double> alpha( 1.0, 0.0 );
1167 const complex<double> beta ( 1.0, 0.0 );
1169 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1170 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1193 template<
typename VT1 >
1200 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1212 DMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
1228 template<
typename VT1
1231 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1233 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
1235 DMatDVecMultExpr::selectDefaultSubAssignKernel( y, A, x );
1237 DMatDVecMultExpr::selectBlasSubAssignKernel( y, A, x );
1256 template<
typename VT1
1259 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1260 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1262 y.subAssign( A * x );
1281 template<
typename VT1
1284 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
1285 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1287 typedef IntrinsicTrait<ElementType> IT;
1289 const size_t M( A.rows() );
1290 const size_t N( A.columns() );
1294 for( ; (i+8UL) <= M; i+=8UL ) {
1295 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
1296 for(
size_t j=0UL; j<N; j+=IT::size ) {
1298 xmm1 = xmm1 + A.load(i ,j) * x1;
1299 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1300 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1301 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1302 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
1303 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
1304 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
1305 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
1307 y[i ] -=
sum( xmm1 );
1308 y[i+1UL] -=
sum( xmm2 );
1309 y[i+2UL] -=
sum( xmm3 );
1310 y[i+3UL] -=
sum( xmm4 );
1311 y[i+4UL] -=
sum( xmm5 );
1312 y[i+5UL] -=
sum( xmm6 );
1313 y[i+6UL] -=
sum( xmm7 );
1314 y[i+7UL] -=
sum( xmm8 );
1316 for( ; (i+4UL) <= M; i+=4UL ) {
1318 for(
size_t j=0UL; j<N; j+=IT::size ) {
1320 xmm1 = xmm1 + A.load(i ,j) * x1;
1321 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1322 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1323 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
1325 y[i ] -=
sum( xmm1 );
1326 y[i+1UL] -=
sum( xmm2 );
1327 y[i+2UL] -=
sum( xmm3 );
1328 y[i+3UL] -=
sum( xmm4 );
1330 for( ; (i+3UL) <= M; i+=3UL ) {
1332 for(
size_t j=0UL; j<N; j+=IT::size ) {
1334 xmm1 = xmm1 + A.load(i ,j) * x1;
1335 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1336 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
1338 y[i ] -=
sum( xmm1 );
1339 y[i+1UL] -=
sum( xmm2 );
1340 y[i+2UL] -=
sum( xmm3 );
1342 for( ; (i+2UL) <= M; i+=2UL ) {
1344 for(
size_t j=0UL; j<N; j+=IT::size ) {
1346 xmm1 = xmm1 + A.load(i ,j) * x1;
1347 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
1349 y[i ] -=
sum( xmm1 );
1350 y[i+1UL] -=
sum( xmm2 );
1354 for(
size_t j=0UL; j<N; j+=IT::size ) {
1355 xmm1 = xmm1 + A.load(i,j) * x.load(j);
1357 y[i] -=
sum( xmm1 );
1377 template<
typename VT1
1380 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
1381 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1383 selectDefaultSubAssignKernel( y, A, x );
1403 template<
typename VT1
1406 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
1407 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1409 using boost::numeric_cast;
1415 const int M ( numeric_cast<int>( A.rows() ) );
1416 const int N ( numeric_cast<int>( A.columns() ) );
1417 const int lda( numeric_cast<int>( A.spacing() ) );
1419 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, -1.0F,
1420 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
1441 template<
typename VT1
1444 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
1445 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1447 using boost::numeric_cast;
1453 const int M ( numeric_cast<int>( A.rows() ) );
1454 const int N ( numeric_cast<int>( A.columns() ) );
1455 const int lda( numeric_cast<int>( A.spacing() ) );
1457 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, -1.0,
1458 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
1479 template<
typename VT1
1482 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1483 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1485 using boost::numeric_cast;
1494 const int M ( numeric_cast<int>( A.rows() ) );
1495 const int N ( numeric_cast<int>( A.columns() ) );
1496 const int lda( numeric_cast<int>( A.spacing() ) );
1497 const complex<float> alpha( -1.0F, 0.0F );
1498 const complex<float> beta ( 1.0F, 0.0F );
1500 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1501 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1522 template<
typename VT1
1525 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
1526 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x )
1528 using boost::numeric_cast;
1537 const int M ( numeric_cast<int>( A.rows() ) );
1538 const int N ( numeric_cast<int>( A.columns() ) );
1539 const int lda( numeric_cast<int>( A.spacing() ) );
1540 const complex<double> alpha( -1.0, 0.0 );
1541 const complex<double> beta ( 1.0, 0.0 );
1543 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
1544 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
1567 template<
typename VT1 >
1603 template<
typename VT1 >
1604 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1611 if( rhs.mat_.rows() == 0UL ) {
1614 else if( rhs.mat_.columns() == 0UL ) {
1647 template<
typename VT1 >
1648 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1680 template<
typename VT1 >
1681 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1688 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1724 template<
typename VT1 >
1725 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1732 if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
1768 template<
typename VT1 >
1769 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
1818 template<
typename MT
1822 :
public DenseVector< DVecScalarMultExpr< DMatDVecMultExpr<MT,VT>, ST, false >, false >
1823 ,
private VecScalarMultExpr
1824 ,
private Computation
1828 typedef DMatDVecMultExpr<MT,VT> MVM;
// Compile-time switch for the matrix operand.
// Set when MT is a computation whose element type MET equals VET and is BLAS
// compatible, or when MT itself requires evaluation.
// NOTE(review): MET/VET are presumably the element types of the matrix and
// vector operands; their typedefs are not visible here - confirm.
1840 enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
1841 IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };
// Compile-time switch for the vector operand.
// Set when VT is a computation or when VT itself requires evaluation.
// The second test must query the vector type VT: asking the matrix type MT
// here would tie the vector's evaluation to the matrix operand and would miss
// a vector type that requires evaluation on its own.
1846 enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
1854 template<
typename T1 >
1855 struct UseSMPAssign {
1856 enum { value = ( evaluateMatrix || evaluateVector ) };
1865 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1866 struct UseSinglePrecisionKernel {
1867 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1868 IsFloat<typename T1::ElementType>::value &&
1869 IsFloat<typename T2::ElementType>::value &&
1870 IsFloat<typename T3::ElementType>::value &&
1871 !IsComplex<T4>::value };
1880 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1881 struct UseDoublePrecisionKernel {
1882 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1883 IsDouble<typename T1::ElementType>::value &&
1884 IsDouble<typename T2::ElementType>::value &&
1885 IsDouble<typename T3::ElementType>::value &&
1886 !IsComplex<T4>::value };
1895 template<
typename T1,
typename T2,
typename T3 >
1896 struct UseSinglePrecisionComplexKernel {
1897 typedef complex<float> Type;
1898 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1899 IsSame<typename T1::ElementType,Type>::value &&
1900 IsSame<typename T2::ElementType,Type>::value &&
1901 IsSame<typename T3::ElementType,Type>::value };
1910 template<
typename T1,
typename T2,
typename T3 >
1911 struct UseDoublePrecisionComplexKernel {
1912 typedef complex<double> Type;
1913 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1914 IsSame<typename T1::ElementType,Type>::value &&
1915 IsSame<typename T2::ElementType,Type>::value &&
1916 IsSame<typename T3::ElementType,Type>::value };
1924 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1925 struct UseDefaultKernel {
1926 enum { value = !
BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
1927 !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
1928 !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
1929 !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
1938 template<
typename T1,
typename T2,
typename T3,
typename T4 >
1939 struct UseVectorizedDefaultKernel {
1940 enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
1941 IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
1942 IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
1943 IsSame<typename T1::ElementType,T4>::value &&
1944 IntrinsicTrait<typename T1::ElementType>::addition &&
1945 IntrinsicTrait<typename T1::ElementType>::multiplication };
// Type of this scaled matrix/vector multiplication expression.
1951 typedef DVecScalarMultExpr<MVM,ST,false>
This;
// Result type: MultTrait applied to RES and the scalar type ST.
// NOTE(review): RES is declared outside the visible lines - confirm its meaning.
1952 typedef typename MultTrait<RES,ST>::Type
ResultType;
// Intrinsic (SIMD) type associated with ElementType via IntrinsicTrait.
1955 typedef typename IntrinsicTrait<ElementType>::Type
IntrinsicType;
// Left-hand operand of the scaling: the unscaled matrix/vector product.
1960 typedef const DMatDVecMultExpr<MT,VT>
LeftOperand;
// Matrix operand type used by the kernels, chosen by evaluateMatrix between
// 'const MRT' and 'MCT'.
// NOTE(review): presumably SelectType yields the first type when the flag is
// set; MRT/MCT are declared outside the visible lines - confirm.
1966 typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type
LT;
// Vector operand type used by the kernels, chosen by evaluateVector between
// 'const VRT' and 'VCT' (same caveat as for LT).
1969 typedef typename SelectType< evaluateVector, const VRT, VCT >::Type
RT;
// Compile-time flag for vectorization of this expression.
// Requires that both operand types are vectorizable, that MET is the same
// type as both VET and the scalar type ST, and that IntrinsicTrait<MET>
// reports both addition and multiplication.
1974 enum { vectorizable = MT::vectorizable && VT::vectorizable &&
1975 IsSame<MET,VET>::value &&
1976 IsSame<MET,ST>::value &&
1977 IntrinsicTrait<MET>::addition &&
1978 IntrinsicTrait<MET>::multiplication };
// Compile-time flag for SMP assignment of the scaled expression.
// It is non-zero only when neither evaluateMatrix nor evaluateVector is set
// and both operand types (MT and VT) report smpAssignable themselves.
1981 enum { smpAssignable = !evaluateMatrix && MT::smpAssignable &&
1982 !evaluateVector && VT::smpAssignable };
1991 explicit inline DVecScalarMultExpr(
const MVM& vector, ST scalar )
2005 return vector_[index] * scalar_;
2014 inline size_t size()
const {
2015 return vector_.size();
2045 template<
typename T >
2046 inline bool canAlias(
const T* alias )
const {
2047 return vector_.canAlias( alias );
2057 template<
typename T >
2058 inline bool isAliased(
const T* alias )
const {
2059 return vector_.isAliased( alias );
2069 return vector_.isAligned();
2079 typename MVM::LeftOperand A( vector_.leftOperand() );
2081 ( IsComputation<MT>::value && !evaluateMatrix ) ||
2105 template<
typename VT1 >
2106 friend inline void assign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2112 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2113 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2115 if( left.rows() == 0UL ) {
2118 else if( left.columns() == 0UL ) {
2131 DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
2146 template<
typename VT1
2150 static inline void selectAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2152 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2154 DVecScalarMultExpr::selectDefaultAssignKernel( y, A, x, scalar );
2156 DVecScalarMultExpr::selectBlasAssignKernel( y, A, x, scalar );
2174 template<
typename VT1
2178 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2179 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2181 y.assign( A * x * scalar );
2199 template<
typename VT1
2203 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2204 selectDefaultAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2206 typedef IntrinsicTrait<ElementType> IT;
2208 const size_t M( A.rows() );
2209 const size_t N( A.columns() );
2213 for( ; (i+8UL) <= M; i+=8UL ) {
2214 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2215 for(
size_t j=0UL; j<N; j+=IT::size ) {
2217 xmm1 = xmm1 + A.load(i ,j) * x1;
2218 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2219 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2220 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2221 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
2222 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
2223 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
2224 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
2226 y[i ] =
sum( xmm1 ) * scalar;
2227 y[i+1UL] =
sum( xmm2 ) * scalar;
2228 y[i+2UL] =
sum( xmm3 ) * scalar;
2229 y[i+3UL] =
sum( xmm4 ) * scalar;
2230 y[i+4UL] =
sum( xmm5 ) * scalar;
2231 y[i+5UL] =
sum( xmm6 ) * scalar;
2232 y[i+6UL] =
sum( xmm7 ) * scalar;
2233 y[i+7UL] =
sum( xmm8 ) * scalar;
2235 for( ; (i+4UL) <= M; i+=4UL ) {
2237 for(
size_t j=0UL; j<N; j+=IT::size ) {
2239 xmm1 = xmm1 + A.load(i ,j) * x1;
2240 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2241 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2242 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2244 y[i ] =
sum( xmm1 ) * scalar;
2245 y[i+1UL] =
sum( xmm2 ) * scalar;
2246 y[i+2UL] =
sum( xmm3 ) * scalar;
2247 y[i+3UL] =
sum( xmm4 ) * scalar;
2249 for( ; (i+3UL) <= M; i+=3UL ) {
2251 for(
size_t j=0UL; j<N; j+=IT::size ) {
2253 xmm1 = xmm1 + A.load(i ,j) * x1;
2254 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2255 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2257 y[i ] =
sum( xmm1 ) * scalar;
2258 y[i+1UL] =
sum( xmm2 ) * scalar;
2259 y[i+2UL] =
sum( xmm3 ) * scalar;
2261 for( ; (i+2UL) <= M; i+=2UL ) {
2263 for(
size_t j=0UL; j<N; j+=IT::size ) {
2265 xmm1 = xmm1 + A.load(i ,j) * x1;
2266 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2268 y[i ] =
sum( xmm1 ) * scalar;
2269 y[i+1UL] =
sum( xmm2 ) * scalar;
2273 for(
size_t j=0UL; j<N; j+=IT::size ) {
2274 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2276 y[i] =
sum( xmm1 ) * scalar;
2295 template<
typename VT1
2299 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2300 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2302 selectDefaultAssignKernel( y, A, x, scalar );
2321 template<
typename VT1
2325 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2326 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2328 using boost::numeric_cast;
2334 const int M ( numeric_cast<int>( A.rows() ) );
2335 const int N ( numeric_cast<int>( A.columns() ) );
2336 const int lda( numeric_cast<int>( A.spacing() ) );
2338 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2339 A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
2359 template<
typename VT1
2363 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2364 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2366 using boost::numeric_cast;
2372 const int M ( numeric_cast<int>( A.rows() ) );
2373 const int N ( numeric_cast<int>( A.columns() ) );
2374 const int lda( numeric_cast<int>( A.spacing() ) );
2376 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2377 A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
2397 template<
typename VT1
2401 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2402 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2404 using boost::numeric_cast;
2413 const int M ( numeric_cast<int>( A.rows() ) );
2414 const int N ( numeric_cast<int>( A.columns() ) );
2415 const int lda( numeric_cast<int>( A.spacing() ) );
2416 const complex<float> alpha( scalar );
2417 const complex<float> beta ( 0.0F, 0.0F );
2419 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2420 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2440 template<
typename VT1
2444 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2445 selectBlasAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2447 using boost::numeric_cast;
2456 const int M ( numeric_cast<int>( A.rows() ) );
2457 const int N ( numeric_cast<int>( A.columns() ) );
2458 const int lda( numeric_cast<int>( A.spacing() ) );
2459 const complex<double> alpha( scalar );
2460 const complex<double> beta ( 0.0, 0.0 );
2462 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2463 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
2480 template<
typename VT1 >
2481 friend inline void assign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2508 template<
typename VT1 >
2509 friend inline void addAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2515 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2516 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2518 if( left.rows() == 0UL || left.columns() == 0UL ) {
2530 DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
2545 template<
typename VT1
2549 static inline void selectAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2551 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2553 DVecScalarMultExpr::selectDefaultAddAssignKernel( y, A, x, scalar );
2555 DVecScalarMultExpr::selectBlasAddAssignKernel( y, A, x, scalar );
2573 template<
typename VT1
2577 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2578 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2580 y.addAssign( A * x * scalar );
2598 template<
typename VT1
2602 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2603 selectDefaultAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2605 typedef IntrinsicTrait<ElementType> IT;
2607 const size_t M( A.rows() );
2608 const size_t N( A.columns() );
2612 for( ; (i+8UL) <= M; i+=8UL ) {
2613 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2614 for(
size_t j=0UL; j<N; j+=IT::size ) {
2616 xmm1 = xmm1 + A.load(i ,j) * x1;
2617 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2618 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2619 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2620 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
2621 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
2622 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
2623 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
2625 y[i ] +=
sum( xmm1 ) * scalar;
2626 y[i+1UL] +=
sum( xmm2 ) * scalar;
2627 y[i+2UL] +=
sum( xmm3 ) * scalar;
2628 y[i+3UL] +=
sum( xmm4 ) * scalar;
2629 y[i+4UL] +=
sum( xmm5 ) * scalar;
2630 y[i+5UL] +=
sum( xmm6 ) * scalar;
2631 y[i+6UL] +=
sum( xmm7 ) * scalar;
2632 y[i+7UL] +=
sum( xmm8 ) * scalar;
2634 for( ; (i+4UL) <= M; i+=4UL ) {
2636 for(
size_t j=0UL; j<N; j+=IT::size ) {
2638 xmm1 = xmm1 + A.load(i ,j) * x1;
2639 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2640 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2641 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2643 y[i ] +=
sum( xmm1 ) * scalar;
2644 y[i+1UL] +=
sum( xmm2 ) * scalar;
2645 y[i+2UL] +=
sum( xmm3 ) * scalar;
2646 y[i+3UL] +=
sum( xmm4 ) * scalar;
2648 for( ; (i+3UL) <= M; i+=3UL ) {
2650 for(
size_t j=0UL; j<N; j+=IT::size ) {
2652 xmm1 = xmm1 + A.load(i ,j) * x1;
2653 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2654 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2656 y[i ] +=
sum( xmm1 ) * scalar;
2657 y[i+1UL] +=
sum( xmm2 ) * scalar;
2658 y[i+2UL] +=
sum( xmm3 ) * scalar;
2660 for( ; (i+2UL) <= M; i+=2UL ) {
2662 for(
size_t j=0UL; j<N; j+=IT::size ) {
2664 xmm1 = xmm1 + A.load(i ,j) * x1;
2665 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2667 y[i ] +=
sum( xmm1 ) * scalar;
2668 y[i+1UL] +=
sum( xmm2 ) * scalar;
2672 for(
size_t j=0UL; j<N; j+=IT::size ) {
2673 xmm1 = xmm1 + A.load(i,j) * x.load(j);
2675 y[i] +=
sum( xmm1 ) * scalar;
2694 template<
typename VT1
2698 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2699 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2701 selectDefaultAddAssignKernel( y, A, x, scalar );
2720 template<
typename VT1
2724 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2725 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2727 using boost::numeric_cast;
2733 const int M ( numeric_cast<int>( A.rows() ) );
2734 const int N ( numeric_cast<int>( A.columns() ) );
2735 const int lda( numeric_cast<int>( A.spacing() ) );
2737 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2738 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
2758 template<
typename VT1
2762 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
2763 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2765 using boost::numeric_cast;
2771 const int M ( numeric_cast<int>( A.rows() ) );
2772 const int N ( numeric_cast<int>( A.columns() ) );
2773 const int lda( numeric_cast<int>( A.spacing() ) );
2775 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, scalar,
2776 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS overload of the addition-assignment kernel.
// Selected when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar
// type is not part of this selection trait).
//   y      - target vector, updated in place through y.data()
//   A      - left-hand side matrix, read through A.data() with leading dimension A.spacing()
//   x      - right-hand side vector, read through x.data()
//   scalar - scaling factor, converted to complex<float> and used as alpha
// Dimensions are narrowed to int with the range-checked boost::numeric_cast.
// cblas_cgemv takes alpha and beta by address; beta is (1,0), so the call
// computes y = scalar*A*x + y.
2796 template<
typename VT1
2800 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2801 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2803 using boost::numeric_cast;
2812 const int M ( numeric_cast<int>( A.rows() ) );
2813 const int N ( numeric_cast<int>( A.columns() ) );
2814 const int lda( numeric_cast<int>( A.spacing() ) );
2815 const complex<float> alpha( scalar );
2816 const complex<float> beta ( 1.0F, 0.0F );
2818 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2819 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS overload of the addition-assignment kernel.
// Selected when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar
// type is not part of this selection trait).
//   y      - target vector, updated in place through y.data()
//   A      - left-hand side matrix, read through A.data() with leading dimension A.spacing()
//   x      - right-hand side vector, read through x.data()
//   scalar - scaling factor, converted to complex<double> and used as alpha
// Dimensions are narrowed to int with the range-checked boost::numeric_cast.
// cblas_zgemv takes alpha and beta by address; beta is (1,0), so the call
// computes y = scalar*A*x + y.
2839 template<
typename VT1
2843 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
2844 selectBlasAddAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2846 using boost::numeric_cast;
2855 const int M ( numeric_cast<int>( A.rows() ) );
2856 const int N ( numeric_cast<int>( A.columns() ) );
2857 const int lda( numeric_cast<int>( A.spacing() ) );
2858 const complex<double> alpha( scalar );
2859 const complex<double> beta ( 1.0, 0.0 );
2861 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
2862 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Subtraction assignment of a scaled matrix/vector product to a dense vector.
//   lhs - target dense vector
//   rhs - scaled expression; its wrapped product is rhs.vector_ and its factor rhs.scalar_
// The matrix and vector operands are taken from rhs.vector_. The case of a
// matrix with zero rows or zero columns is tested separately before the kernel
// dispatch to selectSubAssignKernel().
// NOTE(review): the body of the empty-matrix branch and the definitions of A and x
// are omitted from this listing; presumably the branch returns early and A / x are
// the (possibly evaluated) operands - confirm against the full file.
2883 template<
typename VT1 >
2884 friend inline void subAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
2890 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
2891 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
2893 if( left.rows() == 0UL || left.columns() == 0UL ) {
2905 DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
// Chooses between the default and the BLAS-path subtraction-assignment kernel.
// One branch calls selectDefaultSubAssignKernel(), the other
// selectBlasSubAssignKernel(), both with the unchanged arguments (y, A, x, scalar).
// The visible part of the condition selects the default kernel when the matrix
// operand is a computation expression that is not evaluated up front.
// NOTE(review): the second half of the condition (after the '||') and the 'else'
// are omitted from this listing - consult the full file for the complete rule.
2920 template<
typename VT1
2924 static inline void selectSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2926 if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
2928 DVecScalarMultExpr::selectDefaultSubAssignKernel( y, A, x, scalar );
2930 DVecScalarMultExpr::selectBlasSubAssignKernel( y, A, x, scalar );
// Non-vectorized default kernel for the subtraction assignment.
// Selected (via DisableIf) when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> does
// NOT hold. It builds the expression A * x * scalar and hands it to y.subAssign().
2948 template<
typename VT1
2952 static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2953 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2955 y.subAssign( A * x * scalar );
// Vectorized default kernel for the subtraction assignment y -= scalar * ( A * x ).
// Selected (via EnableIf) when UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> holds.
//   y      - target vector, updated element-wise
//   A      - matrix operand, read with A.load(row,j)
//   x      - vector operand, read with x.load(j)
//   scalar - factor applied to each reduced row sum
// Rows are handled in blocks of 8, 4, 3 and 2, followed by a tail section. Within
// a block, one accumulator per row collects A.load(row,j) * x1 over the columns;
// the accumulator is then reduced with sum(), scaled, and subtracted from y[row].
// NOTE(review): this listing omits the declaration of i, the accumulator
// declarations of the later blocks, the definition of x1 and all closing braces.
// x1 is presumably x.load(j), as used directly in the tail loop - confirm.
// NOTE(review): every column loop steps j by IT::size up to N and has no scalar
// remainder loop; this presumably relies on padded storage - confirm.
2973 template<
typename VT1
2977 static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
2978 selectDefaultSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
2980 typedef IntrinsicTrait<ElementType> IT;
2982 const size_t M( A.rows() );
2983 const size_t N( A.columns() );
// Blocks of eight rows.
2987 for( ; (i+8UL) <= M; i+=8UL ) {
2988 IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
2989 for(
size_t j=0UL; j<N; j+=IT::size ) {
2991 xmm1 = xmm1 + A.load(i ,j) * x1;
2992 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
2993 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
2994 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
2995 xmm5 = xmm5 + A.load(i+4UL,j) * x1;
2996 xmm6 = xmm6 + A.load(i+5UL,j) * x1;
2997 xmm7 = xmm7 + A.load(i+6UL,j) * x1;
2998 xmm8 = xmm8 + A.load(i+7UL,j) * x1;
3000 y[i ] -=
sum( xmm1 ) * scalar;
3001 y[i+1UL] -=
sum( xmm2 ) * scalar;
3002 y[i+2UL] -=
sum( xmm3 ) * scalar;
3003 y[i+3UL] -=
sum( xmm4 ) * scalar;
3004 y[i+4UL] -=
sum( xmm5 ) * scalar;
3005 y[i+5UL] -=
sum( xmm6 ) * scalar;
3006 y[i+6UL] -=
sum( xmm7 ) * scalar;
3007 y[i+7UL] -=
sum( xmm8 ) * scalar;
// Blocks of four rows.
3009 for( ; (i+4UL) <= M; i+=4UL ) {
3011 for(
size_t j=0UL; j<N; j+=IT::size ) {
3013 xmm1 = xmm1 + A.load(i ,j) * x1;
3014 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3015 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3016 xmm4 = xmm4 + A.load(i+3UL,j) * x1;
3018 y[i ] -=
sum( xmm1 ) * scalar;
3019 y[i+1UL] -=
sum( xmm2 ) * scalar;
3020 y[i+2UL] -=
sum( xmm3 ) * scalar;
3021 y[i+3UL] -=
sum( xmm4 ) * scalar;
// Blocks of three rows.
3023 for( ; (i+3UL) <= M; i+=3UL ) {
3025 for(
size_t j=0UL; j<N; j+=IT::size ) {
3027 xmm1 = xmm1 + A.load(i ,j) * x1;
3028 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3029 xmm3 = xmm3 + A.load(i+2UL,j) * x1;
3031 y[i ] -=
sum( xmm1 ) * scalar;
3032 y[i+1UL] -=
sum( xmm2 ) * scalar;
3033 y[i+2UL] -=
sum( xmm3 ) * scalar;
// Blocks of two rows.
3035 for( ; (i+2UL) <= M; i+=2UL ) {
3037 for(
size_t j=0UL; j<N; j+=IT::size ) {
3039 xmm1 = xmm1 + A.load(i ,j) * x1;
3040 xmm2 = xmm2 + A.load(i+1UL,j) * x1;
3042 y[i ] -=
sum( xmm1 ) * scalar;
3043 y[i+1UL] -=
sum( xmm2 ) * scalar;
// Tail: a single row i; the guarding statement is omitted from this listing
// (presumably a check that one row is left - confirm).
3047 for(
size_t j=0UL; j<N; j+=IT::size ) {
3048 xmm1 = xmm1 + A.load(i,j) * x.load(j);
3050 y[i] -=
sum( xmm1 ) * scalar;
// Fallback overload of the BLAS-path subtraction-assignment kernel.
// Selected (via EnableIf) when UseDefaultKernel<VT1,MT1,VT2,ST2> holds. It makes
// no BLAS call and forwards all four arguments unchanged to
// selectDefaultSubAssignKernel().
3069 template<
typename VT1
3073 static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
3074 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3076 selectDefaultSubAssignKernel( y, A, x, scalar );
// Single-precision BLAS overload of the subtraction-assignment kernel.
// Selected when UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> holds.
//   y      - target vector, updated in place through y.data()
//   A      - left-hand side matrix, read through A.data() with leading dimension A.spacing()
//   x      - right-hand side vector, read through x.data()
//   scalar - scaling factor; its negation is passed to BLAS as alpha
// Dimensions are narrowed to int with the range-checked boost::numeric_cast.
// The subtraction is expressed as a gemv with alpha = -scalar and beta = 1.0F,
// i.e. y = -scalar*A*x + y.
3095 template<
typename VT1
3099 static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3100 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3102 using boost::numeric_cast;
3108 const int M ( numeric_cast<int>( A.rows() ) );
3109 const int N ( numeric_cast<int>( A.columns() ) );
3110 const int lda( numeric_cast<int>( A.spacing() ) );
3112 cblas_sgemv( CblasRowMajor, CblasNoTrans, M, N, -scalar,
3113 A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
// Double-precision BLAS overload of the subtraction-assignment kernel.
// Selected when UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> holds.
//   y      - target vector, updated in place through y.data()
//   A      - left-hand side matrix, read through A.data() with leading dimension A.spacing()
//   x      - right-hand side vector, read through x.data()
//   scalar - scaling factor; its negation is passed to BLAS as alpha
// Dimensions are narrowed to int with the range-checked boost::numeric_cast.
// The subtraction is expressed as a gemv with alpha = -scalar and beta = 1.0,
// i.e. y = -scalar*A*x + y.
3133 template<
typename VT1
3137 static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
3138 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3140 using boost::numeric_cast;
3146 const int M ( numeric_cast<int>( A.rows() ) );
3147 const int N ( numeric_cast<int>( A.columns() ) );
3148 const int lda( numeric_cast<int>( A.spacing() ) );
3150 cblas_dgemv( CblasRowMajor, CblasNoTrans, M, N, -scalar,
3151 A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
// Single-precision complex BLAS overload of the subtraction-assignment kernel.
// Selected when UseSinglePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar
// type is not part of this selection trait).
//   y      - target vector, updated in place through y.data()
//   A      - left-hand side matrix, read through A.data() with leading dimension A.spacing()
//   x      - right-hand side vector, read through x.data()
//   scalar - scaling factor; -scalar is converted to complex<float> and used as alpha
// Dimensions are narrowed to int with the range-checked boost::numeric_cast.
// cblas_cgemv takes alpha and beta by address; beta is (1,0), so the call
// computes y = -scalar*A*x + y.
3171 template<
typename VT1
3175 static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3176 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3178 using boost::numeric_cast;
3187 const int M ( numeric_cast<int>( A.rows() ) );
3188 const int N ( numeric_cast<int>( A.columns() ) );
3189 const int lda( numeric_cast<int>( A.spacing() ) );
3190 const complex<float> alpha( -scalar );
3191 const complex<float> beta ( 1.0F, 0.0F );
3193 cblas_cgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
3194 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Double-precision complex BLAS overload of the subtraction-assignment kernel.
// Selected when UseDoublePrecisionComplexKernel<VT1,MT1,VT2> holds (the scalar
// type is not part of this selection trait).
//   y      - target vector, updated in place through y.data()
//   A      - left-hand side matrix, read through A.data() with leading dimension A.spacing()
//   x      - right-hand side vector, read through x.data()
//   scalar - scaling factor; -scalar is converted to complex<double> and used as alpha
// Dimensions are narrowed to int with the range-checked boost::numeric_cast.
// cblas_zgemv takes alpha and beta by address; beta is (1,0), so the call
// computes y = -scalar*A*x + y.
3214 template<
typename VT1
3218 static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
3219 selectBlasSubAssignKernel( VT1& y,
const MT1& A,
const VT2& x, ST2 scalar )
3221 using boost::numeric_cast;
3230 const int M ( numeric_cast<int>( A.rows() ) );
3231 const int N ( numeric_cast<int>( A.columns() ) );
3232 const int lda( numeric_cast<int>( A.spacing() ) );
3233 const complex<double> alpha( -scalar );
3234 const complex<double> beta ( 1.0, 0.0 );
3236 cblas_zgemv( CblasRowMajor, CblasNoTrans, M, N, &alpha,
3237 A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
// Multiplication assignment of a scaled matrix/vector product expression (rhs)
// to a dense vector (lhs).
// NOTE(review): only the signature is present in this listing; the body is omitted.
3258 template<
typename VT1 >
3259 friend inline void multAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// SMP assignment of a scaled matrix/vector product expression to a dense vector.
// Only takes part in overload resolution when UseSMPAssign<VT1> holds.
// The matrix and vector operands are taken from rhs.vector_. A matrix with zero
// rows and a matrix with zero columns are handled as two separate special cases
// before the general path.
// NOTE(review): the bodies of both special cases and the general path are omitted
// from this listing; what each branch does cannot be confirmed from here.
3292 template<
typename VT1 >
3293 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3294 smpAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3300 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3301 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3303 if( left.rows() == 0UL ) {
3306 else if( left.columns() == 0UL ) {
// SMP assignment of a scaled matrix/vector product expression to a sparse vector.
// Only takes part in overload resolution when UseSMPAssign<VT1> holds.
// NOTE(review): only the signature is present in this listing; the body is omitted.
3337 template<
typename VT1 >
3338 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3339 smpAssign( SparseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// SMP addition assignment of a scaled matrix/vector product expression to a
// dense vector. Only takes part in overload resolution when UseSMPAssign<VT1> holds.
// The operands are taken from rhs.vector_; a matrix with zero rows or zero
// columns is treated as a special case before the general path.
// NOTE(review): the body of that branch and the general path are omitted from
// this listing; presumably the branch returns without touching lhs - confirm.
3368 template<
typename VT1 >
3369 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3370 smpAddAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3376 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3377 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3379 if( left.rows() == 0UL || left.columns() == 0UL ) {
// SMP subtraction assignment of a scaled matrix/vector product expression to a
// dense vector. Only takes part in overload resolution when UseSMPAssign<VT1> holds.
// The operands are taken from rhs.vector_; a matrix with zero rows or zero
// columns is treated as a special case before the general path.
// NOTE(review): the body of that branch and the general path are omitted from
// this listing; presumably the branch returns without touching lhs - confirm.
3413 template<
typename VT1 >
3414 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3415 smpSubAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
3421 typename MVM::LeftOperand left ( rhs.vector_.leftOperand() );
3422 typename MVM::RightOperand right( rhs.vector_.rightOperand() );
3424 if( left.rows() == 0UL || left.columns() == 0UL ) {
// SMP multiplication assignment of a scaled matrix/vector product expression to
// a dense vector. Only takes part in overload resolution when UseSMPAssign<VT1> holds.
// NOTE(review): only the signature is present in this listing; the body is omitted.
3458 template<
typename VT1 >
3459 friend inline typename EnableIf< UseSMPAssign<VT1> >::Type
3460 smpMultAssign( DenseVector<VT1,false>& lhs,
const DVecScalarMultExpr& rhs )
// Matrix/vector multiplication operator for matrix operands that are NOT
// themselves matrix/matrix multiplication expressions (DisableIf on
// IsMatMatMultExpr<T1>). The result type is DMatDVecMultExpr<T1,T2>.
// Throws std::invalid_argument with the message shown below.
// NOTE(review): the function name, parameter list, the condition guarding the
// throw and the return statement are omitted from this listing; the guard is
// presumably a mismatch between matrix columns and vector size - confirm.
3532 template<
typename T1
3534 inline const typename DisableIf< IsMatMatMultExpr<T1>, DMatDVecMultExpr<T1,T2> >::Type
3540 throw std::invalid_argument(
"Matrix and vector sizes do not match" );
// Restructuring multiplication operator for the case where the matrix operand is
// itself a matrix/matrix multiplication expression (EnableIf on
// IsMatMatMultExpr<T1>). Instead of forming ( A * B ) * vec, it returns
// A * ( B * vec ), built from the left and right operands of the matrix expression.
// NOTE(review): the function name and parameter list are omitted from this listing.
3568 template<
typename T1
3571 inline const typename EnableIf< IsMatMatMultExpr<T1>, MultExprTrait<T1,T2> >::Type::Type
3576 return (~mat).leftOperand() * ( (~mat).
rightOperand() * vec );
// Expression-trait specialization parameterized on a matrix type MT, a vector
// type VT and a boolean flag AF. Its nested Type is the multiplication expression
// type of ( submatrix of const MT, with flag AF ) and VT.
// NOTE(review): the struct header is omitted from this listing; this is presumably
// the SubvectorExprTrait specialization for DMatDVecMultExpr<MT,VT> - confirm.
3591 template<
typename MT,
typename VT,
bool AF >
3596 typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type, VT >::Type Type;
Compile time check whether the given type is a computational expression template. This type trait clas...
Definition: IsComputation.h:89
void reset(DynamicMatrix< Type, SO > &m)
Resetting the given dense matrix.
Definition: DynamicMatrix.h:4599
SelectType< IsExpression< VT >::value, const VT, const VT & >::Type RightOperand
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:254
#define BLAZE_BLAS_IS_PARALLEL
Compilation switch for the parallel BLAS mode. This compilation switch specifies whether the used BLAS...
Definition: BLAS.h:86
RightOperand rightOperand() const
Returns the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:343
const DMatDMatMultExpr< T1, T2 > operator*(const DenseMatrix< T1, false > &lhs, const DenseMatrix< T2, false > &rhs)
Multiplication operator for the multiplication of two row-major dense matrices (\(A = B * C\)).
Definition: DMatDMatMultExpr.h:4329
DMatDVecMultExpr(const MT &mat, const VT &vec)
Constructor for the DMatDVecMultExpr class.
Definition: DMatDVecMultExpr.h:281
void smpSubAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP subtraction assignment of a matrix to dense matrix.
Definition: DenseMatrix.h:152
#define BLAZE_CONSTRAINT_MUST_BE_COLUMN_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a column dense or sparse vector type...
Definition: TransposeFlag.h:159
Efficient implementation of a compressed matrix. The CompressedMatrix class template is the represent...
Definition: CompressedMatrix.h:199
Expression object for dense matrix-dense vector multiplications. The DMatDVecMultExpr class represents...
Definition: DMatDVecMultExpr.h:104
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional matrix type...
Definition: DenseMatrix.h:79
MT::ResultType MRT
Result type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:110
void smpMultAssign(DenseVector< VT1, TF1 > &lhs, const Vector< VT2, TF2 > &rhs)
Default implementation of the SMP multiplication assignment of a vector to a dense vector...
Definition: DenseVector.h:179
int16_t sum(const sse_int16_t &a)
Returns the sum of all elements in the 16-bit integral intrinsic vector.
Definition: Reduction.h:62
Header file for the IsSame and IsStrictlySame type traits.
Constraint on the data type.
bool isAligned() const
Returns whether the operands of the expression are properly aligned in memory.
Definition: DMatDVecMultExpr.h:377
MultTrait< MRT, VRT >::Type ResultType
Result type for expression template evaluations.
Definition: DMatDVecMultExpr.h:243
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:2408
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:251
Header file for the DenseVector base class.
const size_t SMP_DMATDVECMULT_THRESHOLD
SMP row-major dense matrix/dense vector multiplication threshold. This threshold specifies when a row-...
Definition: Thresholds.h:322
Compile time check for double precision floating point types. This type trait tests whether or not the...
Definition: IsDouble.h:75
const DMatSerialExpr< MT, SO > serial(const DenseMatrix< MT, SO > &dm)
Forces the serial evaluation of the given dense matrix expression dm.
Definition: DMatSerialExpr.h:690
Header file for the Computation base class.
Type relationship analysis. This class tests if the two data types A and B are equal. For this type comparison, the cv-qualifiers of both data types are ignored. If A and B are the same data type (ignoring the cv-qualifiers), then the value member enumeration is set to 1, the nested type definition Type is TrueType, and the class derives from TrueType. Otherwise value is set to 0, Type is FalseType, and the class derives from FalseType.
Definition: IsSame.h:158
Header file for the RequiresEvaluation type trait.
DMatDVecMultExpr< MT, VT > This
Type of this DMatDVecMultExpr instance.
Definition: DMatDVecMultExpr.h:242
Header file for the VecScalarMultExpr base class.
Header file for the IsFloat type trait.
Base class for dense matrices. The DenseMatrix class is a base class for all dense matrix classes...
Definition: DenseMatrix.h:70
Constraint on the data type.
#define BLAZE_CONSTRAINT_MUST_BE_FLOAT_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Float.h:80
size_t size() const
Returns the current size/dimension of the vector.
Definition: DMatDVecMultExpr.h:323
const size_t end_
End of the unrolled calculation loop.
Definition: DMatDVecMultExpr.h:399
Constraint on the data type.
VT::CompositeType VCT
Composite type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:115
Constraint on the data type.
Header file for the MultExprTrait class template.
void smpAddAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP addition assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:122
Compile time check to query the requirement to evaluate an expression. Via this type trait it is possi...
Definition: RequiresEvaluation.h:90
MRT::ElementType MET
Element type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:112
const This & CompositeType
Data type for composite expression templates.
Definition: CompressedMatrix.h:253
SelectType< evaluateVector, const VRT, VCT >::Type RT
Type for the assignment of the right-hand side dense vector operand.
Definition: DMatDVecMultExpr.h:260
Compile time type selection. The SelectType class template selects one of the two given types T1 and T...
Definition: SelectType.h:59
Header file for the DisableIf class template.
Header file for the multiplication trait.
Header file for the IsDouble type trait.
#define BLAZE_CONSTRAINT_MUST_BE_DOUBLE_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is of type...
Definition: Double.h:80
Header file for the IsMatMatMultExpr type trait class.
Header file for the IsBlasCompatible type trait.
void assign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the assignment of a matrix to a matrix.
Definition: Matrix.h:271
Base class for N-dimensional dense vectors. The DenseVector class is a base class for all arbitrarily ...
Definition: DenseVector.h:70
#define BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(A, B)
Data type constraint. In case the two types A and B are not the same (ignoring all cv-qualifiers of bo...
Definition: SameType.h:89
MT::CompositeType MCT
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:114
LeftOperand leftOperand() const
Returns the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:333
#define BLAZE_BLAS_MODE
Compilation switch for the BLAS mode. This compilation switch enables/disables the BLAS mode...
Definition: BLAS.h:65
bool canSMPAssign() const
Returns whether the expression can be used in SMP assignments.
Definition: DMatDVecMultExpr.h:387
Constraint on the data type.
Base class for all matrix/vector multiplication expression templates. The MatVecMultExpr class serves ...
Definition: MatVecMultExpr.h:66
#define BLAZE_CONSTRAINT_MUST_BE_REFERENCE_TYPE(T)
Constraint on the data type. In case the given data type T is not a reference type, a compilation error is created.
Definition: Reference.h:78
SelectType< evaluateMatrix, const MRT, MCT >::Type LT
Type for the assignment of the left-hand side dense matrix operand.
Definition: DMatDVecMultExpr.h:257
Constraints on the storage order of matrix types.
Constraint on the data type.
ResultType::ElementType ElementType
Resulting element type.
Definition: DMatDVecMultExpr.h:245
Type ElementType
Type of the sparse matrix elements.
Definition: CompressedMatrix.h:2406
void multAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the multiplication assignment of a matrix to a matrix.
Definition: Matrix.h:361
Header file for the SelectType class template.
Header file for all forward declarations for expression class templates.
LeftOperand mat_
Left-hand side dense matrix of the multiplication expression.
Definition: DMatDVecMultExpr.h:397
Header file for the EnableIf class template.
Header file for the serial shim.
void smpAssign(DenseMatrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the SMP assignment of a matrix to a dense matrix.
Definition: DenseMatrix.h:92
Header file for the IsNumeric type trait.
Header file for the SubmatrixExprTrait class template.
#define BLAZE_CONSTRAINT_MUST_BE_ROW_MAJOR_MATRIX_TYPE(T)
Constraint on the data type. In case the given data type T is not a row-major dense or sparse matrix t...
Definition: StorageOrder.h:81
System settings for the BLAS mode.
Intrinsic characteristics of data types. The IntrinsicTrait class template provides the intrinsic char...
Definition: IntrinsicTrait.h:748
Header file for run time assertion macros.
Base template for the MultTrait class.
Definition: MultTrait.h:141
RightOperand vec_
Right-hand side dense vector of the multiplication expression.
Definition: DMatDVecMultExpr.h:398
bool canAlias(const T *alias) const
Returns whether the expression can alias with the given address alias.
Definition: DMatDVecMultExpr.h:355
void addAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the addition assignment of a matrix to a matrix.
Definition: Matrix.h:301
VT::ResultType VRT
Result type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:111
#define BLAZE_CONSTRAINT_MUST_BE_NUMERIC_TYPE(T)
Constraint on the data type. In case the given data type T is not a numeric (integral or floating poin...
Definition: Numeric.h:79
Header file for the reset shim.
const ResultType CompositeType
Data type for composite expression templates.
Definition: DMatDVecMultExpr.h:248
Compile time check for data types. This type trait tests whether or not the given template parameter i...
Definition: IsBlasCompatible.h:99
void subAssign(Matrix< MT1, SO1 > &lhs, const Matrix< MT2, SO2 > &rhs)
Default implementation of the subtraction assignment of a matrix to matrix.
Definition: Matrix.h:331
SelectType< IsExpression< MT >::value, const MT, const MT & >::Type LeftOperand
Composite type of the left-hand side dense matrix expression.
Definition: DMatDVecMultExpr.h:251
bool isAliased(const T *alias) const
Returns whether the expression is aliased with the given address alias.
Definition: DMatDVecMultExpr.h:367
ResultType::TransposeType TransposeType
Transpose type for expression template evaluations.
Definition: DMatDVecMultExpr.h:244
Header file for all intrinsic functionality.
#define BLAZE_CONSTRAINT_MUST_BE_DENSE_VECTOR_TYPE(T)
Constraint on the data type. In case the given data type T is not a dense, N-dimensional vector type...
Definition: DenseVector.h:79
#define BLAZE_CONSTRAINT_MUST_BE_COMPLEX_TYPE(T)
Constraint on the data type. This compile time constraint checks that the given data type T is a compl...
Definition: Complex.h:80
Header file for the IsComputation type trait class.
CompressedMatrix< Type,!SO > TransposeType
Transpose type for expression template evaluations.
Definition: CompressedMatrix.h:250
Base class for all compute expression templates. The Computation class serves as a tag for all computa...
Definition: Computation.h:59
IntrinsicTrait< ElementType >::Type IntrinsicType
Resulting intrinsic element type.
Definition: DMatDVecMultExpr.h:246
#define BLAZE_FUNCTION_TRACE
Function trace macro. This macro can be used to reliably trace function calls. In case function tracin...
Definition: FunctionTrace.h:157
This ResultType
Result type for expression template evaluations.
Definition: CompressedMatrix.h:2403
const ElementType ReturnType
Return type for expression template evaluations.
Definition: DMatDVecMultExpr.h:247
size_t columns(const Matrix< MT, SO > &m)
Returns the current number of columns of the matrix.
Definition: Matrix.h:170
Header file for basic type definitions.
Header file for the IsComplex type trait.
Header file for the SubvectorExprTrait class template.
Header file for the complex data type.
ReturnType operator[](size_t index) const
Subscript operator for the direct access to the vector elements.
Definition: DMatDVecMultExpr.h:296
Header file for the MatVecMultExpr base class.
Compile time check for single precision floating point types. This type trait tests whether or not the...
Definition: IsFloat.h:75
const size_t DMATDVECMULT_THRESHOLD
Row-major dense matrix/dense vector multiplication threshold. This setting specifies the threshold bet...
Definition: Thresholds.h:57
VRT::ElementType VET
Element type of the right-hand side dense vector expression.
Definition: DMatDVecMultExpr.h:113
Constraint on the data type.
Size type of the Blaze library.
Header file for the thresholds for matrix/vector and matrix/matrix multiplications.
#define BLAZE_INTERNAL_ASSERT(expr, msg)
Run time assertion macro for internal checks. In case of an invalid run time expression, the program execution is terminated. The BLAZE_INTERNAL_ASSERT macro can be disabled by setting the BLAZE_USER_ASSERTION flag to zero or by defining NDEBUG during the compilation.
Definition: Assert.h:101
Header file for the IsExpression type trait class.
Header file for the FunctionTrace class.