#ifndef _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_
#define _BLAZE_MATH_EXPRESSIONS_TDMATDVECMULTEXPR_H_

#include <boost/cast.hpp>

template< typename MT    // Type of the left-hand side dense matrix
        , typename VT >  // Type of the right-hand side dense vector
class TDMatDVecMultExpr : public DenseVector< TDMatDVecMultExpr<MT,VT>, false >
                        , private MatVecMultExpr
                        , private Computation
{
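   // Illustrative usage sketch (not part of this header): the statement below is the kind of
   // expression that instantiates a TDMatDVecMultExpr, i.e. the product of a column-major dense
   // matrix and a dense column vector. Container names and values are hypothetical; the types
   // and the <blaze/Math.h> include are assumed from the library's public API.
   //
   //    #include <blaze/Math.h>
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 300UL, 200UL );
   //    blaze::DynamicVector<double>                    x( 200UL ), y;
   //
   //    // ... initialization of A and x ...
   //
   //    y = A * x;  // Creates a TDMatDVecMultExpr<MT,VT>, evaluated on assignment to y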
   // Evaluates to 1 if either operand requires an intermediate evaluation (selects the SMP kernels).
   template< typename T1, typename T2, typename T3 >
   struct UseSMPAssignKernel {
      enum { value = evaluateMatrix || evaluateVector };
   };

   // Evaluates to 1 if the single precision BLAS kernel (sgemv) can be used.
   template< typename T1, typename T2, typename T3 >
   struct UseSinglePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsFloat<typename T1::ElementType>::value &&
                     IsFloat<typename T2::ElementType>::value &&
                     IsFloat<typename T3::ElementType>::value };
   };

   // Evaluates to 1 if the double precision BLAS kernel (dgemv) can be used.
   template< typename T1, typename T2, typename T3 >
   struct UseDoublePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsDouble<typename T1::ElementType>::value &&
                     IsDouble<typename T2::ElementType>::value &&
                     IsDouble<typename T3::ElementType>::value };
   };

   // Evaluates to 1 if the single precision complex BLAS kernel (cgemv) can be used.
   template< typename T1, typename T2, typename T3 >
   struct UseSinglePrecisionComplexKernel {
      typedef complex<float>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   // Evaluates to 1 if the double precision complex BLAS kernel (zgemv) can be used.
   template< typename T1, typename T2, typename T3 >
   struct UseDoublePrecisionComplexKernel {
      typedef complex<double>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   // Evaluates to 1 if no suitable BLAS kernel is available and the default kernels are used.
   template< typename T1, typename T2, typename T3 >
   struct UseDefaultKernel {
      enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3>::value &&
                                           !UseDoublePrecisionKernel<T1,T2,T3>::value &&
                                           !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
                                           !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
   };

   // Evaluates to 1 if the vectorized (intrinsics-based) default kernel can be used.
   template< typename T1, typename T2, typename T3 >
   struct UseVectorizedDefaultKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
                     IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
                     IntrinsicTrait<typename T1::ElementType>::addition &&
                     IntrinsicTrait<typename T1::ElementType>::multiplication };
   };
   enum { vectorizable = MT::vectorizable && VT::vectorizable &&
                         IsSame<MET,VET>::value &&
                         IntrinsicTrait<MET>::addition &&
                         IntrinsicTrait<MET>::multiplication };

   enum { smpAssignable = !evaluateMatrix && !evaluateVector };
   // Subscript operator (excerpt): element-wise evaluation of the product for row 'index'.
   if( mat_.columns() != 0UL ) {
      res = mat_(index,0UL) * vec_[0UL];
      for( size_t j=1UL; j<end_; j+=2UL ) {
         res += mat_(index,j) * vec_[j] + mat_(index,j+1UL) * vec_[j+1UL];
      }
      if( end_ < mat_.columns() ) {
         res += mat_(index,end_) * vec_[end_];
      }
   }
   template< typename T >
   inline bool canAlias( const T* alias ) const {
      return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
   }

   template< typename T >
   inline bool isAliased( const T* alias ) const {
      return ( mat_.isAliased( alias ) || vec_.isAliased( alias ) );
   }

   inline bool isAligned() const {
      return mat_.isAligned() && vec_.isAligned();
   }
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      if( rhs.mat_.rows() == 0UL ) {
         return;
      }
      else if( rhs.mat_.columns() == 0UL ) {
         reset( ~lhs );
         return;
      }
      TDMatDVecMultExpr::selectAssignKernel( ~lhs, A, x );
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
      selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
         TDMatDVecMultExpr::selectDefaultAssignKernel( y, A, x );
      else
         TDMatDVecMultExpr::selectBlasAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
      selectAssignKernel( VT1& y, const MT1& A, const VT2& x )
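   // A minimal, self-contained sketch of the dispatch idiom used by the selectAssignKernel
   // overloads above, written with std::enable_if instead of Blaze's own EnableIf/DisableIf.
   // The trait 'UseBlas' and the function names are hypothetical and exist only for
   // illustration: one compile-time flag decides which of the two same-named overloads takes
   // part in overload resolution.
   //
   //    #include <iostream>
   //    #include <type_traits>
   //
   //    template< typename T >
   //    struct UseBlas { static const bool value = std::is_floating_point<T>::value; };
   //
   //    template< typename T >
   //    typename std::enable_if< UseBlas<T>::value >::type
   //       selectKernel( const T& ) { std::cout << "BLAS-style kernel\n"; }
   //
   //    template< typename T >
   //    typename std::enable_if< !UseBlas<T>::value >::type
   //       selectKernel( const T& ) { std::cout << "default kernel\n"; }
   //
   //    int main() {
   //       selectKernel( 1.0 );  // picks the BLAS-style overload
   //       selectKernel( 1 );    // picks the default overload
   //    }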
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t i=0UL; i<M; ++i ) {
         y[i] = x[0UL] * A(i,0UL);
      }
      for( size_t j=1UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] += x[j] * A(i    ,j);
            y[i+1UL] += x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] += x[j] * A(iend,j);
         }
      }
   }
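   // The loop nest above fixes a column j and walks down the rows i, which for a column-major
   // matrix is a contiguous memory sweep. A standalone sketch of the same access pattern on a
   // plain column-major array (hypothetical helper, assumes N >= 1):
   //
   //    #include <cstddef>
   //
   //    // y = A * x, A stored column-major with leading dimension lda: element (i,j) is a[i + j*lda]
   //    void gemv_colmajor( std::size_t M, std::size_t N, std::size_t lda,
   //                        const double* a, const double* x, double* y )
   //    {
   //       for( std::size_t i=0UL; i<M; ++i )
   //          y[i] = x[0] * a[i];                 // column 0 initializes y
   //       for( std::size_t j=1UL; j<N; ++j )
   //          for( std::size_t i=0UL; i<M; ++i )  // contiguous inner loop
   //             y[i] += x[j] * a[i + j*lda];
   //    }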
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t i( 0UL );

      for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
         y.store( i+IT::size*3UL, xmm4 );
         y.store( i+IT::size*4UL, xmm5 );
         y.store( i+IT::size*5UL, xmm6 );
         y.store( i+IT::size*6UL, xmm7 );
         y.store( i+IT::size*7UL, xmm8 );
      }
      for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
         y.store( i+IT::size*3UL, xmm4 );
      }
      for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
         IntrinsicType xmm1, xmm2, xmm3;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
      }
      for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
         IntrinsicType xmm1, xmm2;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i         ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
         }
         y.store( i         , xmm1 );
         y.store( i+IT::size, xmm2 );
      }
      if( i < M ) {
         IntrinsicType xmm1;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.load(i,j) * set( x[j] );
         }
         y.store( i, xmm1 );
      }
   }
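   // Worked example of the row blocking above, assuming double precision elements and 256-bit
   // SIMD registers, i.e. IT::size == 4 (an assumption for illustration only):
   //
   //    8*IT::size = 32 rows per iteration, 8 accumulators   (first loop)
   //    4*IT::size = 16 rows per iteration, 4 accumulators
   //    3*IT::size = 12 rows per iteration, 3 accumulators
   //    2*IT::size =  8 rows per iteration, 2 accumulators
   //    final block: at most IT::size = 4 remaining rows, 1 accumulator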
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      selectDefaultAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
                   A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
                   A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( 1.0F, 0.0F );
      const complex<float> beta ( 0.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( 1.0, 0.0 );
      const complex<double> beta ( 0.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
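   // A minimal standalone sketch (assuming a CBLAS header such as <cblas.h> is available) of
   // the BLAS routine the kernels above rely on. cblas_sgemv computes y := alpha*A*x + beta*y;
   // the plain assignment kernel uses alpha = 1 and beta = 0. The arrays below are hypothetical.
   //
   //    #include <cblas.h>
   //
   //    const int M = 3, N = 2, lda = 3;            // 3x2 matrix, column-major storage
   //    float A[lda*N] = { 1.0F, 2.0F, 3.0F,        // column 0
   //                       4.0F, 5.0F, 6.0F };      // column 1
   //    float x[N] = { 1.0F, 1.0F };
   //    float y[M];
   //
   //    cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
   //                 A, lda, x, 1, 0.0F, y, 1 );    // y = { 5, 7, 9 }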
   template< typename VT1 >  // Type of the target sparse vector
   friend inline void assign( SparseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )

   template< typename VT1 >  // Type of the target dense vector
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
         return;
      }
      TDMatDVecMultExpr::selectAddAssignKernel( ~lhs, A, x );
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
      selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
         TDMatDVecMultExpr::selectDefaultAddAssignKernel( y, A, x );
      else
         TDMatDVecMultExpr::selectBlasAddAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
      selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t j=0UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] += x[j] * A(i    ,j);
            y[i+1UL] += x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] += x[j] * A(iend,j);
         }
      }
   }
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t i( 0UL );

      for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
         IntrinsicType xmm1( y.load(i             ) );
         IntrinsicType xmm2( y.load(i+IT::size    ) );
         IntrinsicType xmm3( y.load(i+IT::size*2UL) );
         IntrinsicType xmm4( y.load(i+IT::size*3UL) );
         IntrinsicType xmm5( y.load(i+IT::size*4UL) );
         IntrinsicType xmm6( y.load(i+IT::size*5UL) );
         IntrinsicType xmm7( y.load(i+IT::size*6UL) );
         IntrinsicType xmm8( y.load(i+IT::size*7UL) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
         y.store( i+IT::size*3UL, xmm4 );
         y.store( i+IT::size*4UL, xmm5 );
         y.store( i+IT::size*5UL, xmm6 );
         y.store( i+IT::size*6UL, xmm7 );
         y.store( i+IT::size*7UL, xmm8 );
      }
      for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
         IntrinsicType xmm1( y.load(i             ) );
         IntrinsicType xmm2( y.load(i+IT::size    ) );
         IntrinsicType xmm3( y.load(i+IT::size*2UL) );
         IntrinsicType xmm4( y.load(i+IT::size*3UL) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
         y.store( i+IT::size*3UL, xmm4 );
      }
      for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
         IntrinsicType xmm1( y.load(i             ) );
         IntrinsicType xmm2( y.load(i+IT::size    ) );
         IntrinsicType xmm3( y.load(i+IT::size*2UL) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
      }
      for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
         IntrinsicType xmm1( y.load(i         ) );
         IntrinsicType xmm2( y.load(i+IT::size) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i         ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
         }
         y.store( i         , xmm1 );
         y.store( i+IT::size, xmm2 );
      }
      if( i < M ) {
         IntrinsicType xmm1( y.load(i) );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.load(i,j) * set( x[j] );
         }
         y.store( i, xmm1 );
      }
   }
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      selectDefaultAddAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, 1.0F,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, 1.0,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( 1.0F, 0.0F );
      const complex<float> beta ( 1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( 1.0, 0.0 );
      const complex<double> beta ( 1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
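   // Worked summary of the gemv parameterization used by the BLAS kernel families in this
   // class (gemv computes y := alpha*A*x + beta*y):
   //
   //    assign    :  y  = A*x   ->  alpha =  1,  beta = 0
   //    addAssign :  y += A*x   ->  alpha =  1,  beta = 1
   //    subAssign :  y -= A*x   ->  alpha = -1,  beta = 1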
   template< typename VT1 >  // Type of the target dense vector
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
   {
      if( rhs.mat_.rows() == 0UL || rhs.mat_.columns() == 0UL ) {
         return;
      }
      TDMatDVecMultExpr::selectSubAssignKernel( ~lhs, A, x );
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
      selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
         TDMatDVecMultExpr::selectDefaultSubAssignKernel( y, A, x );
      else
         TDMatDVecMultExpr::selectBlasSubAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2> >::Type
      selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t j=0UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] -= x[j] * A(i    ,j);
            y[i+1UL] -= x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] -= x[j] * A(iend,j);
         }
      }
   }
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2 >  // Type of the right-hand side vector operand
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      size_t i( 0UL );

      for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
         IntrinsicType xmm1( y.load(i             ) );
         IntrinsicType xmm2( y.load(i+IT::size    ) );
         IntrinsicType xmm3( y.load(i+IT::size*2UL) );
         IntrinsicType xmm4( y.load(i+IT::size*3UL) );
         IntrinsicType xmm5( y.load(i+IT::size*4UL) );
         IntrinsicType xmm6( y.load(i+IT::size*5UL) );
         IntrinsicType xmm7( y.load(i+IT::size*6UL) );
         IntrinsicType xmm8( y.load(i+IT::size*7UL) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 - A.load(i             ,j) * x1;
            xmm2 = xmm2 - A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 - A.load(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 - A.load(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 - A.load(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 - A.load(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 - A.load(i+IT::size*7UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
         y.store( i+IT::size*3UL, xmm4 );
         y.store( i+IT::size*4UL, xmm5 );
         y.store( i+IT::size*5UL, xmm6 );
         y.store( i+IT::size*6UL, xmm7 );
         y.store( i+IT::size*7UL, xmm8 );
      }
      for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
         IntrinsicType xmm1( y.load(i             ) );
         IntrinsicType xmm2( y.load(i+IT::size    ) );
         IntrinsicType xmm3( y.load(i+IT::size*2UL) );
         IntrinsicType xmm4( y.load(i+IT::size*3UL) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 - A.load(i             ,j) * x1;
            xmm2 = xmm2 - A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 - A.load(i+IT::size*3UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
         y.store( i+IT::size*3UL, xmm4 );
      }
      for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
         IntrinsicType xmm1( y.load(i             ) );
         IntrinsicType xmm2( y.load(i+IT::size    ) );
         IntrinsicType xmm3( y.load(i+IT::size*2UL) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 - A.load(i             ,j) * x1;
            xmm2 = xmm2 - A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 - A.load(i+IT::size*2UL,j) * x1;
         }
         y.store( i             , xmm1 );
         y.store( i+IT::size    , xmm2 );
         y.store( i+IT::size*2UL, xmm3 );
      }
      for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
         IntrinsicType xmm1( y.load(i         ) );
         IntrinsicType xmm2( y.load(i+IT::size) );
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 - A.load(i         ,j) * x1;
            xmm2 = xmm2 - A.load(i+IT::size,j) * x1;
         }
         y.store( i         , xmm1 );
         y.store( i+IT::size, xmm2 );
      }
      if( i < M ) {
         IntrinsicType xmm1( y.load(i) );
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 - A.load(i,j) * set( x[j] );
         }
         y.store( i, xmm1 );
      }
   }
   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      selectDefaultSubAssignKernel( y, A, x );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -1.0F,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -1.0,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( -1.0F, 0.0F );
      const complex<float> beta (  1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( -1.0, 0.0 );
      const complex<double> beta (  1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >  // Type of the target dense vector
   friend inline void multAssign( DenseVector<VT1,false>& lhs, const TDMatDVecMultExpr& rhs )
template< typename MT    // Type of the left-hand side dense matrix
        , typename VT    // Type of the right-hand side dense vector
        , typename ST >  // Type of the scalar value
class DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >
   : public DenseVector< DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, ST, false >, false >
   , private VecScalarMultExpr
   , private Computation
{
   typedef TDMatDVecMultExpr<MT,VT>  MVM;  // Type of the dense matrix/dense vector multiplication expression
   typedef typename MVM::ResultType  RES;  // Result type of the multiplication expression

   // Evaluation strategy of the matrix operand of the nested multiplication expression.
   enum { evaluateMatrix = ( IsComputation<MT>::value && IsSame<MET,VET>::value &&
                             IsBlasCompatible<MET>::value ) || RequiresEvaluation<MT>::value };

   // Evaluation strategy of the vector operand of the nested multiplication expression.
   enum { evaluateVector = IsComputation<VT>::value || RequiresEvaluation<VT>::value };
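   // Illustrative usage sketch (not part of this header): an expression of the form below is
   // restructured into a DVecScalarMultExpr around the TDMatDVecMultExpr, so that the scalar
   // can be folded into the matrix/vector multiplication kernels (e.g. as the BLAS 'alpha').
   // Container types are assumed from the public API; names and values are hypothetical.
   //
   //    blaze::DynamicMatrix<double,blaze::columnMajor> A( 100UL, 100UL );
   //    blaze::DynamicVector<double>                    x( 100UL ), y;
   //
   //    y = 2.5 * ( A * x );  // handled by DVecScalarMultExpr< TDMatDVecMultExpr<MT,VT>, double, false >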
   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseSMPAssignKernel {
      enum { value = evaluateMatrix || evaluateVector };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseSinglePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsFloat<typename T1::ElementType>::value &&
                     IsFloat<typename T2::ElementType>::value &&
                     IsFloat<typename T3::ElementType>::value &&
                     !IsComplex<T4>::value };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseDoublePrecisionKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsDouble<typename T1::ElementType>::value &&
                     IsDouble<typename T2::ElementType>::value &&
                     IsDouble<typename T3::ElementType>::value &&
                     !IsComplex<T4>::value };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseSinglePrecisionComplexKernel {
      typedef complex<float>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   template< typename T1, typename T2, typename T3 >
   struct UseDoublePrecisionComplexKernel {
      typedef complex<double>  Type;
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,Type>::value &&
                     IsSame<typename T2::ElementType,Type>::value &&
                     IsSame<typename T3::ElementType,Type>::value };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseDefaultKernel {
      enum { value = !BLAZE_BLAS_MODE || ( !UseSinglePrecisionKernel<T1,T2,T3,T4>::value &&
                                           !UseDoublePrecisionKernel<T1,T2,T3,T4>::value &&
                                           !UseSinglePrecisionComplexKernel<T1,T2,T3>::value &&
                                           !UseDoublePrecisionComplexKernel<T1,T2,T3>::value ) };
   };

   template< typename T1, typename T2, typename T3, typename T4 >
   struct UseVectorizedDefaultKernel {
      enum { value = T1::vectorizable && T2::vectorizable && T3::vectorizable &&
                     IsSame<typename T1::ElementType,typename T2::ElementType>::value &&
                     IsSame<typename T1::ElementType,typename T3::ElementType>::value &&
                     IsSame<typename T1::ElementType,T4>::value &&
                     IntrinsicTrait<typename T1::ElementType>::addition &&
                     IntrinsicTrait<typename T1::ElementType>::multiplication };
   };
   typedef DVecScalarMultExpr<MVM,ST,false>            This;
   typedef typename MultTrait<RES,ST>::Type            ResultType;
   typedef typename IntrinsicTrait<ElementType>::Type  IntrinsicType;
   typedef const TDMatDVecMultExpr<MT,VT>              LeftOperand;
   typedef typename SelectType< evaluateMatrix, const MRT, MCT >::Type  LT;
   typedef typename SelectType< evaluateVector, const VRT, VCT >::Type  RT;

   enum { vectorizable = MT::vectorizable && VT::vectorizable &&
                         IsSame<MET,VET>::value &&
                         IsSame<MET,ST>::value &&
                         IntrinsicTrait<MET>::addition &&
                         IntrinsicTrait<MET>::multiplication };

   enum { smpAssignable = !evaluateMatrix && !evaluateVector };
   explicit inline DVecScalarMultExpr( const MVM& vector, ST scalar )
      : vector_( vector )  // Left-hand side dense vector of the multiplication expression
      , scalar_( scalar )  // Right-hand side scalar of the multiplication expression
   {}

   inline ReturnType operator[]( size_t index ) const {
      return vector_[index] * scalar_;
   }

   inline size_t size() const {
      return vector_.size();
   }
   template< typename T >
   inline bool canAlias( const T* alias ) const {
      return vector_.canAlias( alias );
   }

   template< typename T >
   inline bool isAliased( const T* alias ) const {
      return vector_.isAliased( alias );
   }

   inline bool isAligned() const {
      return vector_.isAligned();
   }
   typename MVM::LeftOperand A( vector_.leftOperand() );
   ( IsComputation<MT>::value && !evaluateMatrix ) ||
   template< typename VT1 >  // Type of the target dense vector
   friend inline void assign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {
      typename MVM::LeftOperand  left ( rhs.vector_.leftOperand()  );
      typename MVM::RightOperand right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL ) {
         return;
      }
      else if( left.columns() == 0UL ) {
         reset( ~lhs );
         return;
      }

      DVecScalarMultExpr::selectAssignKernel( ~lhs, A, x, rhs.scalar_ );
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
      selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
         DVecScalarMultExpr::selectDefaultAssignKernel( y, A, x, scalar );
      else
         DVecScalarMultExpr::selectBlasAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
      selectAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const size_t iend( M & size_t(-2) );

      for( size_t i=0UL; i<M; ++i ) {
         y[i] = x[0UL] * A(i,0UL);
      }
      for( size_t j=1UL; j<N; ++j ) {
         for( size_t i=0UL; i<iend; i+=2UL ) {
            y[i    ] += x[j] * A(i    ,j);
            y[i+1UL] += x[j] * A(i+1UL,j);
         }
         if( iend < M ) {
            y[iend] += x[j] * A(iend,j);
         }
      }
      for( size_t i=0UL; i<M; ++i ) {
         y[i] *= scalar;
      }
   }
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const IntrinsicType factor( set( scalar ) );

      size_t i( 0UL );

      for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
         }
         y.store( i             , xmm1*factor );
         y.store( i+IT::size    , xmm2*factor );
         y.store( i+IT::size*2UL, xmm3*factor );
         y.store( i+IT::size*3UL, xmm4*factor );
         y.store( i+IT::size*4UL, xmm5*factor );
         y.store( i+IT::size*5UL, xmm6*factor );
         y.store( i+IT::size*6UL, xmm7*factor );
         y.store( i+IT::size*7UL, xmm8*factor );
      }
      for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
         }
         y.store( i             , xmm1*factor );
         y.store( i+IT::size    , xmm2*factor );
         y.store( i+IT::size*2UL, xmm3*factor );
         y.store( i+IT::size*3UL, xmm4*factor );
      }
      for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
         IntrinsicType xmm1, xmm2, xmm3;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
         }
         y.store( i             , xmm1*factor );
         y.store( i+IT::size    , xmm2*factor );
         y.store( i+IT::size*2UL, xmm3*factor );
      }
      for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
         IntrinsicType xmm1, xmm2;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i         ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
         }
         y.store( i         , xmm1*factor );
         y.store( i+IT::size, xmm2*factor );
      }
      if( i < M ) {
         IntrinsicType xmm1;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i,j) * x1;
         }
         y.store( i, xmm1*factor );
      }
   }
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      selectDefaultAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 0.0F, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 0.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( scalar );
      const complex<float> beta ( 0.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( scalar );
      const complex<double> beta ( 0.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >  // Type of the target sparse vector
   friend inline void assign( SparseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   template< typename VT1 >  // Type of the target dense vector
   friend inline void addAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {
      typename MVM::LeftOperand  left ( rhs.vector_.leftOperand()  );
      typename MVM::RightOperand right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL || left.columns() == 0UL ) {
         return;
      }

      DVecScalarMultExpr::selectAddAssignKernel( ~lhs, A, x, rhs.scalar_ );
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
      selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
         DVecScalarMultExpr::selectDefaultAddAssignKernel( y, A, x, scalar );
      else
         DVecScalarMultExpr::selectBlasAddAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
      selectAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      y.addAssign( A * x * scalar );
   }
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const IntrinsicType factor( set( scalar ) );

      size_t i( 0UL );

      for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
         }
         y.store( i             , y.load(i             ) + xmm1*factor );
         y.store( i+IT::size    , y.load(i+IT::size    ) + xmm2*factor );
         y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
         y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) + xmm4*factor );
         y.store( i+IT::size*4UL, y.load(i+IT::size*4UL) + xmm5*factor );
         y.store( i+IT::size*5UL, y.load(i+IT::size*5UL) + xmm6*factor );
         y.store( i+IT::size*6UL, y.load(i+IT::size*6UL) + xmm7*factor );
         y.store( i+IT::size*7UL, y.load(i+IT::size*7UL) + xmm8*factor );
      }
      for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
         }
         y.store( i             , y.load(i             ) + xmm1*factor );
         y.store( i+IT::size    , y.load(i+IT::size    ) + xmm2*factor );
         y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
         y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) + xmm4*factor );
      }
      for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
         IntrinsicType xmm1, xmm2, xmm3;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
         }
         y.store( i             , y.load(i             ) + xmm1*factor );
         y.store( i+IT::size    , y.load(i+IT::size    ) + xmm2*factor );
         y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) + xmm3*factor );
      }
      for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
         IntrinsicType xmm1, xmm2;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i         ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
         }
         y.store( i         , y.load(i         ) + xmm1*factor );
         y.store( i+IT::size, y.load(i+IT::size) + xmm2*factor );
      }
      if( i < M ) {
         IntrinsicType xmm1;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.load(i,j) * set( x[j] );
         }
         y.store( i, y.load(i) + xmm1*factor );
      }
   }
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      selectDefaultAddAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, scalar,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( scalar );
      const complex<float> beta ( 1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasAddAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( scalar );
      const complex<double> beta ( 1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >  // Type of the target dense vector
   friend inline void subAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
   {
      typename MVM::LeftOperand  left ( rhs.vector_.leftOperand()  );
      typename MVM::RightOperand right( rhs.vector_.rightOperand() );

      if( left.rows() == 0UL || left.columns() == 0UL ) {
         return;
      }

      DVecScalarMultExpr::selectSubAssignKernel( ~lhs, A, x, rhs.scalar_ );
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename DisableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
      selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      if( ( IsComputation<MT>::value && !evaluateMatrix ) ||
          ( A.rows() * A.columns() < TDMATDVECMULT_THRESHOLD ) )
         DVecScalarMultExpr::selectDefaultSubAssignKernel( y, A, x, scalar );
      else
         DVecScalarMultExpr::selectBlasSubAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSMPAssignKernel<VT1,MT1,VT2,ST2> >::Type
      selectSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename DisableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      y.subAssign( A * x * scalar );
   }
   template< typename VT1    // Type of the left-hand side target vector
           , typename MT1    // Type of the left-hand side matrix operand
           , typename VT2    // Type of the right-hand side vector operand
           , typename ST2 >  // Type of the scalar value
   static inline typename EnableIf< UseVectorizedDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectDefaultSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      typedef IntrinsicTrait<ElementType>  IT;

      const size_t M( A.rows()    );
      const size_t N( A.columns() );

      const IntrinsicType factor( set( scalar ) );

      size_t i( 0UL );

      for( ; (i+IT::size*7UL) < M; i+=IT::size*8UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
            xmm5 = xmm5 + A.load(i+IT::size*4UL,j) * x1;
            xmm6 = xmm6 + A.load(i+IT::size*5UL,j) * x1;
            xmm7 = xmm7 + A.load(i+IT::size*6UL,j) * x1;
            xmm8 = xmm8 + A.load(i+IT::size*7UL,j) * x1;
         }
         y.store( i             , y.load(i             ) - xmm1*factor );
         y.store( i+IT::size    , y.load(i+IT::size    ) - xmm2*factor );
         y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
         y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) - xmm4*factor );
         y.store( i+IT::size*4UL, y.load(i+IT::size*4UL) - xmm5*factor );
         y.store( i+IT::size*5UL, y.load(i+IT::size*5UL) - xmm6*factor );
         y.store( i+IT::size*6UL, y.load(i+IT::size*6UL) - xmm7*factor );
         y.store( i+IT::size*7UL, y.load(i+IT::size*7UL) - xmm8*factor );
      }
      for( ; (i+IT::size*3UL) < M; i+=IT::size*4UL ) {
         IntrinsicType xmm1, xmm2, xmm3, xmm4;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
            xmm4 = xmm4 + A.load(i+IT::size*3UL,j) * x1;
         }
         y.store( i             , y.load(i             ) - xmm1*factor );
         y.store( i+IT::size    , y.load(i+IT::size    ) - xmm2*factor );
         y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
         y.store( i+IT::size*3UL, y.load(i+IT::size*3UL) - xmm4*factor );
      }
      for( ; (i+IT::size*2UL) < M; i+=IT::size*3UL ) {
         IntrinsicType xmm1, xmm2, xmm3;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i             ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size    ,j) * x1;
            xmm3 = xmm3 + A.load(i+IT::size*2UL,j) * x1;
         }
         y.store( i             , y.load(i             ) - xmm1*factor );
         y.store( i+IT::size    , y.load(i+IT::size    ) - xmm2*factor );
         y.store( i+IT::size*2UL, y.load(i+IT::size*2UL) - xmm3*factor );
      }
      for( ; (i+IT::size) < M; i+=IT::size*2UL ) {
         IntrinsicType xmm1, xmm2;
         for( size_t j=0UL; j<N; ++j ) {
            const IntrinsicType x1( set( x[j] ) );
            xmm1 = xmm1 + A.load(i         ,j) * x1;
            xmm2 = xmm2 + A.load(i+IT::size,j) * x1;
         }
         y.store( i         , y.load(i         ) - xmm1*factor );
         y.store( i+IT::size, y.load(i+IT::size) - xmm2*factor );
      }
      if( i < M ) {
         IntrinsicType xmm1;
         for( size_t j=0UL; j<N; ++j ) {
            xmm1 = xmm1 + A.load(i,j) * set( x[j] );
         }
         y.store( i, y.load(i) - xmm1*factor );
      }
   }
   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDefaultKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      selectDefaultSubAssignKernel( y, A, x, scalar );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_sgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
                   A.data(), lda, x.data(), 1, 1.0F, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionKernel<VT1,MT1,VT2,ST2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );

      cblas_dgemv( CblasColMajor, CblasNoTrans, M, N, -scalar,
                   A.data(), lda, x.data(), 1, 1.0, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseSinglePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<float> alpha( -scalar );
      const complex<float> beta ( 1.0F, 0.0F );

      cblas_cgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }

   template< typename VT1, typename MT1, typename VT2, typename ST2 >
   static inline typename EnableIf< UseDoublePrecisionComplexKernel<VT1,MT1,VT2> >::Type
      selectBlasSubAssignKernel( VT1& y, const MT1& A, const VT2& x, ST2 scalar )
   {
      using boost::numeric_cast;

      const int M  ( numeric_cast<int>( A.rows() )    );
      const int N  ( numeric_cast<int>( A.columns() ) );
      const int lda( numeric_cast<int>( A.spacing() ) );
      const complex<double> alpha( -scalar );
      const complex<double> beta ( 1.0, 0.0 );

      cblas_zgemv( CblasColMajor, CblasNoTrans, M, N, &alpha,
                   A.data(), lda, x.data(), 1, &beta, y.data(), 1 );
   }
   template< typename VT1 >  // Type of the target dense vector
   friend inline void multAssign( DenseVector<VT1,false>& lhs, const DVecScalarMultExpr& rhs )
template< typename T1    // Type of the left-hand side dense matrix
        , typename T2 >  // Type of the right-hand side dense vector
inline const typename DisableIf< IsMatMatMultExpr<T1>, TDMatDVecMultExpr<T1,T2> >::Type
   operator*( const DenseMatrix<T1,true>& mat, const DenseVector<T2,false>& vec )
{
   if( (~mat).columns() != (~vec).size() )
      throw std::invalid_argument( "Matrix and vector sizes do not match" );

   return TDMatDVecMultExpr<T1,T2>( ~mat, ~vec );
}
template< typename MT    // Type of the left-hand side dense matrix
        , typename VT    // Type of the right-hand side dense vector
        , bool AF >      // Alignment flag
struct SubvectorExprTrait< TDMatDVecMultExpr<MT,VT>, AF >
{
 public:
   typedef typename MultExprTrait< typename SubmatrixExprTrait<const MT,AF>::Type, VT >::Type  Type;
};
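// Illustrative sketch of what this trait specialization expresses (function names assumed from
// the public API): a subvector of a matrix/vector product can be formed from a submatrix of the
// matrix operand times the full vector, without evaluating the complete product first.
//
//    blaze::DynamicMatrix<double,blaze::columnMajor> A( 100UL, 100UL );
//    blaze::DynamicVector<double>                    x( 100UL ), y;
//
//    y = subvector( A * x, 10UL, 20UL );
//    // is evaluated as if written:
//    // y = submatrix( A, 10UL, 0UL, 20UL, 100UL ) * x;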